In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the bookings CSV, decoding it as ISO-8859-1
# (presumably the file is not valid UTF-8 — confirm).
csv_path = "/kaggle/input/british-airways-customer-bookings/customer_booking.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")
df.head()

In [None]:
# Print column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Distinct day labels present in `flight_day` (encoded in the next cell).
df['flight_day'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the `flight_day` labels into integer codes.
# The same `label_encoder` object is re-fitted on other columns further down.
label_encoder = LabelEncoder()
encoded_day = label_encoder.fit(df['flight_day']).transform(df['flight_day'])

print(encoded_day)

In [None]:
# Append the integer day codes as a new column.
df = df.assign(encoded_day=encoded_day)
df.head()

In [None]:
# Integer codes in `encoded_day` that are treated as weekend days.
# NOTE(review): presumably Sat/Sun, since the encoder numbers its classes in
# sorted order — confirm against `label_encoder.classes_`.
WEEKEND_CODES = [2, 3]

# Vectorized flag (1 = weekend, 0 = weekday). Unlike a `df['encoded_day'][i]`
# loop, this does not depend on the frame having a 0..n-1 index.
df['is_weekend'] = df['encoded_day'].isin(WEEKEND_CODES).astype(int)
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Total passengers booked on weekend (1) vs. non-weekend (0) flights.
weekend = df.groupby('is_weekend', as_index=False)['num_passengers'].sum()

ax = sns.barplot(x='is_weekend', y='num_passengers', data=weekend)
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
# Mean passengers per booking for each flight day, weekend days highlighted.
dayperday = df.groupby('flight_day')['num_passengers'].mean().reset_index()

# Derive the weekend labels from the `is_weekend` flag instead of hard-coding
# bar positions: the bars follow groupby's sorted key order, so fixed indices
# such as 5 and 6 do not line up with the days flagged as weekend.
weekend_days = set(df.loc[df['is_weekend'] == 1, 'flight_day'])

ax = sns.barplot(data=dayperday, x='flight_day', y='num_passengers')
ax.bar_label(ax.containers[0])

# Bars are drawn in the row order of `dayperday`.
for bar, day in zip(ax.patches, dayperday['flight_day']):
    bar.set_facecolor('#aa3333' if day in weekend_days else '#888888')

plt.ylim(0, 2)
plt.show()

In [None]:
# Share of bookings per flight day (fractions sum to 1).
df['flight_day'].value_counts(normalize=True).reset_index()

In [None]:
# Total weekend passengers per route, busiest first; show the top five.
weekend_rows = df.loc[df['is_weekend'] == 1]
route = (
    weekend_rows
    .groupby('route')['num_passengers']
    .sum()
    .reset_index()
    .sort_values(by='num_passengers', ascending=False)
)
route.head(5)

In [None]:
# Number of bookings per route (the column is dropped in the next cell).
df['route'].value_counts()

In [None]:
# Remove the `route` column from the working frame.
df = df.drop(columns='route')

In [None]:
# Number of bookings per origin country (mapped to continents in the next cell).
df['booking_origin'].value_counts()

In [None]:
import pycountry_convert as pc

def _continent_code(country):
    """Return the continent code for a country name, or 'Others' if unknown.

    pycountry_convert signals an unknown name/code with KeyError; a non-string
    value (e.g. NaN) can surface as TypeError. Both fall back to 'Others';
    any other exception is left to propagate instead of being silently hidden.
    """
    try:
        country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
        return pc.country_alpha2_to_continent_code(country_code)
    except (KeyError, TypeError):
        return 'Others'


# Use the spelling that the country lookup is given for Myanmar.
df['booking_origin'] = df['booking_origin'].replace('Myanmar (Burma)', 'Myanmar')

# Resolve each distinct country once, then broadcast to all rows. This avoids
# a per-row Python loop and does not depend on the frame's index labels.
continent_by_country = {
    country: _continent_code(country)
    for country in df['booking_origin'].unique()
}
continent = df['booking_origin'].map(continent_by_country).fillna('Others')

df['booking_continent'] = continent

In [None]:
# Number of bookings per derived continent code ('Others' = lookup failed).
df['booking_continent'].value_counts()

In [None]:
# The country column has been replaced by `booking_continent`; remove it.
df = df.drop(columns='booking_origin')

<h3>Outliers on Numeric Columns</h3>

In [None]:
# Numeric columns inspected for outliers (reused by later cells).
num = ['num_passengers', 'purchase_lead', 'length_of_stay', 'flight_hour', 'flight_duration']

# One panel per column in a single row, instead of a 4x4 grid that leaves
# most of the figure empty and shrinks every panel.
fig, axes = plt.subplots(1, len(num), figsize=(15, 3), squeeze=False)

for ax, column in zip(axes.ravel(), num):
    sns.boxplot(x=df[column], ax=ax)

# Lay out once, after all panels exist.
fig.tight_layout()

In [None]:
from scipy import stats

print(f'Total rows before delete outlier : {len(df)}')

# Absolute z-score of every numeric column, computed once on the unfiltered
# frame. Filtering inside the per-column loop shrinks `df` while the
# accumulated mask keeps its old length, so the two no longer line up.
zscores = np.abs(stats.zscore(df[num].to_numpy(dtype=float), axis=0))

# Keep a row only if it is within 3 standard deviations in every column.
filtered_entries = (zscores < 3).all(axis=1)

# `.copy()` so later column assignments act on an independent frame rather
# than on a slice of the original.
df = df.loc[filtered_entries].copy()

print(f'Total rows after delete outlier : {len(df)}')

In [None]:
# Density of each numeric column after outlier removal: one panel per column
# in a single row rather than 5 panels scattered over a 4x4 grid.
fig, axes = plt.subplots(1, len(num), figsize=(15, 3), squeeze=False)

for ax, column in zip(axes.ravel(), num):
    sns.kdeplot(x=df[column], ax=ax)

# Lay out once, after all panels exist.
fig.tight_layout()

<h3>Min-Max Normalization</h3>

In [None]:
# Min-max scale the numeric columns: (value - min) / (max - min), per column.
# NOTE(review): min/max are taken over the whole frame, i.e. before the
# train/test split performed further down.
num_min = df[num].min()
num_max = df[num].max()

num_features = df[num].sub(num_min).div(num_max - num_min)
num_features.head()

In [None]:
# Replace the numeric columns with their scaled values.
df[num] = num_features

# Density of each scaled column: one panel per column in a single row rather
# than 5 panels scattered over a 4x4 grid.
fig, axes = plt.subplots(1, len(num), figsize=(12, 3), squeeze=False)

for ax, column in zip(axes.ravel(), num):
    sns.kdeplot(x=df[column], ax=ax)

# Lay out once, after all panels exist.
fig.tight_layout()

<h3>Label Encoding</h3>

In [None]:
# Integer-code the remaining categorical columns, re-fitting the shared
# encoder on each column in turn.
for col in ('sales_channel', 'trip_type'):
    df[col] = label_encoder.fit_transform(df[col])

df.head()

<h3>One-Hot Encoding</h3>

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Column expanded into 0/1 indicator columns.
one_hot = ['booking_continent']

# One integer indicator column per continent code, named booking_continent_<code>,
# joined back onto the frame by index.
continent_col = one_hot[0]
onehots = pd.get_dummies(df[continent_col], prefix=continent_col, dtype='int')
df = df.join(onehots)


In [None]:
# Drop the original categorical columns now that encoded versions exist
# (`encoded_day` and the booking_continent_* indicators).
encoded_sources = ['booking_continent', 'flight_day']
df = df.drop(columns=encoded_sources)
df.head(5)

<h3>Split Data</h3>

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is `booking_complete`.
x = df.drop(columns=['booking_complete'])
y = df['booking_complete']

# Hold out 20% for testing. `stratify=y` keeps the class ratio identical in
# both parts, which matters because the target is imbalanced (the training
# set is oversampled right after this split).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

<h3>Oversampling</h3>

In [None]:
# Class proportions of the target in the training split.
y_train.value_counts(normalize=True)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)
x_over, y_over = sm.fit_resample(X=x_train, y=y_train)

<h3>Baseline Modelling Using XGBClassifier</h3>

In [None]:
import xgboost as xgb

# Baseline: default-parameter XGBoost fitted on the original training split.
# NOTE(review): this uses x_train/y_train, not the oversampled x_over/y_over
# that the tuned model is fitted on — confirm that is intended.
clf = xgb.XGBClassifier().fit(x_train, y_train)

predictions = clf.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

def confusionmatrix(predictions):
    """Print and plot the confusion matrix of `predictions` against `y_test`.

    Reads the notebook-level `y_test` as the ground truth.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels for the test split.

    Returns
    -------
    The object returned by `ConfusionMatrixDisplay.plot()`.
    """
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)
    return ConfusionMatrixDisplay(confusion_matrix=matrix).plot()

def eval_classification(model, x_train=None, y_train=None):
    """Print test-set classification metrics and train/test ROC AUC for `model`.

    Reads the notebook-level `x_test` / `y_test` for the test metrics.

    Parameters
    ----------
    model : fitted estimator
        Must provide `predict` and `predict_proba`.
    x_train, y_train : optional
        Training data used for the train ROC AUC. Each defaults to the
        notebook-level oversampled set (`x_over` / `y_over`) when omitted.

    Returns
    -------
    None. All results are printed.
    """
    if x_train is None:
        x_train = x_over
    if y_train is None:
        y_train = y_over

    y_pred = model.predict(x_test)

    # ROC AUC is a ranking metric: score it on the positive-class probability
    # (second column of predict_proba), not on hard 0/1 predictions.
    proba_test = model.predict_proba(x_test)[:, 1]
    proba_train = model.predict_proba(x_train)[:, 1]

    print("Accuracy (Test Set): %.2f" % accuracy_score(y_test, y_pred))
    print("Precision (Test Set): %.2f" % precision_score(y_test, y_pred))
    print("Recall (Test Set): %.2f" % recall_score(y_test, y_pred))
    print("F1-Score (Test Set): %.2f" % f1_score(y_test, y_pred))

    print("roc_auc (test-proba): %.2f" % roc_auc_score(y_test, proba_test))
    print("roc_auc (train-proba): %.2f" % roc_auc_score(y_train, proba_train))

In [None]:
# Confusion matrix for the baseline predictions, then the printed metric
# summary for the baseline model.
confusionmatrix(predictions)
eval_classification(clf)

<h3>Modelling</h3>

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Candidate values sampled by the randomized search.
hyperparameters = {
    # NOTE(review): depths up to 110 are very deep for boosted trees and can
    # be slow and memory-hungry — consider a much smaller range.
    'max_depth': [int(v) for v in np.linspace(10, 110, num=11)],
    'min_child_weight': [int(v) for v in np.linspace(1, 20, num=11)],
    'gamma': [float(v) for v in np.linspace(0, 1, num=11)],
    'tree_method': ['auto', 'exact', 'approx', 'hist'],

    # Start above 0: XGBoost documents colsample_bytree as (0, 1], and a
    # learning rate (eta) of 0 produces a model that never updates.
    'colsample_bytree': [float(v) for v in np.linspace(0.1, 1, num=10)],
    'eta': [float(v) for v in np.linspace(0.01, 1, num=100)],

    'lambda': [float(v) for v in np.linspace(0, 1, num=11)],
    'alpha': [float(v) for v in np.linspace(0, 1, num=11)],
}

from xgboost import XGBClassifier

# 5-fold randomized search optimizing recall, fitted on the oversampled
# training data.
xg = XGBClassifier(random_state=42)
xg_tuned = RandomizedSearchCV(xg, hyperparameters, cv=5, random_state=42, scoring='recall')
xg_tuned.fit(x_over, y_over)

eval_classification(xg_tuned)

In [None]:
# Test-set predictions of the tuned search, with confusion matrix and
# per-class report.
predictions = xg_tuned.predict(x_test)
confusionmatrix(predictions)

report = classification_report(y_test, predictions)
print(report)

<h3>Feature Importances</h3>

In [None]:
# Importances of the tuned model (the refitted best estimator of the search),
# i.e. the model whose results are reported above, not the untuned baseline.
# 'weight' is the importance type requested from the booster.
booster = xg_tuned.best_estimator_.get_booster()
feature_important = booster.get_score(importance_type='weight')

# One row per feature, sorted ascending so the largest bar is drawn on top.
data = (
    pd.DataFrame({'score': pd.Series(feature_important, dtype=float)})
    .sort_values(by='score', ascending=True)
)

# The frame is already ascending, so the last 40 rows are the 40 largest.
ax = data.tail(40).plot(kind='barh', figsize=(20, 10))
ax.set(title='Feature importance (weight)', xlabel='score', ylabel='feature')