In [87]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [88]:
data = pd.read_csv('data/AirPass.csv')
data=data.drop ('Unnamed: 0', axis=1)
data.isnull().sum().sum()

310

In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [90]:
data['Arrival Delay in Minutes'].isnull().sum()

310

In [91]:
data['Arrival Delay in Minutes'] = data['Arrival Delay in Minutes'].fillna(data['Arrival Delay in Minutes'].median())

In [92]:
round(data['Arrival Delay in Minutes'].mean(), 2)

15.13

In [93]:
data['satisfaction'] = data['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
data['Customer Type'] = data['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
data['Type of Travel'] = data['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

In [94]:
data.groupby('Gender')['satisfaction'].value_counts()*100/data.shape[0]

Gender  satisfaction
0       0               27.608177
        1               21.645942
1       0               29.058554
        1               21.687327
Name: satisfaction, dtype: float64

In [95]:
data.groupby('Type of Travel')['satisfaction'].value_counts()*100/data.shape[0]

Type of Travel  satisfaction
0               0               27.881506
                1                3.155798
1               1               40.177472
                0               28.785225
Name: satisfaction, dtype: float64

In [96]:
data.groupby('Class')['satisfaction'].value_counts()*100/data.shape[0]

Class     satisfaction
Business  1               33.184478
          0               14.614452
Eco       0               36.614567
          1                8.374076
Eco Plus  0                5.437712
          1                1.774715
Name: satisfaction, dtype: float64

In [97]:
data = pd.get_dummies(data)

In [98]:
X = data.drop('satisfaction', axis=1)
y = data['satisfaction']

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=26)

In [102]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [106]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [110]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaler, y_train)
y_train_pred1=log_reg.predict(X_train_scaler)
y_test_pred1=log_reg.predict(X_test_scaler)

print (round(f1_score(y_test, y_test_pred1), 3))

0.855


In [118]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [117]:
adaboost = AdaBoostClassifier(DecisionTreeClassifier(random_state=26),random_state=26,  learning_rate=0.01)
adaboost.fit(X_train_scaler, y_train)
y_train_pred2=adaboost.predict(X_train_scaler)
y_test_pred2=adaboost.predict(X_test_scaler)

print (round(f1_score(y_test, y_test_pred2), 3))

0.94


In [124]:
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}

In [126]:
from sklearn.metrics import make_scorer

In [127]:
grid_search=GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=params, scoring=make_scorer(f1_score), cv=3)
grid_search.fit(X_train_scaler, y_train)

In [128]:
print (grid_search.best_params_)

{'learning_rate': 1.0, 'n_estimators': 128}


In [130]:
print (round(grid_search.best_score_, 3))

0.949


In [131]:
from xgboost import XGBClassifier

In [132]:
xg = XGBClassifier(random_state=26)
xg.fit(X_train_scaler, y_train)
y_train_pred3=xg.predict(X_train_scaler)
y_test_pred3=xg.predict(X_test_scaler)

print(round(f1_score(y_test, y_test_pred3), 3))

0.958


In [133]:
from catboost import CatBoostClassifier


In [137]:
catb = CatBoostClassifier(random_state=26)
catb.fit(X_train_scaler, y_train)
y_train_pred4=catb.predict(X_train_scaler)
y_test_pred4=catb.predict(X_test_scaler)

print(round(f1_score(y_test, y_test_pred4), 3))

Learning rate set to 0.068023
0:	learn: 0.6018089	total: 38.5ms	remaining: 38.5s
1:	learn: 0.5020769	total: 64ms	remaining: 31.9s
2:	learn: 0.4472481	total: 91.5ms	remaining: 30.4s
3:	learn: 0.4028675	total: 119ms	remaining: 29.5s
4:	learn: 0.3674724	total: 145ms	remaining: 28.9s
5:	learn: 0.3397844	total: 175ms	remaining: 29s
6:	learn: 0.3121211	total: 201ms	remaining: 28.6s
7:	learn: 0.2917499	total: 228ms	remaining: 28.2s
8:	learn: 0.2749039	total: 257ms	remaining: 28.3s
9:	learn: 0.2575191	total: 283ms	remaining: 28s
10:	learn: 0.2473690	total: 308ms	remaining: 27.7s
11:	learn: 0.2377531	total: 332ms	remaining: 27.3s
12:	learn: 0.2279309	total: 357ms	remaining: 27.1s
13:	learn: 0.2212512	total: 381ms	remaining: 26.9s
14:	learn: 0.2100359	total: 404ms	remaining: 26.6s
15:	learn: 0.2025733	total: 431ms	remaining: 26.5s
16:	learn: 0.1942303	total: 456ms	remaining: 26.4s
17:	learn: 0.1877939	total: 483ms	remaining: 26.3s
18:	learn: 0.1832381	total: 506ms	remaining: 26.2s
19:	learn: 0.1

In [138]:
from catboost import Pool, CatBoostClassifier
from catboost.utils import get_confusion_matrix

In [147]:
cm = get_confusion_matrix(catb, Pool(X_train_scaler, y_train))
cm = cm/y_train.shape * 100
print (np.around(cm))

[[56.  1.]
 [ 2. 42.]]


In [149]:
pd.DataFrame(
    {
        "feature_importance": catb.get_feature_importance(),
        "feature_names": data.drop(columns="satisfaction").columns,
    }
).sort_values(by=["feature_importance"], ascending=False)

Unnamed: 0,feature_importance,feature_names
6,25.364737,Inflight wifi service
4,18.391876,Type of Travel
11,7.401483,Online boarding
2,7.240564,Customer Type
22,5.420057,Class_Business
17,3.925791,Checkin service
3,3.74235,Age
16,3.640798,Baggage handling
9,3.196286,Gate location
12,3.012455,Seat comfort
