In [2]:
#Welcome to the Avia Company Customer Satisfaction Prediction notebook
#First, import the required libraries and methods
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
# Directory that holds the CSV files used by this notebook. Change this one
# constant to run the notebook on another machine.
DATA_DIR="D:/aviakompaniya"

# train: rows with the `satisfaction` target; test: rows to predict;
# sample: submission template whose `satisfaction` column is filled in at the end.
train=pd.read_csv(DATA_DIR+"/train_dataset.csv")
test=pd.read_csv(DATA_DIR+"/test_dataset.csv")
sample=pd.read_csv(DATA_DIR+"/sample_submission.csv")
train.head(10)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,Male,disloyal Customer,33,Business travel,Eco,571,2,3,2,...,4,3,1,3,4,3,4,10,3.0,0
1,2,Female,Loyal Customer,49,Business travel,Business,1431,4,1,4,...,5,5,5,5,3,5,3,0,0.0,1
2,3,Female,Loyal Customer,43,Business travel,Eco,867,1,4,4,...,1,1,1,1,1,1,2,0,18.0,0
3,4,Female,Loyal Customer,27,Business travel,Business,1550,3,3,3,...,2,4,4,5,5,4,2,0,0.0,1
4,5,Male,Loyal Customer,11,Personal Travel,Eco,526,3,4,3,...,4,5,2,5,3,5,4,0,10.0,0
5,6,Male,Loyal Customer,30,Personal Travel,Eco Plus,1034,2,4,2,...,5,4,5,5,4,4,5,0,0.0,0
6,7,Male,Loyal Customer,38,Business travel,Eco,826,5,1,1,...,5,1,4,4,5,4,5,2,5.0,1
7,8,Female,Loyal Customer,39,Business travel,Eco Plus,162,3,4,4,...,3,3,5,5,1,3,3,0,0.0,1
8,9,Male,disloyal Customer,43,Business travel,Business,541,2,2,2,...,3,5,2,5,4,4,3,0,0.0,0
9,10,Female,disloyal Customer,24,Business travel,Eco,1065,4,4,4,...,2,4,2,4,2,4,2,10,17.0,0


In [4]:
# Inspect dtypes and non-null counts: several columns are categorical
# (object dtype) and will need encoding; also check for missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 10000 non-null  int64  
 1   Gender                             10000 non-null  object 
 2   Customer Type                      10000 non-null  object 
 3   Age                                10000 non-null  int64  
 4   Type of Travel                     10000 non-null  object 
 5   Class                              10000 non-null  object 
 6   Flight Distance                    10000 non-null  int64  
 7   Inflight wifi service              10000 non-null  int64  
 8   Departure/Arrival time convenient  10000 non-null  int64  
 9   Ease of Online booking             10000 non-null  int64  
 10  Gate location                      10000 non-null  int64  
 11  Food and drink                     10000 non-null  int6

In [5]:
# Count missing values per column. Only 'Arrival Delay in Minutes' has any;
# that column is dropped later, and the preprocessing pipeline additionally
# fills gaps in numeric features with the column median.
train.isna().sum()

id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             28
satisfaction                          0
dtype: int64

In [6]:
# Absolute correlation of every numeric column with the target, strongest first.
# The weakest columns are dropped in the next step.
# Only numeric columns are kept before calling corrwith: on recent pandas
# versions corrwith no longer silently skips object (string) columns and
# raises instead.
train.select_dtypes(include='number').corrwith(train['satisfaction']).abs().sort_values(ascending=False)

satisfaction                         1.000000
Online boarding                      0.504986
Inflight entertainment               0.400900
Seat comfort                         0.342294
On-board service                     0.326426
Leg room service                     0.324368
Cleanliness                          0.310338
Flight Distance                      0.298703
Inflight wifi service                0.277395
Baggage handling                     0.247965
Inflight service                     0.240584
Checkin service                      0.232359
Food and drink                       0.207854
Ease of Online booking               0.164662
Age                                  0.144302
Departure/Arrival time convenient    0.067638
Arrival Delay in Minutes             0.048650
Departure Delay in Minutes           0.042960
Gate location                        0.007169
id                                   0.002931
dtype: float64

In [7]:
# Columns removed before modelling: each has an absolute correlation with
# `satisfaction` below 0.07 in the table above ('id' is just a row identifier).
waste_columns=['Departure/Arrival time convenient','Arrival Delay in Minutes',
'Departure Delay in Minutes', 'Gate location','id']
def waste_deleter(x):
    """Return a copy of DataFrame `x` without the columns in `waste_columns`.

    The input frame is not modified. Raises KeyError if any of the listed
    columns is missing from `x`.
    """
    # Drop by label list directly instead of passing a sub-DataFrame as labels.
    return x.drop(columns=waste_columns)

In [10]:
# Training frame with the low-correlation columns removed.
train_data=waste_deleter(train)

In [11]:
# Separate the target column from the feature matrix.
y=train_data['satisfaction']
X=train_data.drop(columns=['satisfaction'])

In [13]:
# Split the features into numeric and categorical groups so each group can be
# routed to its own preprocessing step.
numeric_columns=[
    'Age','Flight Distance','Inflight wifi service',
    'Ease of Online booking','Food and drink','Online boarding','Seat comfort',
    'Inflight entertainment','On-board service','Leg room service',
    'Baggage handling','Checkin service','Inflight service','Cleanliness',
]
categorical_columns=['Gender','Customer Type','Type of Travel','Class']
numeric_x=X[numeric_columns]
categ_x=X[categorical_columns]

In [14]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each feature.
numeric_steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('s_scaler',StandardScaler()),
]
pipeline=Pipeline(steps=numeric_steps)

In [15]:
# Column names of each feature group (iterating a DataFrame yields its columns).
numerical=list(numeric_x)
categorical=list(categ_x)
# Full preprocessing: one-hot encode the categorical columns and run the
# numeric pipeline on the numeric columns.
# handle_unknown='ignore' makes the fitted encoder tolerate a category that
# only appears in data seen after fitting, instead of raising on transform.
full_pipeline=ColumnTransformer([
    ('categorical',OneHotEncoder(handle_unknown='ignore'),categorical),
    ('numerical_data',pipeline,numerical)
])

In [16]:
# Fit the preprocessing on the training features and transform them.
# NOTE(review): the fit uses all rows before the train/validation split below,
# so held-out rows influence the imputer/scaler statistics — confirm this is acceptable.
prepared_x=full_pipeline.fit_transform(X)

In [17]:
# Hold out 20% of the prepared rows for validation; the fixed seed keeps the
# split repeatable.
x_train,x_test,y_train,y_test=train_test_split(
    prepared_x,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# k-nearest-neighbours model; k=11 matches the best value reported by the
# grid search in this notebook.
knn_model=KNeighborsClassifier(n_neighbors=11).fit(x_train,y_train)
y_predicted=knn_model.predict(x_test)

# Precision first, then accuracy, on the hold-out split (one value per line).
print(precision_score(y_test,y_predicted),accuracy_score(y_test,y_predicted),sep='\n')

0.9459459459459459
0.9135


In [22]:
# Find the most appropriate number of neighbours: try k = 1..19 with
# 5-fold cross-validation; any fit error is raised rather than scored.
param_grids={'n_neighbors': np.arange(1,20)}
knn_grid=GridSearchCV(
    estimator=knn_model,
    param_grid=param_grids,
    cv=5,
    error_score='raise',
)
knn_grid.fit(prepared_x,y)

In [24]:
# Best neighbour count found by the search and its cross-validated score.
knn_grid.best_params_, knn_grid.best_score_

({'n_neighbors': 11}, 0.9099999999999999)

In [26]:
# Cross-validated predictions of the KNN model over the full prepared training
# set (5 folds), summarised per class.
# `cross_val_predict` is already imported in the notebook's first cell, so the
# duplicate import that used to live here has been removed.
predict=cross_val_predict(estimator=knn_model, X=prepared_x, y=y, cv=5)
print(classification_report(y, predict))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      5000
           1       0.94      0.87      0.91      5000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [39]:
# Decision-tree model. The seed is fixed (same value as the train/validation
# split) so the reported scores are reproducible on Restart & Run All.
tree_model=DecisionTreeClassifier(random_state=42)
tree_model.fit(x_train, y_train)

# Per-class report followed by the precision on the hold-out split.
tree_predicted=tree_model.predict(x_test)
print(classification_report(y_test,tree_predicted))
print(precision_score(y_test,tree_predicted))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1002
           1       0.92      0.92      0.92       998

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000

0.9227683049147443


In [40]:
# Random forest classifier model. The seed is fixed (same value as the
# train/validation split) so that the scores, and the final predictions made
# with this model, are reproducible on Restart & Run All.
rf_model=RandomForestClassifier(random_state=42)
rf_model.fit(x_train,y_train)

# Per-class report on the hold-out split.
rf_predict=rf_model.predict(x_test)
print(classification_report(y_test,rf_predict))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1002
           1       0.96      0.94      0.95       998

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [31]:
# Precision of the random forest classifier on the hold-out split.
# Note: the author selected this model to predict the test data.
print(precision_score(y_test,rf_predict))

0.9609053497942387

In [41]:
# XGBoost classifier with default settings, trained on the training split.
xgb_model=XGBClassifier().fit(x_train,y_train)
xgb_predict=xgb_model.predict(x_test)

# Per-class report, then the accuracy on its own line.
print(classification_report(y_test,xgb_predict),accuracy_score(y_test,xgb_predict),sep='\n')

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1002
           1       0.96      0.94      0.95       998

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

0.9505


In [43]:
# Prepare the test data: drop the same low-correlation columns as for training.
test_data=waste_deleter(test)

# Use transform, not fit_transform: the encoder, imputer and scaler must keep
# the statistics learned from the training features. Re-fitting them on the
# test set would feed rf_model features on a different scale/encoding than
# the one it was trained on.
prepared_test_data=full_pipeline.transform(test_data)

# Predict with the random forest model.
test_predicted=rf_model.predict(prepared_test_data)

# Fill the submission template and save it. index=False keeps the pandas row
# index out of the file, so the CSV has the same columns as the template.
sample['satisfaction']=test_predicted
sample.to_csv('AviaCompanySatisfaction.csv', index=False)