In [179]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

### Data Ingestion, Summary and Investigation

In [180]:
df= pd.read_csv('train.csv')

In [181]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

* There are null values in the Arrival Delay in Minutes column

In [182]:
def produce_missing_report(df):
    percent_missing= df.isnull().sum()*100 / len(df)
    missing_values_df= pd.DataFrame(
        {
        'column_name' : df.columns,
        'percent_missing' : percent_missing
        }
    )
    return missing_values_df

In [183]:
produce_missing_report(df)

Unnamed: 0,column_name,percent_missing
Unnamed: 0,Unnamed: 0,0.0
id,id,0.0
Gender,Gender,0.0
Customer Type,Customer Type,0.0
Age,Age,0.0
Type of Travel,Type of Travel,0.0
Class,Class,0.0
Flight Distance,Flight Distance,0.0
Inflight wifi service,Inflight wifi service,0.0
Departure/Arrival time convenient,Departure/Arrival time convenient,0.0


In [184]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
5,5,111157,Female,Loyal Customer,26,Personal Travel,Eco,1180,3,4,...,1,3,4,4,4,4,1,0,0.0,neutral or dissatisfied
6,6,82113,Male,Loyal Customer,47,Personal Travel,Eco,1276,2,4,...,2,3,3,4,3,5,2,9,23.0,neutral or dissatisfied
7,7,96462,Female,Loyal Customer,52,Business travel,Business,2035,4,3,...,5,5,5,5,4,5,4,4,0.0,satisfied
8,8,79485,Female,Loyal Customer,41,Business travel,Business,853,1,2,...,1,1,2,1,4,1,2,0,0.0,neutral or dissatisfied
9,9,65725,Male,disloyal Customer,20,Business travel,Eco,1061,3,3,...,2,2,3,4,4,3,2,0,0.0,neutral or dissatisfied


* Columns Inflight wifi service to Cleanliness have values ranging 1 to 5.
* Satisfaction may be 1 of 2 values.
* The Unnamed and id columns are not meaningful and can be removed.

In [185]:
df['Arrival Delay in Minutes']=df['Arrival Delay in Minutes'].fillna(0.0)

In [186]:
df.isna().any()

Unnamed: 0                           False
id                                   False
Gender                               False
Customer Type                        False
Age                                  False
Type of Travel                       False
Class                                False
Flight Distance                      False
Inflight wifi service                False
Departure/Arrival time convenient    False
Ease of Online booking               False
Gate location                        False
Food and drink                       False
Online boarding                      False
Seat comfort                         False
Inflight entertainment               False
On-board service                     False
Leg room service                     False
Baggage handling                     False
Checkin service                      False
Inflight service                     False
Cleanliness                          False
Departure Delay in Minutes           False
Arrival Del

In [187]:
df['Customer Type'].unique()

array(['Loyal Customer', 'disloyal Customer'], dtype=object)

In [188]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [189]:
df['Type of Travel'].unique()

array(['Personal Travel', 'Business travel'], dtype=object)

In [190]:
df['Class'].unique()

array(['Eco Plus', 'Business', 'Eco'], dtype=object)

In [191]:
df['satisfaction'].unique()

array(['neutral or dissatisfied', 'satisfied'], dtype=object)

In [192]:
df[df['Flight Distance']<=0].shape

(0, 25)

In [193]:
df['Flight Distance'].describe()

count    103904.000000
mean       1189.448375
std         997.147281
min          31.000000
25%         414.000000
50%         843.000000
75%        1743.000000
max        4983.000000
Name: Flight Distance, dtype: float64

In [194]:
df[df['Arrival Delay in Minutes']<0].shape

(0, 25)

In [195]:
df['Arrival Delay in Minutes'].describe()

count    103904.000000
mean         15.133392
std          38.649776
min           0.000000
25%           0.000000
50%           0.000000
75%          13.000000
max        1584.000000
Name: Arrival Delay in Minutes, dtype: float64

In [196]:
df[df['Departure Delay in Minutes']<0].shape

(0, 25)

In [197]:
df['Departure Delay in Minutes'].describe()

count    103904.000000
mean         14.815618
std          38.230901
min           0.000000
25%           0.000000
50%           0.000000
75%          12.000000
max        1592.000000
Name: Departure Delay in Minutes, dtype: float64

In [198]:
df.value_counts("id")!=1

id
1         False
86507     False
86519     False
86518     False
86517     False
          ...  
43254     False
43253     False
43252     False
43250     False
129880    False
Length: 103904, dtype: bool

In [199]:
df1= df.value_counts("id")!=1
df1.any()

False

Therefore there are no duplicates

In [200]:
df[['Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']].max()

Inflight wifi service                5
Departure/Arrival time convenient    5
Ease of Online booking               5
Gate location                        5
Food and drink                       5
Online boarding                      5
Seat comfort                         5
Inflight entertainment               5
On-board service                     5
Leg room service                     5
Baggage handling                     5
Checkin service                      5
Inflight service                     5
Cleanliness                          5
dtype: int64

In [201]:
df[['Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']].min()

Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     1
Checkin service                      0
Inflight service                     0
Cleanliness                          0
dtype: int64

In [202]:
df[['Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']].isin([1,2,3,4,5]).sum()

Inflight wifi service                100801
Departure/Arrival time convenient     98604
Ease of Online booking                99417
Gate location                        103903
Food and drink                       103797
Online boarding                      101476
Seat comfort                         103903
Inflight entertainment               103890
On-board service                     103901
Leg room service                     103432
Baggage handling                     103904
Checkin service                      103903
Inflight service                     103901
Cleanliness                          103892
dtype: int64

* The assumption is made that these 14 columns contain a rating from 1 to 5.
* Therefore a rating of 0 indicates missing data.


### Data Cleaning and Imputation

In [203]:
df= df.drop(['Unnamed: 0', 'id'], axis=1)
df.head(5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [204]:
df['Customer Type']=df['Customer Type'].replace({'Loyal Customer':'0', 'disloyal Customer':'1'})

In [205]:
df['Type of Travel']= df['Type of Travel'].replace({'Personal Travel':'0', 'Business travel':'1'})

In [206]:
df['Gender']= df['Gender'].replace({'Male':'0', 'Female':'1'})

In [207]:
df['Class']=df['Class'].replace({'Eco': '0', 'Eco Plus':'1', 'Business':'2'})

In [208]:
df['satisfaction']=df['satisfaction'].replace({'satisfied':'1','neutral or dissatisfied':'0' })

In [209]:
df.astype({'Type of Travel': 'float64', 'Customer Type': 'float64', 'Class':'float64', 'Gender':'float64', 'satisfaction':'float64', 'Age':'float64', 'Flight Distance': 'float64', 'Inflight wifi service':'float64', 'Departure/Arrival time convenient':'float64', 'Ease of Online booking':'float64', 'Gate location':'float64', 'Food and drink':'float64', 'Online boarding':'float64', 'Seat comfort':'float64','Inflight entertainment':'float64', 'On-board service':'float64', 'Leg room service':'float64', 'Baggage handling':'float64', 'Checkin service':'float64', 'Inflight service':'float64', 'Cleanliness':'float64', 'Departure Delay in Minutes':'float64', 'satisfaction':'float64'}).dtypes

Gender                               float64
Customer Type                        float64
Age                                  float64
Type of Travel                       float64
Class                                float64
Flight Distance                      float64
Inflight wifi service                float64
Departure/Arrival time convenient    float64
Ease of Online booking               float64
Gate location                        float64
Food and drink                       float64
Online boarding                      float64
Seat comfort                         float64
Inflight entertainment               float64
On-board service                     float64
Leg room service                     float64
Baggage handling                     float64
Checkin service                      float64
Inflight service                     float64
Cleanliness                          float64
Departure Delay in Minutes           float64
Arrival Delay in Minutes             float64
satisfacti

### Preprocessing

In [210]:
scalar=StandardScaler()
scalar.fit(df)

In [211]:
std_data=scalar.transform(df)
features=std_data.T
features

array([[-1.01503056, -1.01503056,  0.98519201, ..., -1.01503056,
         0.98519201, -1.01503056],
       [-0.4727667 ,  2.11520819, -0.4727667 , ...,  2.11520819,
         2.11520819, -0.4727667 ],
       [-1.7452793 , -0.95136024, -0.88520032, ..., -0.62056063,
        -1.14984   , -0.8190404 ],
       ...,
       [ 0.26639265, -0.36137482, -0.3875318 , ..., -0.20443295,
        -0.3875318 , -0.3875318 ],
       [ 0.07416916, -0.23631279, -0.39155376, ..., -0.02932482,
        -0.39155376, -0.39155376],
       [-0.87447349, -0.87447349,  1.14354524, ..., -0.87447349,
        -0.87447349, -0.87447349]])

In [212]:
covariance_matrix= np.cov(features)
covariance_matrix

array([[ 1.00000962e+00,  3.17320520e-02, -8.92789621e-03,
         6.90894137e-03, -8.25293808e-03, -5.82761241e-03,
        -8.97436644e-03, -9.14191627e-03, -6.95075732e-03,
        -3.32756767e-04, -5.77224753e-03,  4.22599443e-02,
         2.65025056e-02, -6.11719795e-03, -8.15596180e-03,
        -3.17980253e-02, -3.73911676e-02, -1.05630374e-02,
        -3.94107904e-02, -6.50497064e-03, -2.93470940e-03,
        -3.47355564e-04, -1.22113922e-02],
       [ 3.17320520e-02,  1.00000962e+00, -2.81676650e-01,
         3.08336188e-01, -1.05735612e-01, -2.25202533e-01,
        -7.54298705e-03, -2.06870573e-01, -1.95093101e-02,
         6.07853002e-03, -5.94549988e-02, -1.89263760e-01,
        -1.59485253e-01, -1.09930871e-01, -5.65610181e-02,
        -4.78684719e-02,  2.47235918e-02, -3.21777602e-02,
         2.26700215e-02, -8.36138237e-02,  4.03592697e-03,
         4.81471340e-03, -1.87639977e-01],
       [-8.92789621e-03, -2.81676650e-01,  1.00000962e+00,
         4.85249343e-02,  1.4

In [213]:
eig_vals, eig_vecs= np.linalg.eig(covariance_matrix)
eig_vecs

array([[ 7.11970293e-03, -6.29983331e-04,  4.45227046e-02,
        -7.51699539e-03, -1.98858255e-02, -4.92893027e-02,
         7.63295105e-01,  6.19458704e-01, -1.31248649e-01,
         9.01174202e-03,  1.78268456e-03,  5.48816559e-02,
         1.88279961e-02, -1.32999800e-02,  1.03150893e-02,
        -3.19513276e-03,  1.12596207e-02,  2.50897628e-02,
        -4.42554591e-03,  3.56470638e-03,  7.72451324e-02,
        -3.16990964e-02, -2.44010537e-03],
       [ 9.00921921e-02, -5.76266041e-02, -1.15431575e-01,
        -3.65218050e-02, -3.31148592e-02, -6.66009456e-01,
         7.96294376e-02, -7.87120801e-02,  1.00956691e-01,
        -2.94160153e-01, -4.97110946e-03, -4.37567335e-03,
        -1.80811248e-01,  3.67659676e-01,  1.41853531e-01,
        -3.51926376e-01,  4.84566868e-02, -1.64548973e-01,
        -1.52270423e-02, -1.92145318e-01, -1.64798731e-01,
         8.63161857e-02, -8.31275830e-02],
       [-8.70390539e-02,  1.72077126e-02,  1.21350671e-01,
        -6.86030964e-02, -1.0

In [214]:
_sum= sum (eig_vals)
for _v in eig_vals:
    print((_v/_sum)*100)

19.694895448083905
10.322675748833271
9.574279260374295
8.613542304609831
8.389229536480935
6.512083303214594
4.445539097241722
4.23126133941195
4.0602523922170395
3.5825349640522446
0.17255666565341915
3.011946585914125
0.7574274902465915
2.51673437336242
0.990721764445308
1.13706412841356
1.2725026693159318
1.3741479497032019
1.5946739353160013
1.7387286461598896
1.871978110890139
2.0958971439315364
2.039327142128112
