In [14]:
# Importing Pandas and NumPy
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
# Read the Invistico airline CSV into the master dataframe and preview its first five rows.
main_data = pd.read_csv(filepath_or_buffer="/Invistico_Airline.csv")
main_data.head(n=5)

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [15]:
# One-hot encode the categorical columns. drop_first=True omits the first level of
# each variable, so a column with k levels yields k-1 indicator columns.
dummy_var = pd.get_dummies(
    main_data[["satisfaction", "Gender", "Customer Type", "Type of Travel", "Class"]],
    drop_first=True,
)

# Append the indicators to the master dataframe. Indicator columns left over from a
# previous run of this cell are dropped first, so re-running the cell does not create
# duplicate column labels (on the first run nothing is dropped).
main_data = pd.concat(
    [main_data.drop(columns=dummy_var.columns, errors="ignore"), dummy_var],
    axis=1,
)

In [16]:
# Derive the complementary indicator: Gender_Female = 1 - Gender_Male.
# The explicit integer cast matters: pd.get_dummies returns boolean columns on
# pandas >= 2.0, and mapping True/False through an integer-keyed dict such as
# {1: 0, 0: 1} finds no match there, which would fill the column with NaN.
# Integer arithmetic yields 0/1 whether the indicator is stored as bool or uint8.
main_data['Gender_Female'] = 1 - main_data['Gender_Male'].astype('int64')

### Checking for Missing values

In [17]:
# Count the missing entries in each column (sum down the rows).
main_data.isnull().sum(axis=0)

satisfaction                           0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
satisfaction_satisfied                 0
Gender_Male     

In [18]:
# Share of missing entries per column, as a percentage rounded to two decimals.
round(main_data.isnull().sum() / len(main_data) * 100, 2)

satisfaction                         0.0
Gender                               0.0
Customer Type                        0.0
Age                                  0.0
Type of Travel                       0.0
Class                                0.0
Flight Distance                      0.0
Seat comfort                         0.0
Departure/Arrival time convenient    0.0
Food and drink                       0.0
Gate location                        0.0
Inflight wifi service                0.0
Inflight entertainment               0.0
Online support                       0.0
Ease of Online booking               0.0
On-board service                     0.0
Leg room service                     0.0
Baggage handling                     0.0
Checkin service                      0.0
Cleanliness                          0.0
Online boarding                      0.0
Departure Delay in Minutes           0.0
Arrival Delay in Minutes             0.3
satisfaction_satisfied               0.0
Gender_Male     

In [19]:
# Keep only the rows whose arrival delay is present (not NaN).
main_data = main_data.loc[
    ~np.isnan(main_data['Arrival Delay in Minutes']),
    :,
]

In [20]:
# Re-compute the per-column missing percentage to confirm no NaN values remain.
round(main_data.isnull().sum() / main_data.shape[0] * 100, 2)

satisfaction                         0.0
Gender                               0.0
Customer Type                        0.0
Age                                  0.0
Type of Travel                       0.0
Class                                0.0
Flight Distance                      0.0
Seat comfort                         0.0
Departure/Arrival time convenient    0.0
Food and drink                       0.0
Gate location                        0.0
Inflight wifi service                0.0
Inflight entertainment               0.0
Online support                       0.0
Ease of Online booking               0.0
On-board service                     0.0
Leg room service                     0.0
Baggage handling                     0.0
Checkin service                      0.0
Cleanliness                          0.0
Online boarding                      0.0
Departure Delay in Minutes           0.0
Arrival Delay in Minutes             0.0
satisfaction_satisfied               0.0
Gender_Male     

In [21]:
# Display every column label currently in the master dataframe, so the names used
# for feature selection below can be checked against it.
main_data.columns

Index(['satisfaction', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Seat comfort',
       'Departure/Arrival time convenient', 'Food and drink', 'Gate location',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction_satisfied', 'Gender_Male',
       'Customer Type_disloyal Customer', 'Type of Travel_Personal Travel',
       'Class_Eco', 'Class_Eco Plus', 'Gender_Female'],
      dtype='object')

In [22]:
# Predictor matrix: the six columns used as inputs to the logistic regression.
X = main_data.loc[
    :,
    [
        'Gender_Female',
        'Age',
        'Class_Eco',
        'Flight Distance',
        'Departure/Arrival time convenient',
        'Arrival Delay in Minutes',
    ],
]

In [23]:
# Target vector: the indicator column produced from `satisfaction` by get_dummies.
Y = main_data.loc[:, 'satisfaction_satisfied']

In [24]:
# Fit a logistic regression with default hyper-parameters on the selected features.
# fit() returns the estimator, so the cell still displays its repr.
model = LogisticRegression()
model.fit(X=X, y=Y)

LogisticRegression()

In [25]:
# Show the fitted intercept, then the coefficient array (one value per column of X).
print(model.intercept_, model.coef_, sep="\n")

[-0.06483559]
[[ 9.16890266e-01  1.21994947e-02 -1.22012150e+00 -4.57014516e-05
   1.30780577e-02 -4.77082956e-03]]
