<a href="https://colab.research.google.com/github/Giogeorge213/Python_Projects/blob/main/AirlineSatisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm

In [None]:
df1 = pd.read_csv('test.csv')
df2= pd.read_csv('train.csv')

In [None]:
df = pd.concat([df1,df2], ignore_index= True)

### Handle NANs

In [None]:
nan_counts = df.isna().sum()
print(nan_counts)

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
satisfaction    

#### Arrival Delay in Minutes NANs correspond to no values entered. Drop the rows

In [None]:
naResult = df[df['Arrival Delay in Minutes'].isna()]
print(naResult)
df.dropna(inplace = True)

        Unnamed: 0      id  Gender      Customer Type  Age   Type of Travel  \
516            516  107365  Female     Loyal Customer   21  Personal Travel   
656            656  108648    Male     Loyal Customer    9  Personal Travel   
1071          1071   16797    Male  disloyal Customer   25  Business travel   
1224          1224   30090    Male     Loyal Customer    7  Personal Travel   
1589          1589   41924  Female     Loyal Customer   58  Business travel   
...            ...     ...     ...                ...  ...              ...   
128043      102067   36729    Male     Loyal Customer   49  Personal Travel   
128360      102384   71241    Male     Loyal Customer   58  Business travel   
128528      102552   27684  Female  disloyal Customer   29  Business travel   
128936      102960   36787    Male     Loyal Customer   58  Business travel   
129516      103540   45022  Female     Loyal Customer   33  Personal Travel   

           Class  Flight Distance  Inflight wifi se

### Have to convert String entries to usable types for model training

In [None]:
df.select_dtypes(include='object')

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,satisfaction
0,Female,Loyal Customer,Business travel,Eco,satisfied
1,Female,Loyal Customer,Business travel,Business,satisfied
2,Male,disloyal Customer,Business travel,Eco,neutral or dissatisfied
3,Male,Loyal Customer,Business travel,Business,satisfied
4,Female,Loyal Customer,Business travel,Eco,satisfied
...,...,...,...,...,...
129875,Female,disloyal Customer,Business travel,Eco,neutral or dissatisfied
129876,Male,Loyal Customer,Business travel,Business,satisfied
129877,Male,disloyal Customer,Business travel,Business,neutral or dissatisfied
129878,Female,disloyal Customer,Business travel,Eco,neutral or dissatisfied


In [None]:
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)
df = pd.get_dummies(df, columns=['Customer Type'], drop_first=True)
df = pd.get_dummies(df, columns=['Type of Travel'], drop_first=True)

In [None]:
class_mapping = {'Eco': 0, 'Eco Plus': 1, 'Business': 2}
df['Class'] = df['Class'].map(class_mapping)
satisfaction_mapping = {'neutral or dissatisfied': 0, 'satisfied': 1}
df['satisfaction'] = df['satisfaction'].map(satisfaction_mapping)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Age,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,...,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel
0,0,19556,52,0,160,5,4,3,4,3,...,5,2,5,5,50,44.0,1,0,0,0
1,1,90035,36,2,2863,1,1,3,1,5,...,4,3,4,5,0,0.0,1,0,0,0
2,2,12360,20,0,192,2,0,2,4,2,...,3,2,2,2,0,0.0,0,1,1,0
3,3,77959,44,2,3377,0,0,0,2,3,...,1,3,1,4,0,6.0,1,1,0,0
4,4,36875,49,0,1182,2,3,4,3,4,...,2,4,2,4,0,20.0,1,0,0,0


### Create two separate dfs for analysis input set and output. Inputs predict the output which is satisfaction

In [None]:
X = df.drop(columns = ['satisfaction'])
y = df['satisfaction']

### Create training and test sets and model based on Logistic Regression Sklearn

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

In [None]:
accuracy1 = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy1)

Accuracy: 0.7751563827322573


### GLM analysis using statsmodels

In [None]:
X = sm.add_constant(X)

In [None]:
model2 = sm.GLM(y, X, family=sm.families.Gaussian())


In [None]:
result = model2.fit()

In [None]:
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           satisfaction   No. Observations:               129487
Model:                            GLM   Df Residuals:                   129462
Model Family:                Gaussian   Df Model:                           24
Link Function:               identity   Scale:                         0.11023
Method:                          IRLS   Log-Likelihood:                -40951.
Date:                Wed, 26 Jul 2023   Deviance:                       14271.
Time:                        20:19:49   Pearson chi2:                 1.43e+04
No. Iterations:                     3   Pseudo R-squ. (CS):             0.7075
Covariance Type:            nonrobust                                         
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 