#### Customer Satisfaction Prediction

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk

In [None]:
# Lift pandas' display limits on column count and line width (both set to None)
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

In [None]:
# Read the train and test CSV files from the current working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Preview the first five rows of the freshly loaded training data
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,satisfied


In [None]:
# Data Cleaning
# Remove the 'Unnamed: 0' column from both frames
train_df = train_df.drop('Unnamed: 0', axis=1)
test_df = test_df.drop('Unnamed: 0', axis=1)

In [None]:
# Data Preprocessing
# Count the missing values per column in BOTH splits (the second report used to
# repeat train_df, so test_df was never inspected).
# Working assumption of this notebook: a missing 'Arrival Delay in Minutes'
# means there was no delay, i.e. 0 minutes.
print(train_df.isna().sum(axis=0))
print(test_df.isna().sum(axis=0))

id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64
id 

In [None]:
# Drop every row that still contains a missing value.
# NOTE(review): the missing-value report above shows NaNs only in
# 'Arrival Delay in Minutes' for train_df, so after this step the fillna(0)
# in the following cell has nothing left to fill -- confirm whether these rows
# should be dropped or imputed with 0.
# .copy() detaches the result from the frame it was derived from, so later
# column assignments do not emit SettingWithCopyWarning.
train_df = train_df.dropna(axis=0).copy()
test_df = test_df.dropna(axis=0).copy()

In [None]:
# Replace missing arrival delays with 0 minutes.
# DataFrame.fillna with a {column: value} dict returns a new frame, so nothing
# is assigned into a frame derived from another one and pandas does not emit
# SettingWithCopyWarning.
train_df = train_df.fillna({'Arrival Delay in Minutes': 0})
test_df = test_df.fillna({'Arrival Delay in Minutes': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Arrival Delay in Minutes'] = train_df['Arrival Delay in Minutes'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Arrival Delay in Minutes'] = test_df['Arrival Delay in Minutes'].fillna(0)


In [None]:
# Print how many training rows fall into each category of these columns
for categorical_col in ('Customer Type', 'Type of Travel', 'Class'):
    print(train_df[categorical_col].value_counts())


Loyal Customer       84662
disloyal Customer    18932
Name: Customer Type, dtype: int64
Business travel    71465
Personal Travel    32129
Name: Type of Travel, dtype: int64
Business    49533
Eco         46593
Eco Plus     7468
Name: Class, dtype: int64


In [None]:
# Integer codes for each categorical column, keyed by column name.
# The target column 'satisfaction' is encoded here as well.
mapping = {
    # binary columns
    'Gender': {
        'Male': 0,
        'Female': 1,
    },
    'satisfaction': {
        'satisfied': 1,
        'neutral or dissatisfied': 0,
    },
    'Customer Type': {
        'Loyal Customer': 1,
        'disloyal Customer': 0,
    },
    'Type of Travel': {
        'Business travel': 1,
        'Personal Travel': 0,
    },
    # three-level column: Eco -> 0, Eco Plus -> 1, Business -> 2
    'Class': {
        'Business': 2,
        'Eco Plus': 1,
        'Eco': 0,
    },
}

In [None]:
# Encode the categorical columns of both frames with the integer codes in `mapping`.
#
# Series.map turns every value that is missing from the dictionary into NaN
# without raising. That also happens when this cell is re-run on columns that
# are already encoded. So validate all columns first and fail with a clear
# message instead of silently corrupting the data.
for col, col_mapping in mapping.items():
    for frame_name, frame in (('train_df', train_df), ('test_df', test_df)):
        # Existing NaNs are ignored here; they stay NaN after mapping, as before.
        unexpected = set(frame[col].dropna().unique()) - set(col_mapping)
        if unexpected:
            raise ValueError(
                "{}[{!r}] contains values with no mapping: {}".format(
                    frame_name, col, sorted(unexpected, key=str)
                )
            )

# Every value is known, so the mapping can be applied safely.
for col, col_mapping in mapping.items():
    train_df[col] = train_df[col].map(col_mapping)
    test_df[col] = test_df[col].map(col_mapping)

In [None]:
# Preview the first five training rows after the categorical columns were encoded
train_df.head(n=5)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,0,1,13,0,1,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,0
1,5047,0,0,25,1,2,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,0
2,110028,1,1,26,1,2,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,1
3,24026,1,1,25,1,2,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,0
4,119299,0,1,61,1,2,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,1


In [None]:
# Remove the 'id' column from the training frame
train_df = train_df.drop('id', axis=1)

In [None]:
# Remove the 'id' column from the test frame
test_df = test_df.drop('id', axis=1)

In [None]:
# Display the column labels that remain in the training frame
train_df.columns

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [None]:
# Columns that get standardized; the encoded categorical columns and the
# target are not listed here.
features_to_scale = [
    'Age',
    'Flight Distance',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

In [None]:
# Feature scaling
# Every column listed in features_to_scale is standardized.
# The scaler is fitted on the training data only, and the fitted scaler is then
# used to transform both the training and the test data.
scaler = StandardScaler()
scaler.fit(train_df[features_to_scale])
scaled_train = scaler.transform(train_df[features_to_scale])
scaled_test = scaler.transform(test_df[features_to_scale])

In [None]:
# Write the standardized values back over the original columns of each frame
for frame, scaled_values in ((train_df, scaled_train), (test_df, scaled_test)):
    frame[features_to_scale] = scaled_values

In [None]:
# Preview the first five training rows after standardization
train_df.head(n=5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,1,-1.745542,0,1,-0.731305,0.203521,0.616249,0.173716,-1.547312,1.352401,-0.185632,1.182991,1.23153,0.479237,-0.267143,0.311853,0.549773,1.156211,1.305913,0.268966,0.072905,0
1,0,0,-0.951526,1,2,-0.956916,0.203521,-0.695032,0.173716,0.017981,-1.656487,-0.185632,-1.849863,-1.769166,-1.849452,1.253304,-0.534854,-1.821038,0.30558,-1.742432,-0.360682,-0.237184,0
2,1,1,-0.885358,1,2,-0.047454,-0.549571,-0.695032,-0.541118,-0.764666,1.352401,1.296479,1.182991,1.23153,0.479237,-0.267143,0.311853,0.549773,0.30558,1.305913,-0.386917,-0.392229,1
3,1,1,-0.951526,1,2,-0.629028,-0.549571,1.27189,1.603383,1.583273,-0.904265,-0.926688,-1.091649,-1.018992,-1.073222,1.253304,-0.534854,-1.821038,0.30558,-0.980345,-0.098328,-0.159662,0
4,0,1,1.430521,1,2,-0.977973,0.203521,-0.039391,0.173716,0.017981,0.600179,1.296479,1.182991,-0.268818,-0.296993,0.493081,0.311853,-0.240497,-0.545051,-0.218259,-0.386917,-0.392229,1


In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Split train and test into X (features) and y (target).
# The target is selected by name rather than by position, so the split does not
# depend on 'satisfaction' being the last column of each frame.
X_train = train_df.drop(columns='satisfaction')
y_train = train_df['satisfaction']

X_test = test_df.drop(columns='satisfaction')
y_test = test_df['satisfaction']



In [None]:
# Logistic Regression: fit on the training split, then score on the test split
logreg_model = LogisticRegression().fit(X_train, y_train)
logreg_preds = logreg_model.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_preds)
print("Logistic Regression Accuracy:", logreg_accuracy)
print(classification_report(y_test, logreg_preds))


Logistic Regression Accuracy: 0.8714324334762291
              precision    recall  f1-score   support

           0       0.87      0.90      0.89     14528
           1       0.87      0.83      0.85     11365

    accuracy                           0.87     25893
   macro avg       0.87      0.87      0.87     25893
weighted avg       0.87      0.87      0.87     25893



In [None]:
# Random Forest
# A fixed random_state makes the forest -- and therefore the reported metrics --
# reproducible across runs. Without it every run builds a different forest.
# NOTE: the output recorded below came from an unseeded run, so re-running may
# change the numbers slightly.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))


Random Forest Accuracy: 0.963078824392693
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     14528
           1       0.97      0.94      0.96     11365

    accuracy                           0.96     25893
   macro avg       0.96      0.96      0.96     25893
weighted avg       0.96      0.96      0.96     25893



In [None]:

# Gradient Boosting
# A fixed random_state pins the estimator's internal randomness so repeated
# runs report the same metrics.
# NOTE: the output recorded below came from an unseeded run, so re-running may
# change the numbers slightly.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_preds = gb_model.predict(X_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_preds))
print(classification_report(y_test, gb_preds))


Gradient Boosting Accuracy: 0.9419920441818252
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     14528
           1       0.95      0.92      0.93     11365

    accuracy                           0.94     25893
   macro avg       0.94      0.94      0.94     25893
weighted avg       0.94      0.94      0.94     25893



In [None]:

# Support Vector Machine (SVM) with default settings: train, predict, report
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_preds)
print("SVM Accuracy:", svm_accuracy)
svm_report = classification_report(y_test, svm_preds)
print(svm_report)

SVM Accuracy: 0.9528057776232959
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     14528
           1       0.96      0.93      0.95     11365

    accuracy                           0.95     25893
   macro avg       0.95      0.95      0.95     25893
weighted avg       0.95      0.95      0.95     25893



In [None]:
# K-Nearest Neighbors (KNN)
# k is set explicitly to 5 here; change n_neighbors to try other neighborhood sizes.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_preds = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_preds)
print("KNN Accuracy:", knn_accuracy)
print(classification_report(y_test, knn_preds))

KNN Accuracy: 0.9292472869115205
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     14528
           1       0.94      0.89      0.92     11365

    accuracy                           0.93     25893
   macro avg       0.93      0.93      0.93     25893
weighted avg       0.93      0.93      0.93     25893



In [None]:
# Gaussian Naive Bayes: train on the training split and report test-set metrics
nb_model = GaussianNB().fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)
print("Naive Bayes Accuracy:", nb_accuracy)
print(classification_report(y_test, nb_preds))


Naive Bayes Accuracy: 0.8617773143320588
              precision    recall  f1-score   support

           0       0.86      0.89      0.88     14528
           1       0.86      0.82      0.84     11365

    accuracy                           0.86     25893
   macro avg       0.86      0.86      0.86     25893
weighted avg       0.86      0.86      0.86     25893

