# <div align="center"> AIRLINE PASSENGER SATISFACTION </div>

## Import Library

In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 200)

from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

ModuleNotFoundError: No module named 'xgboost'

## Read Dataset

In [None]:
#read dataset
df = pd.read_csv('train.csv')
df.head()

In [None]:
#display details
df.info()

In [None]:
#drop 'Unnamed: 0' column because is not necesary
df.drop(['Unnamed: 0'],axis=1)

In [None]:
#display the shape of the data
df.shape

In [None]:
#describe the data
df.describe().T

In [None]:
#check for any null values
df.isnull().sum()

In [None]:
((df.shape[0] - df.dropna().shape[0]) / df.shape[0]) * 100

In [None]:
#check missing values
def getmissing(df):
    values = {}
    for feat in df.columns:
        if df[feat].isna().any():
            values[feat] = round(df[feat].mean(),2)
    return values

In [None]:
values = getmissing(df)

In [None]:
values

In [None]:
#fill missing value with median 
df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median(),inplace=True)

In [None]:
#checking duplicated data
df.duplicated().sum()

In [None]:
#drop 'Unnamed: 0' column and id colums because is not necesary
df1 = df.drop(columns = ['Unnamed: 0','id'])

In [None]:
df1.head()

## Data Understanding 

In [None]:
categorical = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
numerical = ['Age', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location',
              'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service',
              'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 
              'Departure Delay in Minutes', 'Arrival Delay in Minutes']

In [None]:
#show statistic of numerical column
df1[numerical].describe()

In [None]:
#show statistical summary for categorical column
df1[categorical].describe()

In [None]:
df['satisfaction'].unique()

In [None]:
# Indicates the exact number of values
# this code is very useful because it has multiple categorical columns
for col in categorical:
    print (f'Value counts of {col} column')
    print(df[col].value_counts(), '\n')

## Univariate Analysis :

### Numeric Column ###

### (1). Show Outliers with Boxplot ###

In [None]:
# Create the box plot with subplots and layout
df1.plot(kind='box', subplots=True, layout=(8, 4), figsize=(25, 60))

# Display the plot
plt.show()

In [None]:
sns.set(rc={
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":15},
             style="darkgrid",
            )

fig, axs = plt.subplots(6, 3, figsize=(20,40))
fig.tight_layout(pad=3.0)

for f,ax in zip(numerical,axs.ravel()):
    sns.set(font_scale = 2)
    ax=sns.boxplot(ax=ax,data=df1,y=df1[f],color='#E3CF57')

In [None]:
# show the profile of outlier 1
outlier1 = df1['Flight Distance'].max()

df1[df1['Flight Distance'] == outlier1]

In [None]:
# Show the profile of outlier 2
outlier2 = df1['Departure Delay in Minutes'].max()

df1[df1['Departure Delay in Minutes'] == outlier2]

In [None]:
# Show the profile of outlier 3
outlier3 = df1['Arrival Delay in Minutes'].max()

df1[df1['Arrival Delay in Minutes'] == outlier3]

In [None]:
# Show the profile of outlier 4
outlier4 = df1['Checkin service'].min()

df1[df1['Checkin service'] == outlier4]

In [None]:
df1['Checkin service'].unique()

In [None]:
df1['Gender'].unique()

In [None]:
#array of non continuos values
non_continues = ['Inflight wifi service', 'Departure/Arrival time convenient',
              'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding','Seat comfort', 'Inflight entertainment','On-board service',
              'Leg room service','Baggage handling', 'Checkin service','Inflight service','Cleanliness']
len(non_continues)

In [None]:
#displaying non continuos values in pie
sns.set(rc={
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":13},
             style="darkgrid")
fig, axes = plt.subplots(7, 2, figsize = (20, 30))
for i, col in enumerate(non_continues):
    column_values = df1[col].value_counts()
    labels = column_values.index
    sizes = column_values.values
    axes[i//2, i%2].pie(sizes,labels = labels, colors = sns.color_palette("pastel"),autopct = '%1.0f%%', startangle = 90)
    axes[i//2, i%2].axis('equal')
    axes[i//2, i%2].set_title(col)
plt.show()

In [None]:
#plotting the barplot for numerical values
sns.set(rc={'figure.figsize':(4, 4),
            "font.size":8,
            "axes.titlesize":8,
            "axes.labelsize":8},
             style="darkgrid")
for i in numerical:
    sns.barplot(data=df1, x='satisfaction', y=i, palette='rocket_r')
    plt.show()

In [None]:
plt.figure(figsize=(15,30))
for i in range(len(non_continues)):
    plt.subplot(7,2,i+1)
    plt.title(non_continues[i])
    sns.countplot(x=df1[non_continues[i]],hue=df1['satisfaction'], palette = 'coolwarm')    
plt.tight_layout()

In [None]:
#using the countplot for the categorical columns
for col in categorical:
    plt.figure(figsize = (4, 3))
    sns.countplot(x = col, data = df1, palette = "cividis")
    plt.show();

In [None]:
sns.set(rc={'figure.figsize':(8, 8),
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":15},
             style="darkgrid",
            )

for col in categorical[:-1]:
    plt.figure(figsize=(8, 6)) 
    sns.countplot(data=df1, x=col, hue ='satisfaction', palette='viridis')
    plt.legend(loc=(1.05, 0.5))

In [None]:
sns.set(rc={
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":13},
             style="darkgrid")
fig, axes = plt.subplots(3, 2, figsize = (10, 10))
for i, col in enumerate(categorical):
    column_values = df1[col].value_counts()
    labels = column_values.index
    sizes = column_values.values
    axes[i//2, i%2].pie(sizes,labels = labels, colors = sns.color_palette("BrBG", 3),autopct = '%1.0f%%', startangle = 90)
    axes[i//2, i%2].axis('equal')
    axes[i//2, i%2].set_title(col)
plt.show()

## EDA Question

1. What is the level of airline passenger satisfaction based on Departure Delay and Arrival Delay?
2. What is the level of airline passenger satisfaction when viewed based on Flight Distance and Departure Delay?
3. What is the demographics of airline passengers in terms of age and type of trips taken by passengers?
4. What is the demographics of airline passengers when viewed based on age and class used by passengers when using the airline?
5. What is the level of airline passenger satisfaction when viewed based on Flight Distance?nce?

### 1. Departure Delay & Arrival Delay:

In [None]:
#level of satisfaction based on arrival and departure delay
df1.groupby('satisfaction')['Departure Delay in Minutes'].mean()

In [None]:
df1.groupby('satisfaction')['Arrival Delay in Minutes'].mean()

In [None]:
sns.set(rc={
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":13},
             style="darkgrid")
plt.figure(figsize=(10, 5), dpi=100)
sns.scatterplot(data=df1, x='Arrival Delay in Minutes', y='Departure Delay in Minutes', hue='satisfaction', palette='cubehelix', alpha=0.8);

### 2.  Flight Distance & Departure Delay:

In [None]:
#satisfaction level based on departure delay and flight distance
sns.set(rc={
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":13},
             style="darkgrid")
plt.figure(figsize=(10,5), dpi=100)
sns.scatterplot(data=df1, x='Flight Distance', y='Departure Delay in Minutes', hue='satisfaction', palette='magma_r', alpha=0.8)
plt.ylim(0,1000);

### 3. Age & Customer Type:

In [None]:
#demographic of passengers based on age and customer type
f, ax = plt.subplots(1, 2, figsize = (10, 5))
sns.boxplot(x = "Customer Type", y = "Age", palette = "PRGn", data = df, ax = ax[0])
sns.histplot(df1, x = "Age", hue = "Customer Type", multiple = "stack", palette = "PRGn", edgecolor = ".3", 
             linewidth = .5, ax = ax[1]);

### 4. Age & Class:

In [None]:
#demographic based on age and class
f, ax = plt.subplots(1, 2, figsize = (15, 5))
sns.boxplot(x = "Class", y = "Age", palette = "gnuplot2_r", data = df1, ax = ax[0])
sns.histplot(df1, x = "Age", hue = "Class", multiple = "stack", palette = "gnuplot2_r", edgecolor = ".3", 
             linewidth = .5, ax = ax[1]);

In [None]:
f, ax = plt.subplots(1, 2, figsize = (15,5))
sns.boxplot(x = "Class", y = "Flight Distance", palette = "gnuplot2_r", data = df1, ax = ax[0])
sns.histplot(df1, x = "Flight Distance", hue = "Class", multiple = "stack", palette = "gnuplot2_r", edgecolor = ".3", 
             linewidth = .5, ax = ax[1]);

### 5. Flight Distance

In [None]:
#level of satisfaction based on just the flight distance
f, ax = plt.subplots(2, 2, figsize = (15, 8))
sns.boxplot(x = "Inflight entertainment", y = "Flight Distance", palette = "gnuplot2", data = df1, ax = ax[0, 0])
sns.histplot(df1, x = "Flight Distance", hue = "Inflight entertainment", multiple = "stack", palette = "gnuplot2", 
             edgecolor = ".3", linewidth = .5, ax = ax[0, 1])
sns.boxplot(x = "Leg room service", y = "Flight Distance", palette = "gnuplot2_r", data = df1, ax = ax[1, 0])
sns.histplot(df1, x = "Flight Distance", hue = "Leg room service", multiple = "stack", palette = "gnuplot2", 
             edgecolor = ".3", linewidth = .5, ax = ax[1, 1]);

## Multivariate Analysis

### Using Pearson's Correlation for Drawing Heatmap:

### Correlation Heatmap for Numeric Column

In [None]:
correlation = df1.corr(numeric_only=True)
plt.figure(figsize = (20, 15))
sns.heatmap(correlation, annot = True, fmt = '.2f', cmap = 'viridis');

In [None]:
plt.scatter(df1['Departure Delay in Minutes'], df['Arrival Delay in Minutes'], alpha = 0.5, color = '#7880b5')
pass;

### Dependent Variable or Feature:

In [None]:
df1["satisfaction"].value_counts()

### Check Dependent Variable is Balanced or Not:

In [None]:
round(df1["satisfaction"].value_counts()[1]/(df1["satisfaction"].value_counts()[0]+df1["satisfaction"].value_counts()[1])*100,2)

In [None]:
target = df1['satisfaction'].value_counts().reset_index()
target.rename(columns={'index':'satisfaction','satisfaction':'freq'}, inplace=True)
target['percentage'] = round((target['freq']/target['freq'].sum())*100,2)
target

### EDA on Feature Satisfaction:

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(8,4))
df1['satisfaction'].value_counts().plot.pie(explode=[0,0.1],colors = sns.color_palette("RdYlBu"),autopct='%1.1f%%',ax=ax1)
ax1.set_title('Percentage of Satisfaction')
sns.countplot(x='satisfaction',data=df,ax=ax2,palette='RdYlBu')
ax2.set_title('Distribution of Satisfaction')
plt.show();

## Data Preprocessing and Feature Engineering:

### Handling Categorical Column (Encoding)

In [None]:
def categoricals_unique(data):
    for column in data:
        if data[column].dtypes == 'object':
            print(f'{column} : {data[column].unique()}')

In [None]:
categoricals_unique(df1)

In [None]:
for i in df1.columns:
    if df1[i].dtype=='object':
            label_encoder=preprocessing.LabelEncoder()
            df1[i]=label_encoder.fit_transform(df1[i])

In [None]:
df1['Class'].unique()

In [None]:
df1.head()

In [None]:
df1.info()

### Split Data: Training-Testing

In [None]:
feature = df1.drop(columns = 'satisfaction')
target = df1[['satisfaction']]

feature_train, feature_test, target_train, target_test = train_test_split(feature, target, test_size = 0.20, random_state = 42)

In [None]:
feature_train.shape

## 1. LGBMClasifier

In [None]:
import lightgbm as lgb

## 2. DecisionTreeClassifier

In [88]:
data = pd.DataFrame({
    'Customer Type': ['First-time', 'Returning', 'First-time', 'Returning'],
    'Age': [25, 30, 22, 35],
    'Type of Travel': ['Business', 'Personal', 'Business', 'Personal'],
    'Flight Distance': [1000, 2000, 1500, 3000],
    'Inflight wifi service': [4, 3, 5, 2],
    'Departure/Arrival time convenient': [5, 4, 3, 2],
    'Ease of Online booking': [3, 5, 4, 2],
    'Gate location': [3, 2, 4, 5],
    'Food and drink': [4, 3, 5, 2],
    'Online boarding': [5, 4, 3, 2],
    'Seat comfort': [4, 2, 5, 3],
    'Inflight entertainment': [5, 3, 4, 2],
    'On-board service': [4, 5, 3, 2],
    'Leg room service': [4, 3, 5, 2],
    'Baggage handling': [5, 4, 3, 2],
    'Checkin service': [4, 5, 3, 2],
    'Cleanliness': [5, 4, 2, 3],
    'Departure Delay': [10, 20, 15, 5],
    'Arrival Delay': [5, 10, 0, 5],
    'Satisfaction': [1, 0, 1, 0]  # 1 for satisfied, 0 for not satisfied
})

In [96]:
# Create the DecisionTreeClassifier model
decision_tree = DecisionTreeClassifier()

# Train the model on the training data
decision_tree.fit(feature_train, target_train)

NameError: name 'feature_train' is not defined

### Model Prediction

In [None]:
target_pred = decision_tree.predict(feature_test)

### Accuracy of the Model:

In [None]:
accuracy = accuracy_score(target_pred, target_test)
print('Decision Tree Model accuracy score: {0:0.4f}'.format(accuracy))

### Compare accuracy between training and test sets

In [None]:
target_pred_train = decision_tree.predict(feature_train)
print('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(target_train, target_pred_train)))

### Check Overfitting

In [None]:
print('Training set score: {:.4f}'.format(decision_tree.score(feature_train, target_train)))
print('Test set score: {:.4f}'.format(decision_tree.score(feature_test, target_test)))

### Confusion Matrix:

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

fig, ax = plt.subplots(figsize=(10,7))

cm = confusion_matrix(target_test, target_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(target_test))  # Include class labels for better interpretation
disp.plot()
plt.show()

### Classification Metrics:

In [None]:
from sklearn.metrics import classification_report
print(classification_report(target_test, target_pred))

## 3.XGB CClassifier

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
xgb = XGBClassifier(n_estimators=100)
xgb.fit(feature_train, target_train)

### MODEL PREDICTION

In [None]:
# predict the results
target_pred1 = xgb.predict(feature_test)

## Show The Accuracy of the model

In [None]:
from sklearn.metrics import accuracy_score
accuracy1 = accuracy_score(target_pred1, target_test)
print('XGBClassifier Model accuracy score: {0:0.4f}'.format(accuracy_score(target_test, target_pred1)))

## Coimpare Between Train dan Test set Accuracy:

In [None]:
target_pred_train1 = xgb.predict(feature_train)
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(target_train, target_pred_train1)))

In [None]:
#import the model

from xgboost import XGBClassifier

#fit the model
xgb = XGBClassifier()
xgb.fit(feature_train, target_train)

# prediction
pred_train = xgb.predict(feature_train)

# model name
model_name = str(type(xgb)).split(".")[-1][0:-2]
print(f"\t\t{model_name.upper()} MODEL\n")

print('Training part:')
print(classification_report(target_train, pred_train,
                                    target_names=['Neutral or Dissatisfied', 'Satisfied']))

print("Accuracy score for training dataset",accuracy_score(target_train, pred_train))

## Check Overfitting

In [None]:
print('Training set score: {:.4f}'.format(xgb.score(feature_train, target_train)))
print('Test set score: {:.4f}'.format(xgb.score(feature_test, target_test)))

##Saving the trained model

In [48]:
!pip install pickle-mixin

Collecting pickle-mixin
  Using cached pickle_mixin-1.0.2-py3-none-any.whl
Installing collected packages: pickle-mixin
Successfully installed pickle-mixin-1.0.2


In [76]:
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Example data preparation
# Replace this with your actual data loading and preprocessing
X = [[1, 2], [2, 3], [3, 4]]
y = [0, 1, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# Save the model
with open('decision_tree_model.pkl', 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

print("Model saved successfully.")


Model saved successfully.


In [78]:
import pickle

# Loading the saved model
try:
    loaded_model = pickle.load(open('C:/Users/imaya/Y3S1/FDM/FDM-Project/decision_tree_model.pkl', 'rb'))
    print("Model loaded successfully.")
except FileNotFoundError:
    print("Model file not found. Please check the file name and path.")
except Exception as e:
    print(f"An error occurred: {e}")


Model loaded successfully.
