In [1]:
import pandas as pd
# Import csv file
df = pd.read_csv('./data/Airline_customer_satisfaction_original.csv')
df.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [2]:
# Drop columns that are not needed
df = df.drop(['Departure/Arrival time convenient', 'Gate location'], axis=1)

# Show nan values
df.isnull().sum()

satisfaction                    0
Customer Type                   0
Age                             0
Type of Travel                  0
Class                           0
Flight Distance                 0
Seat comfort                    0
Food and drink                  0
Inflight wifi service           0
Inflight entertainment          0
Online support                  0
Ease of Online booking          0
On-board service                0
Leg room service                0
Baggage handling                0
Checkin service                 0
Cleanliness                     0
Online boarding                 0
Departure Delay in Minutes      0
Arrival Delay in Minutes      393
dtype: int64

In [3]:
# Drop rows with nan values
df = df.dropna()

In [4]:
# Save data to csv
df.to_csv('Airline_customer_satisfaction.csv', index=False)

# Show nan values
df.isnull().sum()

satisfaction                  0
Customer Type                 0
Age                           0
Type of Travel                0
Class                         0
Flight Distance               0
Seat comfort                  0
Food and drink                0
Inflight wifi service         0
Inflight entertainment        0
Online support                0
Ease of Online booking        0
On-board service              0
Leg room service              0
Baggage handling              0
Checkin service               0
Cleanliness                   0
Online boarding               0
Departure Delay in Minutes    0
Arrival Delay in Minutes      0
dtype: int64

In [22]:
df.shape

(129487, 20)

In [23]:
# Train test split
from sklearn.model_selection import train_test_split
X = df.drop('satisfaction', axis=1)
y = df['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include = ["object"]).columns.to_list()
numerical_features = X_train.select_dtypes(include = ["float", "int"]).columns.to_list()

# Create the categorical pipeline
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False))
])

# No scaling for numerical pipeline as it's not necessary for decision trees
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

# Create the transformer
transformer = ColumnTransformer([("cat", categorical_pipeline, categorical_features), ("num", numerical_pipeline, numerical_features)], remainder='passthrough') 

# Create the pipeline
pipeline_steps = [("preprocessor", transformer), ("classifier", DecisionTreeClassifier(random_state=42))]
pipe = Pipeline(pipeline_steps)

In [25]:
from sklearn.model_selection import GridSearchCV

# Define the grid for model tuning
param_grid = {
    'classifier__max_depth': [None, 3, 5, 10], 
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_impurity_decrease': [0.0, 0.1, 0.2]
}

# Perform grid search
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
score = grid.best_score_ # best cross-validation score

print("Best accuracy score:", score)

Best accuracy score: 0.9340952981943499


In [26]:
# Check for overfitting
generalization_score = grid.best_estimator_.score(X_test, y_test)

print("Generalization accuracy:", generalization_score)

Generalization accuracy: 0.9362112904471388


In [27]:
# Also check the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = grid.best_estimator_.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[11106,   715],
       [  937, 13140]])

In [16]:
# Check the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

dissatisfied       0.92      0.94      0.93     11821
   satisfied       0.95      0.93      0.94     14077

    accuracy                           0.94     25898
   macro avg       0.94      0.94      0.94     25898
weighted avg       0.94      0.94      0.94     25898



In [28]:
# Print the feature importances
importances = grid.best_estimator_.named_steps['classifier'].feature_importances_
features = grid.best_estimator_.named_steps['preprocessor'].transformers_[0][1].named_steps['ohe'].get_feature_names_out()
features = list(features) + numerical_features
pd.DataFrame(importances, index=features, columns=['importance']).sort_values(by='importance', ascending=False)


Unnamed: 0,importance
Inflight entertainment,0.418534
Seat comfort,0.187478
Ease of Online booking,0.072075
x0_disloyal Customer,0.0349
Flight Distance,0.030039
Age,0.025055
x2_Business,0.02248
Online support,0.021539
Checkin service,0.02143
Leg room service,0.021214


In [29]:
 # Save the model
import joblib
joblib.dump(grid.best_estimator_, 'decision_tree.pkl')

['decision_tree.pkl']