In [1]:
import pandas as pd
# Import csv file
df = pd.read_csv('./data/Airline_customer_satisfaction_original.csv')
df.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [2]:
# Drop columns that are not needed
df = df.drop(['Departure/Arrival time convenient', 'Gate location'], axis=1)

# Show nan values
df.isnull().sum()

satisfaction                    0
Customer Type                   0
Age                             0
Type of Travel                  0
Class                           0
Flight Distance                 0
Seat comfort                    0
Food and drink                  0
Inflight wifi service           0
Inflight entertainment          0
Online support                  0
Ease of Online booking          0
On-board service                0
Leg room service                0
Baggage handling                0
Checkin service                 0
Cleanliness                     0
Online boarding                 0
Departure Delay in Minutes      0
Arrival Delay in Minutes      393
dtype: int64

In [3]:
# Drop rows with nan values
df = df.dropna()

In [4]:
# Save data to csv
df.to_csv('Airline_customer_satisfaction.csv', index=False)

# Show nan values
df.isnull().sum()

satisfaction                  0
Customer Type                 0
Age                           0
Type of Travel                0
Class                         0
Flight Distance               0
Seat comfort                  0
Food and drink                0
Inflight wifi service         0
Inflight entertainment        0
Online support                0
Ease of Online booking        0
On-board service              0
Leg room service              0
Baggage handling              0
Checkin service               0
Cleanliness                   0
Online boarding               0
Departure Delay in Minutes    0
Arrival Delay in Minutes      0
dtype: int64

In [5]:
df.shape

(129487, 20)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129487 entries, 0 to 129879
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   satisfaction                129487 non-null  object 
 1   Customer Type               129487 non-null  object 
 2   Age                         129487 non-null  int64  
 3   Type of Travel              129487 non-null  object 
 4   Class                       129487 non-null  object 
 5   Flight Distance             129487 non-null  int64  
 6   Seat comfort                129487 non-null  int64  
 7   Food and drink              129487 non-null  int64  
 8   Inflight wifi service       129487 non-null  int64  
 9   Inflight entertainment      129487 non-null  int64  
 10  Online support              129487 non-null  int64  
 11  Ease of Online booking      129487 non-null  int64  
 12  On-board service            129487 non-null  int64  
 13  Leg room service   

In [4]:
# Train test split
from sklearn.model_selection import train_test_split
X = df.drop('satisfaction', axis=1)
y = df['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### AutoML

xxxx

In [None]:
import autosklearn.classification
from sklearn.metrics import accuracy_score

# Initialize and fit Auto-sklearn
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=300, per_run_time_limit=30)
automl.fit(X_train, y_train)

# Predict and evaluate
y_pred = automl.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Model details
print(automl.show_models())
print(automl.sprint_statistics())

### Decision Tree

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Identify categorical and numerical features
categorical_features = X_train.select_dtypes(include = ["object"]).columns.to_list()
numerical_features = X_train.select_dtypes(include = ["float", "int"]).columns.to_list()

# Create the categorical pipeline
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False))
])

# create numerical pipeline
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Create the transformer
transformer = ColumnTransformer([("cat", categorical_pipeline, categorical_features), ("num", numerical_pipeline, numerical_features)], remainder='passthrough') 

# Create the pipeline
pipeline_steps_dt = [("preprocessor", transformer), ("classifier", DecisionTreeClassifier(random_state=42))]
pipe_dt = Pipeline(pipeline_steps_dt)

In [6]:
# Define the grid for model tuning
param_grid = {
    'classifier__max_depth': [None, 3, 5, 10], 
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_impurity_decrease': [0.0, 0.1, 0.2]
}

# Perform grid search
grid = GridSearchCV(pipe_dt, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
score = grid.best_score_ # best cross-validation score

print("Best accuracy score:", score)

Best accuracy score: 0.9340663364720928


In [12]:
# Check for overfitting
generalization_score = grid.best_estimator_.score(X_test, y_test)

print("Generalization accuracy:", generalization_score)

Generalization accuracy: 0.9360568383658969


In [12]:
# Also check the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = grid.best_estimator_.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[11104,   717],
       [  938, 13139]], dtype=int64)

In [13]:
# Check the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

dissatisfied       0.92      0.94      0.93     11821
   satisfied       0.95      0.93      0.94     14077

    accuracy                           0.94     25898
   macro avg       0.94      0.94      0.94     25898
weighted avg       0.94      0.94      0.94     25898



In [14]:
# Print the feature importances
importances = grid.best_estimator_.named_steps['classifier'].feature_importances_
features = grid.best_estimator_.named_steps['preprocessor'].transformers_[0][1].named_steps['ohe'].get_feature_names_out()
features = list(features) + numerical_features
pd.DataFrame(importances, index=features, columns=['importance']).sort_values(by='importance', ascending=False)


Unnamed: 0,importance
Inflight entertainment,0.41867
Seat comfort,0.187693
Ease of Online booking,0.072012
x0_Loyal Customer,0.034334
Flight Distance,0.030043
Age,0.025309
x2_Business,0.022586
Online support,0.021715
Leg room service,0.021459
Checkin service,0.021288


In [15]:
 # Save the model
import joblib
joblib.dump(grid.best_estimator_, 'decision_tree.pkl')

['decision_tree.pkl']

### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression


# Create the pipeline
pipeline_steps_lr = [("preprocessor", transformer), ("classifier", LogisticRegression(random_state=42))]
pipe_lr = Pipeline(pipeline_steps_lr)

In [8]:
# define the grid for logistic regression model tuning
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__max_iter': [100, 200, 300],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'classifier__penalty': ['l2']  # Note: 'l2' penalty is compatible with these solvers
}

# Perform grid search
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=5, scoring='accuracy')
grid_lr.fit(X_train, y_train)
score_lr = grid_lr.best_score_ # best cross-validation score

In [9]:
# evaluate the model
generalization_score_lr = grid_lr.best_estimator_.score(X_test, y_test)

print("Best accuracy score:", score_lr)
print("Generalization accuracy:", generalization_score_lr)

Best accuracy score: 0.8263715217689857
Generalization accuracy: 0.8255077612170824


### XGBoost

In [28]:
# import librarie for xgboost
import xgboost as xgb

# transform target variable in "0" and "1" for xgboost 
y_train_xgb = y_train.map({'dissatisfied': 0, 'satisfied': 1})
y_test_xgb = y_test.map({'dissatisfied': 0, 'satisfied': 1})

# Create the pipeline
pipeline_steps_xgb = [("preprocessor", transformer), ("classifier", xgb.XGBClassifier(random_state=42))]
pipe_xgb = Pipeline(pipeline_steps_xgb)

In [31]:
# define the grid for xgboost model tuning
param_grid_xgb = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.1, 0.01, 0.001]
}

# Perform grid search
grid_xgb = GridSearchCV(pipe_xgb, param_grid_xgb, cv=5, scoring='accuracy')
grid_xgb.fit(X_train, y_train_xgb)
score_xgb = grid_xgb.best_score_ # best cross-validation score

In [32]:
# evaluate the model
generalization_score_xgb = grid_xgb.best_estimator_.score(X_test, y_test_xgb)

print("Best accuracy score:", score_xgb)
print("Generalization accuracy:", generalization_score_xgb)

Best accuracy score: 0.9499367475221719
Generalization accuracy: 0.9527762761603212


### Support Vector Classifier

In [10]:
from sklearn.svm import SVC

# Create the pipeline
pipeline_steps_svm = [("preprocessor", transformer), ("classifier", SVC(random_state=42))]
pipe_svm = Pipeline(pipeline_steps_svm)

In [None]:
param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    #'classifier__kernel': ['linear', 'rbf'],  # Focus on the most common kernels
    #'classifier__gamma': ['scale']  # Use the most commonly used value
}

# Perform grid search
grid_svm = GridSearchCV(pipe_svm, param_grid_svm, cv=3, scoring='accuracy', n_jobs=-1)
grid_svm.fit(X_train, y_train)
score_svm = grid_svm.best_score_  # best cross-validation score

In [13]:
# evaluate the model

print("Best accuracy score:", score_svm)

Best accuracy score: 0.9439901894821907
