### Model Use NEW Data

In [1]:
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import diverging_palette

import sklearn
from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

from sklearn.linear_model import (
    LogisticRegression,
    Lasso
)

from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier
)

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    RepeatedStratifiedKFold,
    GridSearchCV,
    KFold
)

from sklearn.metrics import (
    accuracy_score, 
    precision_score, average_precision_score,
    precision_recall_curve,
    confusion_matrix, plot_confusion_matrix,
    roc_curve, roc_auc_score,
    classification_report
)

import lightgbm as lgb
from lightgbm import LGBMClassifier, plot_importance

from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC
)

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore")
simplefilter("ignore", category=ConvergenceWarning)

pd.set_option('display.max_columns', None) # Show all the columns

In [2]:
url = 'https://raw.githubusercontent.com/McGill-MMA-EnterpriseAnalytics/Airline-Passenger-Satisfaction-Prediction-Part2/Dev/Data/df6.csv'
data = pd.read_csv(url)
data

Unnamed: 0,id,Age,Class,Flight_Distance,Inflight_wifi_service,Departure/Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,satisfaction,Gender_Male,Customer_Type_Loyal Customer,Type_of_Travel_Business travel
0,110028.0,26.0,0.0,1142.0,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,5.0,0.0,1.0,0.0,1.0,1.0
1,119299.0,61.0,0.0,214.0,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,0.0,1.0,1.0,1.0,1.0
2,82113.0,47.0,2.0,1276.0,2.0,4.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,4.0,3.0,5.0,2.0,9.0,0.0,1.0,1.0,0.0
3,96462.0,52.0,0.0,2035.0,4.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,1.0,0.0,1.0,1.0
4,65725.0,20.0,2.0,1061.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0,2.0,2.0,3.0,4.0,4.0,3.0,2.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42897,86549.0,26.0,0.0,712.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,3.0,4.0,4.0,3.0,4.0,5.0,17.0,1.0,1.0,1.0,1.0
42898,102203.0,60.0,0.0,1599.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,9.0,1.0,0.0,1.0,1.0
42899,60666.0,50.0,2.0,1620.0,3.0,1.0,3.0,4.0,2.0,3.0,2.0,2.0,4.0,3.0,4.0,2.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0
42900,94171.0,23.0,2.0,192.0,2.0,1.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,1.0,4.0,2.0,3.0,2.0,3.0,0.0,0.0,0.0,1.0


# Data Preparation

## Standardization of numeric variables

In [3]:
numeric_cols = ['Age', 'Flight_Distance','Departure_Delay_in_Minutes']
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

## Handling Missing values

The satisfaction level of each feature should range from 1 to 5.

If the rating is 0, that means the customer did not rate for this feature.

In [4]:
def list_column_values(df, number_of_values, print_all):
    display(Markdown('**Results:**' ))
    for col in df.columns[0:]:
        if df[col].nunique() <= number_of_values:
            print(f"{col.ljust(25)}" +  ' ==> ' + str(df[col].sort_values().unique().tolist()) )
        else:  
            if print_all=='True':
               
                print(f"{col.ljust(25)}" + ' ==> more than ' + str(number_of_values) + ' values')

list_column_values(data, data.shape[1],'True')

**Results:**

id                        ==> more than 23 values
Age                       ==> more than 23 values
Class                     ==> [0.0, 1.0, 2.0]
Flight_Distance           ==> more than 23 values
Inflight_wifi_service     ==> [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Departure/Arrival_time_convenient ==> [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Ease_of_Online_booking    ==> [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Gate_location             ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Food_and_drink            ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Online_boarding           ==> [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Seat_comfort              ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Inflight_entertainment    ==> [1.0, 2.0, 3.0, 4.0, 5.0]
On-board_service          ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Leg_room_service          ==> [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Baggage_handling          ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Checkin_service           ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Inflight_service          ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Cleanliness               ==> [1.0, 2.0, 3.

In [5]:
incomplete = ['Inflight_wifi_service','Departure/Arrival_time_convenient',
              'Ease_of_Online_booking','Online_boarding','Leg_room_service']
(data[incomplete] == 0).sum()

Inflight_wifi_service                162
Departure/Arrival_time_convenient    276
Ease_of_Online_booking               145
Online_boarding                        9
Leg_room_service                       3
dtype: int64

#### We should consider rating=0 as missing values as the customer did not provide any rating feedback.

In [6]:
(data[incomplete] == 0).sum().sum()/data.shape[0]

0.013868817304554566

#### missing ratings only takes up 1.3% of the dataset, try advanced imputation techniques here

## KNN Imputation

In [7]:
#replace the 0 value as nan
data[incomplete] = data[incomplete].replace(0, np.nan)

In [8]:
data[incomplete].isnull().sum()

Inflight_wifi_service                162
Departure/Arrival_time_convenient    276
Ease_of_Online_booking               145
Online_boarding                        9
Leg_room_service                       3
dtype: int64

In [9]:
null_rows_idx = data.isnull().any(axis=1)

data.loc[null_rows_idx].head(3)

Unnamed: 0,id,Age,Class,Flight_Distance,Inflight_wifi_service,Departure/Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,satisfaction,Gender_Male,Customer_Type_Loyal Customer,Type_of_Travel_Business travel
14,14849.0,-0.056376,0.0,-1.041118,,,,3.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,-0.454514,1.0,1.0,1.0,1.0
66,85554.0,0.545156,0.0,-1.02066,,1.0,1.0,1.0,3.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,-0.454514,1.0,0.0,1.0,1.0
342,48727.0,0.620348,0.0,2.036342,,5.0,,4.0,2.0,5.0,5.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,-0.454514,1.0,0.0,1.0,1.0


In [10]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

data_num = data.select_dtypes(include=[np.number])
imputer.fit(data_num)

X_ = imputer.transform(data_num)

#save in df4_knn
data_knn = pd.DataFrame(X_, columns=data_num.columns, index=data_num.index)

data_knn.loc[null_rows_idx].head(3)

Unnamed: 0,id,Age,Class,Flight_Distance,Inflight_wifi_service,Departure/Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,satisfaction,Gender_Male,Customer_Type_Loyal Customer,Type_of_Travel_Business travel
14,14849.0,-0.056376,0.0,-1.041118,3.6,3.6,2.8,3.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,-0.454514,1.0,1.0,1.0,1.0
66,85554.0,0.545156,0.0,-1.02066,3.4,1.0,1.0,1.0,3.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,-0.454514,1.0,0.0,1.0,1.0
342,48727.0,0.620348,0.0,2.036342,3.0,5.0,3.0,4.0,2.0,5.0,5.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,-0.454514,1.0,0.0,1.0,1.0


#### Since the satisfaction level can only be 1,2,3,4, or 5. we need to conver the value to integer

In [11]:
import math

def ceil(x):
    return math.ceil(x - 0.5)

data_knn[incomplete] = data_knn[incomplete].applymap(ceil)

data_knn.loc[null_rows_idx].head(3)

Unnamed: 0,id,Age,Class,Flight_Distance,Inflight_wifi_service,Departure/Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,satisfaction,Gender_Male,Customer_Type_Loyal Customer,Type_of_Travel_Business travel
14,14849.0,-0.056376,0.0,-1.041118,4,4,3,3.0,2.0,5,3.0,4.0,4.0,4,4.0,4.0,4.0,3.0,-0.454514,1.0,1.0,1.0,1.0
66,85554.0,0.545156,0.0,-1.02066,3,1,1,1.0,3.0,4,5.0,4.0,4.0,4,4.0,4.0,4.0,5.0,-0.454514,1.0,0.0,1.0,1.0
342,48727.0,0.620348,0.0,2.036342,3,5,3,4.0,2.0,5,5.0,4.0,4.0,4,4.0,3.0,4.0,3.0,-0.454514,1.0,0.0,1.0,1.0


In [12]:
list_column_values(data_knn, data_knn.shape[1],'True')

**Results:**

id                        ==> more than 23 values
Age                       ==> more than 23 values
Class                     ==> [0.0, 1.0, 2.0]
Flight_Distance           ==> more than 23 values
Inflight_wifi_service     ==> [1, 2, 3, 4, 5]
Departure/Arrival_time_convenient ==> [1, 2, 3, 4, 5]
Ease_of_Online_booking    ==> [1, 2, 3, 4, 5]
Gate_location             ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Food_and_drink            ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Online_boarding           ==> [1, 2, 3, 4, 5]
Seat_comfort              ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Inflight_entertainment    ==> [1.0, 2.0, 3.0, 4.0, 5.0]
On-board_service          ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Leg_room_service          ==> [1, 2, 3, 4, 5]
Baggage_handling          ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Checkin_service           ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Inflight_service          ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Cleanliness               ==> [1.0, 2.0, 3.0, 4.0, 5.0]
Departure_Delay_in_Minutes ==> more than 23 values
satisfactio

## H2O- Auto ML

In [14]:
import h2o
h2o.init(max_mem_size='8G')
from h2o.automl import H2OAutoML

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_321"; Java(TM) SE Runtime Environment (build 1.8.0_321-b07); Java HotSpot(TM) 64-Bit Server VM (build 25.321-b07, mixed mode)
  Starting server from /opt/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/y8/z84_qvjd3838gv1wm23qyc4h0000gn/T/tmpqbz71tje
  JVM stdout: /var/folders/y8/z84_qvjd3838gv1wm23qyc4h0000gn/T/tmpqbz71tje/h2o_zsl_started_from_python.out
  JVM stderr: /var/folders/y8/z84_qvjd3838gv1wm23qyc4h0000gn/T/tmpqbz71tje/h2o_zsl_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,09 secs
H2O_cluster_timezone:,America/Toronto
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_zsl_ss3tk7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.104 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [18]:
import h2o
from h2o.automl import H2OAutoML

# Convert the target column to categorical
data_knn['satisfaction'] = data_knn['satisfaction'].astype('category')

y = 'satisfaction'
data = h2o.H2OFrame(data_knn)
train, test = data.split_frame(ratios=[0.8])
x = data_knn.drop(columns=['satisfaction']).columns.tolist()

# Initialize the AutoML object
aml = H2OAutoML(max_models=20, max_runtime_secs=12000)

# Train the AutoML model
aml.train(x=x, y=y, training_frame=train)

# Update the hyperparameters
aml.set_params(
    nfolds=5,              # Update the number of cross-validation folds
    max_runtime_secs=6000, # Update the maximum runtime for AutoML
    include_algos=["GBM", "XGBoost","DummyClassifier","LogisticRegression","DecisionTreeClassifier","KNeighborsClassifier","LinearDiscriminantAnalysis","SVC"],# Update the list of algorithms to include
    stopping_metric="AUC", # Update the stopping metric for early stopping
    seed=42                # Update the random seed
)

# Retrieve the best model from AutoML
best_model = aml.leader
lb = aml.leaderboard
preds = aml.predict(test)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
15:22:02.814: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

██
15:22:41.503: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


15:22:44.241: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

███████████████████████████
15:26:25.798: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a 

H2OJobCancelled: Job<$03017f00000132d4ffffffff$_a727b9f2697b3ed9663f3e12271845ad> was cancelled by the user.

In [None]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.metrics import accuracy_score

# Convert the target column to categorical
data_knn['satisfaction'] = data_knn['satisfaction'].astype('category')

y = 'satisfaction'
data = h2o.H2OFrame(data_knn)
train, val, test = data.split_frame(ratios=[0.6, 0.2])
x = data_knn.drop(columns=['satisfaction']).columns.tolist()

# Initialize the AutoML object
aml = H2OAutoML(max_models=20, max_runtime_secs=12000)

# Train the AutoML model
aml.train(x=x, y=y, training_frame=train, validation_frame=val)

# Retrieve the best model from AutoML
best_model = aml.leader
preds = best_model.predict(test)
y_true = test[y].as_data_frame().values.flatten()
y_pred = preds.as_data_frame().values.flatten()

# Get the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
15:47:03.748: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
15:47:03.796: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

██
15:47:43.408: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


15:47:45.895: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model in