# **Finding the best Model fo this Problem**

Importing necessary libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [3]:
import warnings
warnings.filterwarnings("ignore")

Calling the data from the path

In [4]:
data = pd.read_csv(r"/root/MLOPS_PROJEST_WSL/Airline-Passenger-Satisfaction/project/notebooks/processed_data.csv")
data.head(5)

Unnamed: 0.1,Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Satisfaction_Label,flight_service_1,flight_service_2,flight_service_3
0,0,1,48,0,0,0,1,2,2,3,0,3.33,4.0,4.5
1,1,0,35,1,0,0,1,3,3,2,1,3.33,4.4,4.25
2,2,1,41,1,0,0,2,0,0,4,1,4.33,4.2,3.25
3,3,1,50,1,0,0,3,0,0,2,1,3.0,4.6,4.25
4,4,0,49,1,0,0,3,0,2,3,1,3.67,4.0,3.0


Dividing the data into target and features in seperate variables

In [5]:
X = data.drop(columns=["Satisfaction_Label"], axis=1)
y = data["Satisfaction_Label"]

Spliting data 

In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
X_train.shape, X_test.shape  # Shows the shapes of training and testing sets


((103904, 13), (25976, 13))

Creating an function to check evaluation of model

In [7]:
def evaluate_model(true, pred):
    # Calculate accuracy using true labels and predicted labels
    accuracy = accuracy_score(true, pred)
    
    # Generate confusion matrix using true labels and predicted labels
    confusion_matrix_result = confusion_matrix(true, pred)
    
    # Calculate ROC AUC score using true labels and predicted labels
    roc_auc = roc_auc_score(true, pred)
    
    # Return the calculated metrics: accuracy, confusion matrix, and ROC AUC score
    return accuracy, confusion_matrix_result, roc_auc


In [8]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Assuming X_train, X_test, y_train, y_test are already defined

model = {
    "LogisticRegression": LogisticRegression(),
    "RidgeClassifier": RidgeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "MultinomialNB": MultinomialNB(),
    # "SVC": SVC()
}

model_list = []
accuracy_list = []

for model_name, model_instant in model.items():
    # Train each model
    model_instant.fit(X_train, y_train)

    # Predict on training and testing sets
    y_train_pred = model_instant.predict(X_train)
    y_test_pred = model_instant.predict(X_test)

    # Evaluate model performance on training set
    model_train_accuracy, model_train_confusion_matrix, model_train_roc_auc = evaluate_model(y_train, y_train_pred)
    
    # Evaluate model performance on testing set
    model_test_accuracy, model_test_confusion_matrix, model_test_roc_auc = evaluate_model(y_test, y_test_pred)

    model_list.append(model_name)

    # Display model performance for training set
    print(f"-----------{model_name}-----------")
    print("Model performance on training set")
    print(f"accuracy_score : {(model_train_accuracy).round(2)}")
    print(f"confusion_matrix :\n {(model_train_confusion_matrix).round(2)}")
    print(f"roc_auc : {(model_train_roc_auc).round(2)}")

    # Display model performance for testing set
    print("Model performance on testing set")
    print(f"accuracy_score : {(model_test_accuracy).round(2)}")
    print(f"confusion_matrix :\n {(model_test_confusion_matrix).round(2)}")
    print(f"roc_auc : {(model_test_roc_auc).round(2)}")
    print("\n")

    accuracy_list.append(model_test_accuracy)


-----------LogisticRegression-----------
Model performance on training set
accuracy_score : 0.57
confusion_matrix :
 [[58729     0]
 [45175     0]]
roc_auc : 0.5
Model performance on testing set
accuracy_score : 0.57
confusion_matrix :
 [[14723     0]
 [11253     0]]
roc_auc : 0.5


-----------RidgeClassifier-----------
Model performance on training set
accuracy_score : 0.86
confusion_matrix :
 [[51804  6925]
 [ 7592 37583]]
roc_auc : 0.86
Model performance on testing set
accuracy_score : 0.86
confusion_matrix :
 [[12984  1739]
 [ 1846  9407]]
roc_auc : 0.86


-----------RandomForestClassifier-----------
Model performance on training set
accuracy_score : 1.0
confusion_matrix :
 [[58729     0]
 [    1 45174]]
roc_auc : 1.0
Model performance on testing set
accuracy_score : 0.92
confusion_matrix :
 [[14004   719]
 [ 1399  9854]]
roc_auc : 0.91


-----------AdaBoostClassifier-----------
Model performance on training set
accuracy_score : 0.88
confusion_matrix :
 [[52651  6078]
 [ 6292 38883

In [9]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=["Model_Name", "Accuracy_Score"]).sort_values(by=["Accuracy_Score"], ascending=False)

Unnamed: 0,Model_Name,Accuracy_Score
2,RandomForestClassifier,0.918463
3,AdaBoostClassifier,0.879196
1,RidgeClassifier,0.861988
4,KNeighborsClassifier,0.676201
5,MultinomialNB,0.657607
0,LogisticRegression,0.566792


In [10]:
model = RandomForestClassifier()  # Initializing RandomForestClassifier
model.fit(X_train, y_train)  # Fitting the model to the training data
y_pred = model.predict(X_test)  # Predicting on the test set
score = accuracy_score(y_test, y_pred) * 100  # Calculating accuracy
print("Accuracy : {:.1f}".format(score))  # Printing the accuracy score

Accuracy : 91.8


**The RandomForestClassifier demonstrates high accuracy on both training and testing sets, with minimal misclassifications (as seen in the confusion matrix) and a robust ROC AUC score, indicating strong discriminatory power between classes.**

Hyper parametric tuning

In [18]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]

# to measure the quality of a split
criterion = ['gini', 'entropy', 'log_loss']

# The maximum depth of the tree
# max_depth = [int(x) for x in np.linspace(1, 10, 1)]

# The minimum number of samples required to split an internal node
min_samples_split = [2,4,6]

# The minimum number of samples required to be at a leaf node.
min_samples_leaf = [2,4,6]

# The number of features to consider when looking for the best split
max_features = ['sqrt', 'log2', None]

# create an random grid
random_grid = {
    "n_estimators" : n_estimators,
    "criterion" : criterion,
    # "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "max_features" : max_features,
}
print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'criterion': ['gini', 'entropy', 'log_loss'], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [2, 4, 6], 'max_features': ['sqrt', 'log2', None]}


In [19]:
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=2, verbose=2, random_state=100, n_jobs=1)
rf_randomcv.fit(X_train,y_train)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV] END criterion=entropy, max_features=None, min_samples_leaf=6, min_samples_split=6, n_estimators=900; total time= 2.6min
[CV] END criterion=entropy, max_features=None, min_samples_leaf=6, min_samples_split=6, n_estimators=900; total time= 2.7min
[CV] END criterion=gini, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=  33.0s
[CV] END criterion=gini, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=  31.5s
[CV] END criterion=gini, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time=  55.3s
[CV] END criterion=gini, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=1000; total time=  55.2s
[CV] END criterion=gini, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=500; total time=  27.0s
[CV] END criterion=gini, max_features=log2, min_samples_leaf=2, min_samples

KeyboardInterrupt: 

In [21]:
model = RandomForestClassifier(criterion='gini', max_features='log2', min_samples_leaf=4, min_samples_split=2, n_estimators=600).fit(X_train, y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test, pred))


0.9170388050508161


Taking an long time for hyper parametric tuning, however the accuracy will increase only in short level , so I take an random one of above