<a href="https://colab.research.google.com/github/Giogeorge213/DataScienceProjects/blob/main/scikit_learn_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# scikit-learn : KNN, Naive Bayes, and Logistic Regression models on airline satisfaction data for hypothetical loyalty program

## from https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction

## Wrangling

### Combine datasets for complete data

In [None]:
df1 = pd.read_csv('test.csv')
df2= pd.read_csv('train.csv')

In [None]:
X = pd.concat([df1,df2], ignore_index= True)

### Handle NANs

In [None]:
nan_counts = X.isna().sum()
print(nan_counts)

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
satisfaction    

### Arrival Delay in Minutes NANs correspond to no values entered. Drop the rows

In [None]:
naResult = X[X['Arrival Delay in Minutes'].isna()]
print(naResult)
X.dropna(inplace = True)

        Unnamed: 0      id  Gender      Customer Type  Age   Type of Travel  \
516            516  107365  Female     Loyal Customer   21  Personal Travel   
656            656  108648    Male     Loyal Customer    9  Personal Travel   
1071          1071   16797    Male  disloyal Customer   25  Business travel   
1224          1224   30090    Male     Loyal Customer    7  Personal Travel   
1589          1589   41924  Female     Loyal Customer   58  Business travel   
...            ...     ...     ...                ...  ...              ...   
128043      102067   36729    Male     Loyal Customer   49  Personal Travel   
128360      102384   71241    Male     Loyal Customer   58  Business travel   
128528      102552   27684  Female  disloyal Customer   29  Business travel   
128936      102960   36787    Male     Loyal Customer   58  Business travel   
129516      103540   45022  Female     Loyal Customer   33  Personal Travel   

           Class  Flight Distance  Inflight wifi se

### One-hot encoding for model training

In [None]:
X.select_dtypes(include='object')

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,satisfaction
0,Female,Loyal Customer,Business travel,Eco,satisfied
1,Female,Loyal Customer,Business travel,Business,satisfied
2,Male,disloyal Customer,Business travel,Eco,neutral or dissatisfied
3,Male,Loyal Customer,Business travel,Business,satisfied
4,Female,Loyal Customer,Business travel,Eco,satisfied
...,...,...,...,...,...
129875,Female,disloyal Customer,Business travel,Eco,neutral or dissatisfied
129876,Male,Loyal Customer,Business travel,Business,satisfied
129877,Male,disloyal Customer,Business travel,Business,neutral or dissatisfied
129878,Female,disloyal Customer,Business travel,Eco,neutral or dissatisfied


In [None]:
X = pd.get_dummies(X, columns=['Gender'], drop_first=True)
X = pd.get_dummies(X, columns=['Customer Type'], drop_first=True)
X = pd.get_dummies(X, columns=['Type of Travel'], drop_first=True)

In [None]:
class_mapping = {'Eco': 0, 'Eco Plus': 1, 'Business': 2}
X['Class'] = X['Class'].map(class_mapping)
satisfaction_mapping = {'neutral or dissatisfied': 0, 'satisfied': 1}
X['satisfaction'] = X['satisfaction'].map(satisfaction_mapping)

In [None]:
X.head()

Unnamed: 0.1,Unnamed: 0,id,Age,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,...,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel
0,0,19556,52,0,160,5,4,3,4,3,...,5,2,5,5,50,44.0,1,0,0,0
1,1,90035,36,2,2863,1,1,3,1,5,...,4,3,4,5,0,0.0,1,0,0,0
2,2,12360,20,0,192,2,0,2,4,2,...,3,2,2,2,0,0.0,0,1,1,0
3,3,77959,44,2,3377,0,0,0,2,3,...,1,3,1,4,0,6.0,1,1,0,0
4,4,36875,49,0,1182,2,3,4,3,4,...,2,4,2,4,0,20.0,1,0,0,0


### Separate and drop satisfaction column the target for model

In [None]:
y = X['satisfaction']
X.drop(columns=['satisfaction'], inplace=True)

### Split data into train and test sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## KNN

### Train model with nn = 5, nn=7, nn = 10 and evaluate

In [None]:
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)

knn7 = KNeighborsClassifier(n_neighbors=7)
knn7.fit(X_train, y_train)

knn10 = KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train, y_train)

y_pred_knn5 = knn5.predict(X_test)
y_pred_knn7 = knn7.predict(X_test)
y_pred_knn10 = knn10.predict(X_test)

In [None]:
report_knn5 = classification_report(y_test, y_pred_knn5)
report_knn7 = classification_report(y_test, y_pred_knn7)
report_knn10 = classification_report(y_test, y_pred_knn10)

In [None]:
print(report_knn5)
print(report_knn7)
print(report_knn10)

              precision    recall  f1-score   support

           0       0.63      0.71      0.67     14597
           1       0.55      0.46      0.50     11301

    accuracy                           0.60     25898
   macro avg       0.59      0.58      0.58     25898
weighted avg       0.59      0.60      0.59     25898

              precision    recall  f1-score   support

           0       0.63      0.74      0.68     14597
           1       0.56      0.44      0.49     11301

    accuracy                           0.61     25898
   macro avg       0.60      0.59      0.59     25898
weighted avg       0.60      0.61      0.60     25898

              precision    recall  f1-score   support

           0       0.62      0.83      0.71     14597
           1       0.61      0.35      0.45     11301

    accuracy                           0.62     25898
   macro avg       0.62      0.59      0.58     25898
weighted avg       0.62      0.62      0.60     25898



## Naive Bayes

### Train and test model

In [None]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

### Analysis of model

In [None]:
report_nb = classification_report(y_test, y_pred_nb)
print(report_nb)

              precision    recall  f1-score   support

           0       0.62      0.54      0.58     14597
           1       0.49      0.58      0.53     11301

    accuracy                           0.56     25898
   macro avg       0.56      0.56      0.55     25898
weighted avg       0.57      0.56      0.56     25898



## Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

              precision    recall  f1-score   support

           0       0.81      0.79      0.80     14597
           1       0.74      0.75      0.75     11301

    accuracy                           0.78     25898
   macro avg       0.77      0.77      0.77     25898
weighted avg       0.78      0.78      0.78     25898



## Cross-Validation on recall

In [None]:
knn5_recalls = cross_val_score(knn5, X, y, cv=5, scoring='recall')
knn5_avg_recall = knn5_recalls.mean()

In [None]:
nb_recalls = cross_val_score(nb, X, y, cv=5, scoring='recall')
nb_avg_recall = nb_recalls.mean()

In [None]:
lr_recalls = cross_val_score(lr, X, y, cv=5, scoring='recall')
lr_avg_recall = lr_recalls.mean()

In [None]:
print("KNN5 Cross-Validation Recall:", round(knn5_avg_recall,3))
print("Naive Bayes Cross-Validation Recall:", round(nb_avg_recall,3))
print("Logistic Regression Cross-Validation Recall:", round(lr_avg_recall,3))

KNN5 Cross-Validation Recall: 0.385
Naive Bayes Cross-Validation Recall: 0.555
Logistic Regression Cross-Validation Recall: 0.739


### Recall decreased for all models when performing 5-fold CV. The Logistic Regression model had the highest recall value, which is important when identifying loyal customers to target for a loyalty program.