# Projeto Final - Fundamentos de Machine Learning
## Modelos de Classificação

# 0.0. Imports

In [130]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn import metrics as mt
import matplotlib.pyplot as plt

# 0.1. Funções

In [156]:
def get_metrics(y, y_pred):
    scores = pd.DataFrame({
        'accuracy': [mt.accuracy_score(y, y_pred)], 
        'precision' : [mt.precision_score(y, y_pred)],
        'recall': [mt.recall_score(y, y_pred)], 
        'f1': [mt.f1_score(y, y_pred)]
    })
    return scores

def get_score_table(scores_train, scores_val, scores_test):
    score_comp = pd.concat([scores_train, scores_val, scores_test])
    score_comp.index = ['train', 'validation', 'test']
    return score_comp

def search_best_params(x_train, y_train, x_val, y_val, algorithm, parameter_grid):
    # DataFrame for storing metrics for each parameter set
    best_accuracy = 0
    best_params = {}
    # Iterate over each set of parameters in the grid
    for params in parameter_grid:
        # Create and fit the model with the current set of parameters
        model = algorithm(**params)
        model.fit(x_train, y_train.values.ravel())
        # Predict using the validation set
        y_pred_val = model.predict(x_val)
        # Get metrics and convert to DataFrame
        current_metrics = get_metrics(y_val, y_pred_val)
        current_accuracy = current_metrics['accuracy'].iloc[0]
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            best_params = params
    
    return best_accuracy, best_params

def modeling(x_train, y_train, x_val, y_val, x_test, y_test, algorithm, parameters):
    model = algorithm(**parameters).fit(x_train, y_train.values.ravel())
    y_pred_train = model.predict(x_train)
    y_pred_val = model.predict(x_val)
    y_pred_test = model.predict(x_test)

    scores_train = get_metrics(y_train, y_pred_train)
    scores_val = get_metrics(y_val, y_pred_val)
    scores_test = get_metrics(y_test, y_pred_test)
    scores = get_score_table(scores_train, scores_val, scores_test)

    return scores

# 1.0 Loading Data

In [132]:
x_train = pd.read_csv('data/classificacao/X_training.csv')
y_train = pd.read_csv('data/classificacao/y_training.csv')

x_val = pd.read_csv('data/classificacao/X_validation.csv')
y_val = pd.read_csv('data/classificacao/y_validation.csv')

x_test = pd.read_csv('data/classificacao/X_test.csv')
y_test = pd.read_csv('data/classificacao/y_test.csv')

In [133]:
x_train.head()

Unnamed: 0,id,customer_type,age,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,type_of_travel_business_travel,type_of_travel_personal_travel
0,13508,1,0.5,0.0,0.03958,0.6,0.6,0.6,0.6,1.0,...,0.5,1.0,0.6,0.4,0.0,0.013848,1.0,0.0,1.0,0.0
1,28874,1,0.24359,0.0,0.205775,0.6,0.4,0.4,0.4,0.6,...,0.5,0.5,0.2,0.6,0.0,0.0,0.0,1.0,1.0,0.0
2,21484,0,0.435897,1.0,0.026858,0.6,0.6,0.6,0.2,1.0,...,0.0,1.0,0.6,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,48280,1,0.589744,0.5,0.041397,0.6,1.0,0.6,0.6,0.8,...,0.0,1.0,0.4,0.4,0.029499,0.020772,1.0,0.0,0.0,1.0
4,472,0,0.423077,1.0,0.016559,0.2,0.2,0.2,0.8,0.6,...,1.0,0.75,0.8,0.6,0.021632,0.019782,0.0,1.0,1.0,0.0


In [134]:
# excluindo a coluna 'id' por não ser uma variável relevante
x_train.drop('id', axis=1, inplace=True)
x_val.drop('id', axis=1, inplace=True)
x_test.drop('id', axis=1, inplace=True)

# 2.0 Ensaios

## 2.1. KNN (K-Nearest Neighbor)

In [135]:
# Parametros 
knn_param = ParameterGrid({
    'n_neighbors' : np.arange(2,20,1)
})

best_score_knn = search_best_params(x_train, y_train, x_val, y_val, KNeighborsClassifier, knn_param)

In [136]:
best_accuracy = best_score_knn[0]
best_k = list(best_score_knn[1].values())[0]
print('Melhor acurácia é: {} com melhor k = {}'.format(best_accuracy, best_k))

Melhor acurácia é: 0.9265098619646707 com melhor k = 15


In [137]:
best_param_knn = {'n_neighbors' : best_k}
knn_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, KNeighborsClassifier, best_param_knn)
knn_scores['model_name'] = 'KNN'
knn_scores

Unnamed: 0,accuracy,precision,recall,f1,model_name
train,0.934055,0.964572,0.880171,0.920441,KNN
validation,0.92651,0.957389,0.869107,0.911115,KNN
test,0.924999,0.955173,0.869952,0.910573,KNN


## 2.2. Decision Tree

In [138]:
dt_param = ParameterGrid({
    'max_depth' : np.arange(1,50,1)
})
best_score_dt = search_best_params(x_train, y_train, x_val, y_val, DecisionTreeClassifier, dt_param)

In [139]:
best_accuracy_tree = best_score_dt[0]
best_depth_tree = list(best_score_dt[1].values())[0]
print('Melhor acurácia é: {} com melhor max_depth = {}'.format(best_accuracy_tree, best_depth_tree))

Melhor acurácia é: 0.9508027928826539 com melhor max_depth = 14


In [140]:
best_params_dt = {'max_depth' : best_depth_tree}
dt_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, DecisionTreeClassifier, best_params_dt)
dt_scores['model_name'] = 'Decision Tree'
dt_scores

Unnamed: 0,accuracy,precision,recall,f1,model_name
train,0.973716,0.981976,0.956917,0.969285,Decision Tree
validation,0.950545,0.953688,0.931101,0.942259,Decision Tree
test,0.951609,0.955906,0.932776,0.9442,Decision Tree


## 2.3. Random Forest

In [141]:
rf_param = ParameterGrid({
    'n_estimators' : [1,100],
    'max_depth' : np.arange(1,20,2)
})
best_score_rf = search_best_params(x_train, y_train, x_val, y_val, RandomForestClassifier, rf_param)
best_score_rf

(0.9619357122172528, {'max_depth': 19, 'n_estimators': 100})

In [142]:
best_accuracy_rf = best_score_rf[0]
best_depth_rf = list(best_score_rf[1].values())[0]
best_estimators_rf = list(best_score_rf[1].values())[1]
print('Melhor acurácia é: {} com melhor max_depth = {} e melhor estimador = {}'.format(best_accuracy_rf, best_depth_rf, best_estimators_rf))

Melhor acurácia é: 0.9508027928826539 com melhor max_depth = 19 e melhor estimador = 100


In [143]:
best_params_rf = {'max_depth' : best_depth_rf,
                  'n_estimators' : best_estimators_rf}
rf_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, RandomForestClassifier, best_params_rf)
rf_scores['model_name'] = 'Random Forest'
rf_scores

Unnamed: 0,accuracy,precision,recall,f1,model_name
train,0.973702,0.980469,0.958413,0.969316,Random Forest
validation,0.957656,0.960446,0.94105,0.950649,Random Forest
test,0.958599,0.961611,0.943335,0.952385,Random Forest


## 2.4. Logistic Regression

In [144]:
lr_param = ParameterGrid({
    'C' : [0.01, 0.1, 1, 10, 100],
    'solver' : ['lbfgs', 'newton-cg', 'liblinear'],
    'max_iter' : [100, 200, 300]
})
best_score_lr = search_best_k(x_train, y_train, x_val, y_val, LogisticRegression, lr_param)
best_score_lr

(0.8741594002381029, {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'})

In [145]:
best_accuracy_lr = best_score_lr[0]
best_C_lr = list(best_score_lr[1].values())[0]
best_max_iter_lr = list(best_score_lr[1].values())[1]
best_solver_lr = list(best_score_lr[1].values())[2]
print('Melhor acurácia é: {} com C = {}, max_iter = {} e solver = {}'.format(best_accuracy_lr, best_accuracy_lr, best_max_iter_lr, best_solver_lr))

Melhor acurácia é: 0.8741594002381029 com C = 0.8741594002381029, max_iter = 100 e solver = lbfgs


In [146]:
best_params_rf = {'C' : best_C_lr,
                  'max_iter' : best_max_iter_lr,
                  'solver' : best_solver_lr}
lr_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, LogisticRegression, best_params_rf)
lr_scores['model_name'] = 'Logistic Regression'
lr_scores

Unnamed: 0,accuracy,precision,recall,f1,model_name
train,0.875267,0.870714,0.836388,0.853206,Logistic Regression
validation,0.874159,0.869206,0.835326,0.851929,Logistic Regression
test,0.871471,0.868568,0.83326,0.850548,Logistic Regression


# 3.0. Comparando os modelos

In [147]:
scores = pd.concat([knn_scores, dt_scores, rf_scores, lr_scores])

## 3.1. Treinamento

In [155]:
scores.loc['train']

Unnamed: 0,accuracy,precision,recall,f1,model_name
train,0.934055,0.964572,0.880171,0.920441,KNN
train,0.973716,0.981976,0.956917,0.969285,Decision Tree
train,0.973702,0.980469,0.958413,0.969316,Random Forest
train,0.875267,0.870714,0.836388,0.853206,Logistic Regression


## 3.2. Validação

In [150]:
scores.loc['validation']

Unnamed: 0,accuracy,precision,recall,f1,model_name
validation,0.92651,0.957389,0.869107,0.911115,KNN
validation,0.950545,0.953688,0.931101,0.942259,Decision Tree
validation,0.957656,0.960446,0.94105,0.950649,Random Forest
validation,0.874159,0.869206,0.835326,0.851929,Logistic Regression


## 3.3. Teste

In [151]:
scores.loc['test']

Unnamed: 0,accuracy,precision,recall,f1,model_name
test,0.924999,0.955173,0.869952,0.910573,KNN
test,0.951609,0.955906,0.932776,0.9442,Decision Tree
test,0.958599,0.961611,0.943335,0.952385,Random Forest
test,0.871471,0.868568,0.83326,0.850548,Logistic Regression
