# Model Training 
---

## 0. Constants

In [2]:
TRAIN_DATA_PATH = "../data/processed/train.csv"
TEST_DATA_PATH = "../data/processed/test.csv"

LOG_DATA_DETAILS_PKL    =  "../artifacts/data_details.pkl"
LOG_MODEL_PATH = "../models/model.pkl"

---
## 1. Imports

In [5]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_theme(style="darkgrid", palette="bright")

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, roc_auc_score

In [6]:
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)

## 1.1 Functions

In [7]:
def save_pickle_object(obj, file_path):
    """Serialize Python object as pickle object"""
    
    #create directory if not yet existing
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    #save the object in the file
    with open(file_path, "wb") as file:
        pickle.dump(obj, file)

In [9]:
def evaluate_model(true, predicted):
    """Calculates Accuracy & ROC_AUC scores of Classification models"""
    
    accuracy = accuracy_score(true, predicted)
    roc_auc = roc_auc_score(true, predicted)
    
    return accuracy, roc_auc

---
## 2. Data Processing

In [10]:
#Create X_train, y_train
X_train = train.drop("booking_complete", axis=1)
y_train = train["booking_complete"]

#Create X_test, y_test
X_test = test.drop("booking_complete", axis=1)
y_test = test["booking_complete"]

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39424, 13), (9857, 13), (39424,), (9857,))

In [12]:
scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

---
## 3. Baseline Models

In [27]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClasifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False)
}

In [28]:
scores = pd.DataFrame()

for i in range(len(list(models))):
    
    #get the model
    model = list(models.values())[i]
    #train the model
    model.fit(X_train, y_train) 

    #make predictions
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    
    #evaluate Train and Test dataset
    accuracy_train, roc_auc_train = evaluate_model(y_train, pred_train)
    accuracy_test, roc_auc_test = evaluate_model(y_test, pred_test)
    
    #create a series of scores
    model_scores = pd.Series([accuracy_train, accuracy_test, roc_auc_train, roc_auc_test], 
                             name=f"{list(models.keys())[i]}", 
                             index=["Accuracy-train", "Accuracy-test", "ROC_AUC-train", "ROC_AUC-test"])
    
    scores = pd.concat([scores ,model_scores], axis=1)

In [29]:
scores.T

Unnamed: 0,Accuracy-train,Accuracy-test,ROC_AUC-train,ROC_AUC-test
Logistic Regression,0.848925,0.854418,0.5,0.5
Random Forest Classifier,0.999822,0.850259,0.999481,0.536011
XGBClasifier,0.87987,0.848737,0.623052,0.539746
CatBoosting Classifier,0.867619,0.854925,0.576155,0.532672


---
## 4. Model 1: Random Forest Regressor + Top 6 Features

In [31]:
top_6_features=['route','booking_origin','flight_duration','wants_extra_baggage', 'length_of_stay','flight_hour']


In [32]:
rf2 = RandomForestClassifier()
rf2.fit(X_train[top_6_features], y_train) 

In [36]:
#make predictions
pred_train = rf2.predict(X_train[top_6_features])
pred_test = rf2.predict(X_test[top_6_features])

#evaluate Train and Test dataset
accuracy_train, roc_auc_train = evaluate_model(y_train, pred_train)
accuracy_test, roc_auc_test = evaluate_model(y_test, pred_test)

#create a series of scores
rf2_model_scores = pd.Series([accuracy_train, accuracy_test, roc_auc_train, roc_auc_test], 
                         name="RandomForest Classifier 2", 
                         index=["Accuracy-train", "Accuracy-test", "ROC_AUC-train", "ROC_AUC-test"]).to_frame().T

rf2_model_scores

Unnamed: 0,Accuracy-train,Accuracy-test,ROC_AUC-train,ROC_AUC-test
RandomForest Classifier 2,0.965808,0.830983,0.912715,0.565489


## 4.1 Hyperparameter Tuning

In [37]:
rf_params={
    'n_estimators': [50,100,200],
    'max_depth':[2,4,None],
    'max_features':['sqrt',None]
}

In [38]:
#create rf3 model
rf3 = RandomForestClassifier()
#create GridSearchCV
grid_search = GridSearchCV(rf3, param_grid=rf_params, cv=3, n_jobs=-1)
#fit the model
grid_search.fit(X_train, y_train)

In [39]:
#get the best model from the GridSearchCV
gs1_best_model = grid_search.best_estimator_

#get predictions for train & test data
pred_train = gs1_best_model.predict(X_train)
pred_test = gs1_best_model.predict(X_test)

#evaluate Train and Test dataset
accuracy_train, roc_auc_train = evaluate_model(y_train, pred_train)
accuracy_test, roc_auc_test = evaluate_model(y_test, pred_test)

#create a series of scores
rf3_model_scores = pd.Series([accuracy_train, accuracy_test, roc_auc_train, roc_auc_test], 
                         name="RandomForest Classifier 3", 
                         index=["Accuracy-train", "Accuracy-test", "ROC_AUC-train", "ROC_AUC-test"]).to_frame().T

rf3_model_scores

Unnamed: 0,Accuracy-train,Accuracy-test,ROC_AUC-train,ROC_AUC-test
RandomForest Classifier 3,0.999061,0.852795,0.996963,0.543855


In [45]:
#print best estimator
print("Best Model with parameters is:\n", gs1_best_model)

Best Model with parameters is:
 RandomForestClassifier(n_estimators=50)


---
## 5. Model 2: XGBRegressor + Top 6 Features

In [40]:
xgb1 = XGBClassifier()
xgb1.fit(X_train[top_6_features], y_train) 

In [41]:
#make predictions
pred_train = xgb1.predict(X_train[top_6_features])
pred_test = xgb1.predict(X_test[top_6_features])

#evaluate Train and Test dataset
accuracy_train, roc_auc_train = evaluate_model(y_train, pred_train)
accuracy_test, roc_auc_test = evaluate_model(y_test, pred_test)

#create a series of scores
xgb1_model_scores = pd.Series([accuracy_train, accuracy_test, roc_auc_train, roc_auc_test], 
                         name="XGBoost Classifier 1", 
                         index=["Accuracy-train", "Accuracy-test", "ROC_AUC-train", "ROC_AUC-test"]).to_frame().T

xgb1_model_scores

Unnamed: 0,Accuracy-train,Accuracy-test,ROC_AUC-train,ROC_AUC-test
XGBoost Classifier 1,0.86531,0.850462,0.575072,0.528036


## 5.1 Hyperparameter Tuning: Using top 5 features

In [42]:
xgb_params={
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [100, 500, 1000]
}

In [43]:
#create xgb2 model
xgb2 = XGBClassifier()
#create GridSearchCV
grid_search = GridSearchCV(xgb2, param_grid=xgb_params, cv=3, n_jobs=-1)
#fit the model on top 5 features
grid_search.fit(X_train, y_train)

In [44]:
#get the best model from the GridSearchCV
gs2_best_model = grid_search.best_estimator_

#get predictions for train & test data
pred_train = gs2_best_model.predict(X_train)
pred_test = gs2_best_model.predict(X_test)

#evaluate Train and Test dataset
accuracy_train, roc_auc_train = evaluate_model(y_train, pred_train)
accuracy_test, roc_auc_test = evaluate_model(y_test, pred_test)

#create a series of scores
xgb2_model_scores = pd.Series([accuracy_train, accuracy_test, roc_auc_train, roc_auc_test], 
                         name="XGBoost Classifier 2", 
                         index=["Accuracy-train", "Accuracy-test", "ROC_AUC-train", "ROC_AUC-test"]).to_frame().T

xgb2_model_scores

Unnamed: 0,Accuracy-train,Accuracy-test,ROC_AUC-train,ROC_AUC-test
XGBoost Classifier 2,0.856585,0.856143,0.53391,0.516908


In [46]:
#print best estimator
print("Best Model with parameters is:\n", gs2_best_model)

Best Model with parameters is:
 XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


---
## 6. Models Comparison

In [47]:
final_scores = pd.concat([rf2_model_scores, rf3_model_scores, xgb1_model_scores, xgb2_model_scores], axis=0)
final_scores

Unnamed: 0,Accuracy-train,Accuracy-test,ROC_AUC-train,ROC_AUC-test
RandomForest Classifier 2,0.965808,0.830983,0.912715,0.565489
RandomForest Classifier 3,0.999061,0.852795,0.996963,0.543855
XGBoost Classifier 1,0.86531,0.850462,0.575072,0.528036
XGBoost Classifier 2,0.856585,0.856143,0.53391,0.516908


---
## 7. Log Model

In [48]:
# Model
model = {"model_description": "RandomForest Classifier + Hyperparamter tuning",
         "model_details": str(gs1_best_model),
         "model_object": gs1_best_model} 

save_pickle_object(model, LOG_MODEL_PATH)