## 0. Configs

In [1]:
import sys
sys.path.append("../../util/")

from calculate_scores import calculate_scores

In [2]:
import warnings
warnings.filterwarnings('ignore')

## 1. Import libraries and data

In [3]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats


df = pd.read_csv("../../data/cleaned/train_V2_cleaned.csv")

There are 779/2592 guests that have damaged the room

In [4]:
df["outcome_damage_inc"].value_counts()

outcome_damage_inc
0    2536
1     763
Name: count, dtype: int64

## 2. Preparation for model training

### **X, y split**

Split the columns which I will need to train a model.

In [5]:
X = df.drop(['outcome_damage_inc', "outcome_damage_amount", "outcome_profit"], axis=1)
y = df['outcome_damage_inc']

In [6]:
# Sample the classes with `SMOTE`

smote = SMOTE(random_state=1234)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.2, random_state=1234)

In [8]:
num_feat = ['income_am', 'profit_last_am', 'profit_am', 'damage_am', 'damage_inc',
       'crd_lim_rec', 'credit_use_ic', 'gluten_ic', 'lactose_ic',
       'insurance_ic', 'spa_ic', 'empl_ic', 'cab_requests', 'married_cd',
       'bar_no', 'sport_ic', 'neighbor_income', 'age', 'marketing_permit',
       'urban_ic', 'dining_ic', 'presidential', 'client_segment', 'sect_empl',
       'prev_stay', 'prev_all_in_stay', 'divorce', 'fam_adult_size',
       'children_no', 'tenure_mts', 'tenure_yrs', 'company_ic', 'claims_no',
       'claims_am', 'nights_booked', 'shop_am', 'shop_use', 'retired',
       'gold_status', 'gender_M', 'gender_V']

## 3. Train all 3 models

In [9]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### **1. DecisionTreeClassifier**

1. Get the best hyperparameters with RandomizedSearchCV

In [10]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None] + list(stats.randint(5, 30).rvs(2)),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'min_samples_leaf': stats.randint(1, 5),
    'min_samples_split': stats.randint(2, 10),
}

dt_random_search = RandomizedSearchCV(dt, param_distributions=dt_param_grid, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
dt_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [11]:
dt_best_params = dt_random_search.best_params_
dt_best_params

{'criterion': 'entropy',
 'max_depth': 13,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 3}

2. Refined **GridSearchCV** with hyperparameters from RandomizedSearchCV

In [12]:
dt_refined_param_grid = {
    'criterion': [dt_best_params['criterion']],
    'max_depth': [dt_best_params['max_depth'] - 2, dt_best_params['max_depth'], dt_best_params['max_depth'] + 2] if dt_best_params['max_depth'] is not None else [None],
    'max_features': [dt_best_params['max_features']],
    'min_samples_leaf': [dt_best_params['min_samples_leaf'] - 1, dt_best_params['min_samples_leaf'], dt_best_params['min_samples_leaf'] + 1],
    'min_samples_split': [dt_best_params['min_samples_split'] - 1, dt_best_params['min_samples_split'], dt_best_params['min_samples_split'] + 1],
}

dt_grid_search_refined = GridSearchCV(dt, dt_refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
dt_grid_search_refined.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


3. Get the best model and make predictions on the test set (from X,y split)

In [13]:
dt_best = dt_grid_search_refined.best_estimator_
dt_pred = dt_best.predict(X_test)

### **2. KNeigborsClassifier**

1. Get the best hyperparameters with RandomizedSearchCV

In [14]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn_param_grid = {
    'n_neighbors': stats.randint(1, 20),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]  # L1 and L2 distances
}

knn_random_search = RandomizedSearchCV(knn, param_distributions=knn_param_grid, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
knn_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [15]:
knn_best_params = knn_random_search.best_params_
knn_best_params

{'algorithm': 'brute', 'n_neighbors': 1, 'p': 1, 'weights': 'distance'}

2. Refined **GridSearchCV** with hyperparameters from RandomizedSearchCV

In [16]:
knn_refined_param_grid = {
    'n_neighbors': [knn_best_params['n_neighbors'] - 1, knn_best_params['n_neighbors'], knn_best_params['n_neighbors'] + 1],
    'weights': [knn_best_params['weights']],
    'algorithm': [knn_best_params['algorithm']],
    'p': [knn_best_params['p']],
}

knn_grid_search_refined = GridSearchCV(knn, knn_refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
knn_grid_search_refined.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


3. Get the best model and make predictions on the test set (from X,y split)

In [17]:
knn_best = knn_grid_search_refined.best_estimator_
knn_pred = knn_best.predict(X_test)

### **3. RandomForestClassifier**

1. Get the best hyperparameters with RandomizedSearchCV

In [18]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf_param_grid = {
    'n_estimators': stats.randint(50, 200),
    'criterion': ['gini', 'entropy'],
    'max_depth': [None] + list(stats.randint(5, 30).rvs(2)),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'min_samples_leaf': stats.randint(1, 5),
    'min_samples_split': stats.randint(2, 10),
}

rf_random_search = RandomizedSearchCV(rf, param_distributions=rf_param_grid, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
rf_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [19]:
rf_best_params = rf_random_search.best_params_
rf_best_params

{'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 107}

2. Refined **GridSearchCV** with hyperparameters from RandomizedSearchCV

In [20]:
rf_refined_param_grid = {
    'n_estimators': [rf_best_params['n_estimators'] - 1, rf_best_params['n_estimators'], rf_best_params['n_estimators'] + 1],
    'criterion': [rf_best_params['criterion']],
    'max_depth': [rf_best_params['max_depth'] - 2, rf_best_params['max_depth'], rf_best_params['max_depth'] + 2] if rf_best_params['max_depth'] is not None else [None],
    'max_features': [rf_best_params['max_features']],
    'min_samples_leaf': [rf_best_params['min_samples_leaf'] - 1, rf_best_params['min_samples_leaf'], rf_best_params['min_samples_leaf'] + 1],
    'min_samples_split': [rf_best_params['min_samples_split'] - 1, rf_best_params['min_samples_split'], rf_best_params['min_samples_split'] + 1],
}

rf_grid_search_refined = GridSearchCV(rf, rf_refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
rf_grid_search_refined.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


3. Get the best model and make predictions on the test set (from X,y split)

In [21]:
rf_best = rf_grid_search_refined.best_estimator_
rf_pred = rf_best.predict(X_test)

### **Compare**

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [23]:
# Metrics for DecisionTreeClassifier
acc_dt = round(accuracy_score(y_test, dt_pred), 2)
pre_dt = round(precision_score(y_test, dt_pred), 2)
rec_dt = round(recall_score(y_test, dt_pred), 2)
f1_dt = round(f1_score(y_test, dt_pred), 2)

# Metrics for KNeighborsClassifier
acc_knn = round(accuracy_score(y_test, knn_pred), 2)
pre_knn = round(precision_score(y_test, knn_pred), 2)
rec_knn = round(recall_score(y_test, knn_pred), 2)
f1_knn = round(f1_score(y_test, knn_pred), 2)

# Metrics for RandomForestClassifier
acc_rf = round(accuracy_score(y_test, rf_pred), 2)
pre_rf = round(precision_score(y_test, rf_pred), 2)
rec_rf = round(recall_score(y_test, rf_pred), 2)
f1_rf = round(f1_score(y_test, rf_pred), 2)

In [24]:
all_ACCs = [ acc_dt, acc_knn, acc_rf]
all_PREs = [ pre_dt, pre_knn, pre_rf]
all_RECs = [ rec_dt, rec_knn, rec_rf]
all_F1s  = [ f1_dt,  f1_knn, f1_rf]

models = pd.DataFrame({ 'Model': 
                        [
                            'DecisionTreeClassifier',
                            'KNeighborsClassifier',
                            'RandomForestClassifier'
                            ],
                        'Accuracy': all_ACCs,
                        'Precision': all_PREs,
                        'Recall': all_RECs,
                        'F1-Score': all_F1s,
                        })

models.sort_values(by = 'Accuracy', ascending = False, ignore_index = True)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,RandomForestClassifier,0.84,0.95,0.72,0.82
1,DecisionTreeClassifier,0.78,0.79,0.76,0.78
2,KNeighborsClassifier,0.78,0.73,0.87,0.8


## 4. Make predictions on `score.csv` using best model

### **Import applicant data CSV**, and make adjustments

In [25]:
applicants_data = pd.read_csv("../../data/score.csv")

"`get_dummies`" the same columns as we did for the training data

In [26]:
applicants_data = pd.get_dummies(applicants_data, columns=['gender'])

# execute `calculate_scores` on the DF, to calculate the scores
applicants_data = calculate_scores(applicants_data.copy())

applicants_data.fillna(0, inplace=True)

### **Make prediction on the test set**

In [27]:
predicted_damage_incident = rf_grid_search_refined.predict(applicants_data)

## 5. Score applicants

I'm not actually scoring applicants (read `IMPORTANT`)

### **Make new DataFrame with `predicted_damage_incident`**

In [28]:
applicants_data["predicted_damage_incident"] = predicted_damage_incident
applicants_data["applicant_id"] = applicants_data.index

applicants_with_predicted_damage_incident = applicants_data[["applicant_id", 'predicted_damage_incident']]

In [29]:
applicants_with_predicted_damage_incident[["predicted_damage_incident"]].value_counts()

predicted_damage_incident
0                            467
1                             33
Name: count, dtype: int64

In [30]:
applicants_with_predicted_damage_incident

Unnamed: 0,applicant_id,predicted_damage_incident
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0
...,...,...
495,495,0
496,496,0
497,497,0
498,498,0


### **Export dataframe**

In [31]:
applicants_with_predicted_damage_incident.to_csv("../../data/exported/predicted_damage_incident.csv", index=False)

# **IMPORTANT**

in the future I will merge all 3 dataframes with each other, and then calculate the outcome_profit and select the top 200