## 0. Configs

In [9]:
import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so it also hides scikit-learn
# deprecation / failed-fit warnings raised during the hyperparameter searches —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## 1. Import libraries and data

In [10]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats


# Load the training data; the path is relative to the directory the notebook runs in.
df = pd.read_csv("../../data/cleaned/train_V2_cleaned.csv")

572 of the 2,515 guests in the training data (about 23%) damaged their room, so the two classes are imbalanced.

In [11]:
# Row count per value of the target column (presumably 1 = a damage incident occurred).
df["outcome_damage_inc"].value_counts()

outcome_damage_inc
0    1943
1     572
Name: count, dtype: int64

## 2. Modelling

### **X, y split**

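Separate the feature columns (`X`) from the target (`y`). All three outcome columns are dropped from `X` so that no outcome information leaks into the features.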

In [12]:
# The three outcome columns are excluded from the feature matrix;
# only `outcome_damage_inc` is used as the target in this notebook.
outcome_columns = ['outcome_damage_inc', "outcome_damage_amount", "outcome_profit"]

y = df['outcome_damage_inc']
X = df.drop(columns=outcome_columns)

In [13]:
# Sample the classes with `SMOTE`

smote = SMOTE(random_state=1234)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.2, random_state=1234)

In [15]:
# Names of feature columns, kept in their original order.
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook — presumably intended for scaling numeric features; confirm or remove.
num_feat = [
    'income_am', 'profit_last_am', 'profit_am', 'damage_am', 'damage_inc',
    'crd_lim_rec', 'credit_use_ic', 'gluten_ic', 'lactose_ic', 'insurance_ic',
    'spa_ic', 'empl_ic', 'cab_requests', 'married_cd', 'bar_no',
    'sport_ic', 'neighbor_income', 'age', 'marketing_permit', 'urban_ic',
    'dining_ic', 'presidential', 'client_segment', 'sect_empl', 'prev_stay',
    'prev_all_in_stay', 'divorce', 'fam_adult_size', 'children_no', 'tenure_mts',
    'tenure_yrs', 'company_ic', 'claims_no', 'claims_am', 'nights_booked',
    'shop_am', 'shop_use', 'retired', 'gold_status', 'gender_M',
    'gender_V',
]

### **Train the model**

#### 1. Get the best hyperparameters with **RandomizedSearchCV**

In [16]:
# Base estimator, seeded so repeated runs grow the same forests.
rf = RandomForestClassifier(random_state=1234)

# Search space for the randomized search. Lists are sampled uniformly;
# `stats.randint(low, high)` draws integers from [low, high).
param_dist = {
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    # Fixed candidate list rather than two unseeded random draws made while
    # building the dict, which changed the search space itself on every run.
    'max_depth': [None] + list(range(10, 30)),
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3 (for
    # classifiers it was an alias of 'sqrt'); keeping it makes those candidates
    # fail to fit on current versions.
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': stats.randint(1, 5),
    'min_samples_split': stats.randint(2, 10),
    'n_estimators': stats.randint(50, 200),
}

# NOTE(review): X_train has already been oversampled with SMOTE, so synthetic
# rows can sit in both the fitting and validation folds and the CV accuracy is
# optimistic. Wrapping SMOTE + the forest in an imblearn Pipeline would resample
# inside each fold instead.
random_search_rf = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=1,
    scoring='accuracy',
    n_jobs=-1,
    random_state=1234,  # makes the sampled candidates reproducible
)
random_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [17]:
# Parameter setting that achieved the best mean cross-validated score in the
# randomized search; displayed here and used to build the refined grid.
best_params_rf = random_search_rf.best_params_
best_params_rf

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 27,
 'max_features': 'log2',
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 164}

#### 2. Refined **GridSearchCV** with hyperparameters from RandomizedSearchCV

In [18]:
def _around(center, step, lower_bound):
    """Return the sorted, de-duplicated values center-step, center, center+step.

    Values below `lower_bound` are clamped to it, so the grid never contains a
    setting the estimator rejects (e.g. min_samples_leaf=0 or
    min_samples_split=1 when the best value sits at the edge of its range).
    """
    center = int(center)
    return sorted({max(lower_bound, center + offset) for offset in (-step, 0, step)})


best_depth = best_params_rf['max_depth']

# Narrow grid around the randomized-search winner: categorical choices are
# frozen, numeric ones are varied by one step in each direction.
refined_param_grid = {
    'bootstrap': [best_params_rf['bootstrap']],
    'criterion': [best_params_rf['criterion']],
    'max_depth': [None] if best_depth is None else _around(best_depth, 2, 1),
    'max_features': [best_params_rf['max_features']],
    'min_samples_leaf': _around(best_params_rf['min_samples_leaf'], 1, 1),    # must be >= 1
    'min_samples_split': _around(best_params_rf['min_samples_split'], 1, 2),  # must be >= 2
    'n_estimators': _around(best_params_rf['n_estimators'], 20, 1),
}

grid_search_rf_refined = GridSearchCV(rf, refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
grid_search_rf_refined.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


#### 3. Get the best model and make predictions on the test set (from the train/test split)

In [19]:
# Best-scoring candidate from the refined grid search. `refit` is left at its
# default, so this estimator has been refit on all of X_train.
best_rf = grid_search_rf_refined.best_estimator_
# Class predictions for the held-out test rows.
y_pred_rf = best_rf.predict(X_test)

#### 4. Evaluation on the test set (from the train/test split)

In [20]:
# Summarise hold-out performance: per-class precision/recall/F1 plus the raw
# confusion matrix (rows = true class, columns = predicted class).
report_text = classification_report(y_test, y_pred_rf)
conf_matrix = confusion_matrix(y_test, y_pred_rf)

print(f"Classification Report: \n {report_text}")
print(f"Confusion Matrix: \n {conf_matrix}")

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.96      0.86       395
           1       0.94      0.74      0.83       383

    accuracy                           0.85       778
   macro avg       0.87      0.85      0.85       778
weighted avg       0.86      0.85      0.85       778

Confusion Matrix: 
 [[378  17]
 [101 282]]


## 3. Make predictions on `score.csv`

### **Import applicant data CSV**, and make adjustments

In [21]:
# Load the applicant rows to be scored; the path is relative to the directory
# the notebook runs in.
applicants_data = pd.read_csv("../../data/score.csv")

"`get_dummies`" the same columns as we did for the training data

In [22]:
# One-hot encode `gender`, as was done for the training data.
applicants_encoded = pd.get_dummies(applicants_data, columns=['gender'])

# Align with the training feature matrix: same columns in the same order.
# A dummy column missing here (e.g. a gender level that does not occur in
# score.csv) is added as all zeros, and columns the model never saw are dropped,
# so `predict` cannot fail or misread features because of a layout mismatch.
# Remaining missing values are filled with 0, without mutating in place.
# NOTE(review): confirm 0-imputation matches how the training data was cleaned.
applicants_data = (
    applicants_encoded
    .reindex(columns=X.columns, fill_value=0)
    .fillna(0)
)

### **Make predictions on the applicant data**

In [23]:
# Predict the damage-incident class for every applicant row. The frame must
# contain exactly the feature columns the model was fitted on.
predicted_damage_incident = best_rf.predict(applicants_data)

## 4. Score applicants

I'm not actually scoring applicants (read `IMPORTANT`)

### **Make new DataFrame with `predicted_damage_incident`**

In [24]:
# Build the result table as its own frame instead of adding columns to
# `applicants_data`: that frame is the model's input, and extra columns would
# make re-running the prediction cell fail on unseen feature names.
# `applicant_id` is simply the row index of the scored file.
applicants_with_predicted_damage_incident = pd.DataFrame(
    {
        "applicant_id": applicants_data.index,
        "predicted_damage_incident": predicted_damage_incident,
    },
    index=applicants_data.index,
)

In [25]:
# Number of applicants per predicted class. The double brackets select a
# one-column DataFrame, so this calls DataFrame.value_counts.
applicants_with_predicted_damage_incident[["predicted_damage_incident"]].value_counts()

predicted_damage_incident
0                            485
1                             15
Name: count, dtype: int64

In [26]:
# Display the result table (pandas truncates long frames in the rendered output).
applicants_with_predicted_damage_incident

Unnamed: 0,applicant_id,predicted_damage_incident
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0
...,...,...
495,495,0
496,496,0
497,497,0
498,498,0


### **Export dataframe**

In [27]:
# Write the predictions to CSV. The DataFrame index is omitted; `applicant_id`
# already carries the row identifier.
applicants_with_predicted_damage_incident.to_csv("../../data/exported/predicted_damage_incident.csv", index=False)