## Import Libraries and Datasets

In [1]:
from joblib import dump
import math
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [3]:
# Read the raw training and test CSV files into DataFrames in one step.
df_raw_train, df_raw_test = (
    pd.read_csv('../data/raw/train.csv'),
    pd.read_csv('../data/raw/test.csv'),
)

## Set up constants

In [16]:
# Seed value intended for reproducible random behaviour.
rand_state_ind = 42

# Split proportions (presumably for a train/validation split -- not used in
# the cells shown here; confirm against the rest of the project).
train_df_size = 0.8
validation_df_size = 0.2

# Cross-validation settings handed to GridSearchCV further down:
# number of folds and the scoring metric name.
cv = 10
scoring = 'roc_auc'

## Perform standard data scaling

In [5]:
# Separate the label column and the row identifiers from the feature columns
# without touching the raw frame.
target = df_raw_train['TARGET_5Yrs'].copy()
IDlist_train = df_raw_train['Id'].copy()
df_cleaned_train = df_raw_train.drop(columns=['TARGET_5Yrs', 'Id'])

# Remember the feature names (to re-label scaled arrays later) and the number
# of training rows (used to size the hyperparameter grid).
df_col_names = df_cleaned_train.columns
train_dataset_size = len(IDlist_train)

In [6]:
# Fit the scaler on the training features only, then apply it to those same
# features and wrap the resulting array in a DataFrame that carries the
# original feature names.
scaler = StandardScaler()
scaler.fit(df_cleaned_train)
array_cleaned_train = scaler.transform(df_cleaned_train)
df_cleaned2_train = pd.DataFrame(data=array_cleaned_train, columns=df_col_names)

In [7]:
# Split the identifiers off the test set, then scale the remaining columns
# with the scaler that was already fitted on the training data (no re-fit).
IDlist_test = df_raw_test['Id'].copy()
df_cleaned_test = df_raw_test.drop(columns='Id')
array_cleaned_test = scaler.transform(df_cleaned_test)
df_cleaned2_test = pd.DataFrame(data=array_cleaned_test, columns=df_col_names)

In [8]:
# Persist the fitted scaler so the same transformation can be reapplied later.
dump(value=scaler, filename='../models/scaler.joblib')

['../models/scaler.joblib']

In [9]:
# Class balance of the training labels.
# Uses the vectorised Series.sum() rather than Python's builtin sum(), which
# iterates the Series element by element.
# Assumes `target` holds 0/1 labels with no missing values -- TODO confirm.
y_train_pos_count = target.sum()
y_train_neg_count = target.size - y_train_pos_count

## Set up grid search for Random Forest models

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

### Set up the hyperparameter space 

In [13]:
# Base estimator for the grid search. The seed comes from the shared
# `rand_state_ind` constant instead of a repeated literal, so changing the
# seed in the constants cell affects this model too.
RFModel = RandomForestClassifier(random_state=rand_state_ind)

In [11]:
# n_estimators and min_samples_split are both searched on a base-2 logarithmic
# grid of five values, from 2**1 up to roughly half the training-set size.
# The top exponent is log2(train_dataset_size / 2) rounded to the nearest integer.
max_exponent = round(math.log2(train_dataset_size / 2))

# .astype(int) truncates the fractional powers of two to whole numbers.
n_estimators = np.logspace(start=1, stop=max_exponent, base=2, num=5).astype(int)

# Tree depths 2, 3 and 4 (np.arange excludes `stop`).
max_depth = np.arange(start=2, stop=5)

# Fix: the exponent here used to be round(math.log(n / 2), 2) -- a misplaced
# parenthesis that took the *natural* log and rounded it to two decimals
# instead of taking the base-2 log and rounding to an integer. It now uses the
# same exponent as n_estimators.
min_samples_split = np.logspace(start=1, stop=max_exponent, base=2, num=5).astype(int)

param_grid2 = {
    'max_depth': max_depth,
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
}

In [None]:
# Exhaustive search over `param_grid2` with `cv`-fold cross-validation, scored
# by `scoring`; train-fold scores are kept alongside the validation scores.
clf12 = GridSearchCV(
    estimator=RFModel,
    param_grid=param_grid2,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
)
clf12.fit(df_cleaned2_train, target)

# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is displayed), so report the diagnostics
# explicitly.
print('Best estimator:', clf12.best_estimator_)
print('Best mean cross-validated score:', clf12.best_score_)
# This score is computed on the same data the search was fitted on, so it is
# optimistic; prefer the cross-validated score above for model selection.
print('Score on the training data:', clf12.score(df_cleaned2_train, target))
print('cv_results_ keys:', list(clf12.cv_results_.keys()))

# Predicted probability of the second class (column 1) for each test row.
test_grid_2 = clf12.predict_proba(df_cleaned2_test)[:, 1]

Fitting 10 folds for each of 75 candidates, totalling 750 fits
[CV 1/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.658, test=0.660) total time=   0.0s
[CV 2/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.652, test=0.610) total time=   0.0s
[CV 3/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.658, test=0.660) total time=   0.0s
[CV 4/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.667, test=0.635) total time=   0.0s
[CV 5/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.661, test=0.630) total time=   0.0s
[CV 6/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.669, test=0.642) total time=   0.0s
[CV 7/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.657, test=0.584) total time=   0.0s
[CV 8/10] END max_depth=2, min_samples_split=2, n_estimators=2;, score=(train=0.662, test=0.692) total time=   0.0s
[CV 9/10]

In [None]:
# Persist the whole fitted GridSearchCV object.
dump(value=clf12, filename='../models/randforest_10cv_gridsearch_basic.joblib')

In [None]:
# Persist only the best estimator found by the grid search.
dump(value=clf12.best_estimator_, filename='../models/randforest_10cv_gridsearch_best.joblib')