In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
import pandas as pd
import pickle 
import numpy as np
from tqdm import tqdm
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import roc_auc_score

In [26]:
def get_pickles(): 
    x_train = pickle.load(open(f'../../data/processed/pickles/cluster_x_train.p', 'rb'))
    x_test = pickle.load(open(f'../../data/processed/pickles/cluster_x_test.p', 'rb'))
    y_train = pickle.load(open(f'../../data/processed/pickles/cluster_y_train.p', 'rb'))
    y_test = pickle.load(open(f'../../data/processed/pickles/cluster_y_test.p', 'rb'))
    X = pickle.load(open('../../data/processed/pickles/cluster_X.p', 'rb'))
    y = pickle.load(open('../../data/processed/pickles/cluster_y.p', 'rb'))

    return (x_train, x_test, y_train, y_test), (X,y)

(x_train, x_test, y_train, y_test), (X,y) = get_pickles()


print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

Original X: (57247, 55)	Original y: (57247, 1)
Train X: (42935, 54)	Train y: (42935, 1)
Test X: (14312, 54)	Test y: (14312, 1)


In [27]:
# rf = RandomForestClassifier()


# rf.fit(x_train,y_train)
# print(f'Original: {rf.score(x_test,y_test)}')
# model = SelectFromModel(rf, prefit = True)
# model.support_

        


In [31]:
rf = RandomForestClassifier()
best_score = 0 
best_i = 0
r_back = [i for i in range(8,53)]
pbar = tqdm(r_back)
for i in pbar: 
    pbar.set_description(f'Testing {i} Features || Best AUC: {best_score} || Best n: {best_i}')
    rfe = RFE(rf, n_features_to_select = i, step = 3)

    rfe.fit(x_train,y_train)
    pred = rfe.predict(x_test)
    roc_auc = roc_auc_score(y_test, pred)
    if roc_auc > best_score: 
        best_score = roc_auc 
        best_i = i
    #masks for columns that are important
        column_masks = rfe.support_

        orig_columns = x_train.columns
        new_columns = [x for x,y in zip(orig_columns, column_masks) if y == True]
        pickle.dump(new_columns, open('../../models/RFColumns.p', 'wb'))

        


Testing 52 Features || Best AUC: 0.8358247718868206 || Best n: 49: 100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [26:26<00:00, 35.26s/it]


In [None]:

param_grid = {
    'n_estimators': [None, 75, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 100, 500],
    'max_features': [None,'auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1,5, 10], 
    'bootstrap': [True, False]
}
tuned_forest = RandomForestClassifier()
gs = GridSearchCV(tuned_forest, param_grid, verbose = 1, n_jobs =10)
gs.fit(x_train, y_train) 


In [None]:
pickle.dump(gs, open('../../models/RFGridSearch.p', 'wb'))
gs.best_estimator_

In [None]:
from sklearn.feature_selection import RFE
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
rf.fit(x_train, y_train)

In [None]:
rf.score(x_test, y_test)