# Classification random forests

In [1]:
import numpy as np
import pandas as pd
import pickle
import random
import os

from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification

In [2]:
os.chdir(r"C:/Users/mmandziej001/Desktop/FCU/SCRIPTS/predictive_qc_lion_king")

In [3]:
def bayesian_optimization(dataset, function, parameters):
    n_iterations = 10
    gp_params = {"alpha": 1e-4}

    BO = BayesianOptimization(function, parameters)
    BO.maximize(n_iter=n_iterations, **gp_params)

    return BO.max

In [4]:
def rfc_optimization(cv_splits):
    def function(n_estimators, max_depth, min_samples_split):
        return cross_val_score(
               RandomForestClassifier(
                   n_estimators=int(max(n_estimators,0)),                                                               
                   max_depth=int(max(max_depth,1)),
                   min_samples_split=int(max(min_samples_split,2)), 
                   n_jobs=-1, 
                   random_state=42,   
                   class_weight="balanced"),  
               X=test_data, 
               y=test_labels, 
               cv=10,
               scoring="roc_auc",
               n_jobs=-1).mean()

    parameters = {"n_estimators": (1, 1000),
                  "max_depth": (1, 2),
                  "min_samples_split": (5, 10)}
    return function, parameters

In [5]:
parameters = {"n_estimators": (1, 1000),
                  "max_depth": (1, 2),
                  "min_samples_split": (5, 10)}

In [6]:
def function(n_estimators, max_depth, min_samples_split):
        return cross_val_score(
               RandomForestClassifier(
                   n_estimators=int(max(n_estimators,0)),                                                               
                   max_depth=int(max(max_depth,1)),
                   min_samples_split=int(max(min_samples_split,2)), 
                   n_jobs=-1, 
                   random_state=42,   
                   class_weight="balanced"),  
               X=test_data, 
               y=test_labels, 
               cv=10,
               scoring='accuracy',
               n_jobs=-1).mean()

In [7]:
#Train model
def train(X_train, y_train, function, parameters):
    dataset = (X_train, y_train)
    cv_splits = 10
    
    best_solution = bayesian_optimization(dataset, function, parameters)      
    params = best_solution["params"]

    model = RandomForestClassifier(
             n_estimators = int(max(params["n_estimators"], 0)),
             max_depth = int(max(params["max_depth"], 1)),
             min_samples_split = int(max(params["min_samples_split"], 2)), 
             n_jobs = -1, 
             random_state = 42,   
             class_weight = "balanced")    
    model.fit(X_train, y_train)
    
    model_predict = model.predict(test_data)
    return model

def roc_auc_score_FIXED(y_true, y_pred):
    if len(np.unique(y_true)) == 1: # bug in roc_auc_score
        return accuracy_score(y_true, np.rint(y_pred))
    return roc_auc_score(y_true, y_pred)

In [98]:
train_data = pd.read_csv('model_training/data/PC/dataset_dummy_grouped_time_train_cat8.csv')
test_data = pd.read_csv('model_training/data/PC/dataset_dummy_grouped_time_test_cat8.csv')

# drop id of cases
train_id = train_data.pop('Unique')
test_id = test_data.pop('Unique')

train_labels = train_data.pop('Label')
test_labels = test_data.pop('Label')

train_datestamp = train_data.pop('datestamp')
test_datestamp = test_data.pop('datestamp')

In [99]:
boruta_importance = pd.read_excel('model_training/data/ai_assistant_dumps/PC/\
8_FATCA_CRS/Boruta_variable_importance_pc.xlsx')
boruta_importance = boruta_importance.sort_values(by=['normHits', 'meanImp'],
                                                  ascending=True)
boruta_features = list(
    boruta_importance[boruta_importance['decision'] == 'Confirmed'].V7)

In [25]:
#boruta_features = ['ScreenedParties', 'PartyType_Subsidiary', 'Critical_last_3_checklistsDR', 'ESR_Full_ESR_review', 'Minor_last_3_checklistsDR', 'Minor_last_3_checklistsPC', 'Major_last_3_checklistsPC', 'TLAssignedName_Yadav__N___Neha_', 'Cases_last_30_days_of_PC', 'Minor_last_10_checklistsDR', 'Critical_last_10_checklistsDR', 'Cases_last_5_days_of_PC', 'TLAssignedName_Makowska__M_M___Malgorzata_', 'TLAssignedName_Michalik__J___Justyna_', 'TeamExperience', 'HourNumeric', 'GroupCases', 'TLAssignedName_Jurojc__M___Mateusz_', 'ProcessingUnit_Gdansk', 'ProcessingUnit_MidCorp', 'FirstGroupCase', 'TLAssignedName_Jastrzebowska__S___Sonia_', 'Minor_last_10_checklistsPC', 'Major_last_10_checklistsPC', 'TLAssignedName_Marcos_Cantabrana__I___Ivan_', 'TLAssignedName_Wojciechowska__M___Magdalena_', 'ProjectExperience', 'Cases_last_30_days_of_DR', 'Cases_last_5_days_of_DR', 'TLAssignedName_Rybka__I_A___Izabela_Anna_', 'Major_last_10_checklistsDR', 'Major_last_3_checklistsDR', 'Major_last_5_checklistsDR']

In [100]:
test_data = test_data#[boruta_features]  #[boruta_features] # used_columns, boruta_features
train_data = train_data#[boruta_features]  #[boruta_features] # used_columns, boruta_features

In [101]:
print(list(train_data))
print(len(train_labels), sum(train_labels), round(sum(train_labels)/len(train_labels), 2))
print(len(test_labels), sum(test_labels), round(sum(test_labels)/len(test_labels), 2))

['ScreenedParties', 'OwnershipLayers', 'HourNumeric', 'FirstGroupCase', 'CasesGroupPercProcessed', 'AnalystGroupPercProcessed', 'GroupCases', 'PopulationMatch', 'Cases_last_5_days_of_PC', 'Cases_last_30_days_of_PC', 'Cases_last_5_days_of_DR', 'Cases_last_30_days_of_DR', 'Cases_last_5_days_of_CO', 'Cases_last_30_days_of_CO', 'PCODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Major_last_5_checklists', 'PCODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Critical_last_5_checklists', 'PCODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Major_last_10_checklists', 'PCODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Critical_last_10_checklists', 'DRODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Major_last_5_checklists', 'DRODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Major_last_10_checklists', 'COODD_(Office_Due_Diligence)_-_local_requirements_(if_applicable)_Major_last_5_checklists', 'COODD_(Office

In [None]:
best_model_rfc = train(train_data, train_labels, function, parameters)

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.9227  [0m | [0m 1.401   [0m | [0m 7.771   [0m | [0m 898.4   [0m |
| [95m 2       [0m | [95m 0.9228  [0m | [95m 1.409   [0m | [95m 8.377   [0m | [95m 424.4   [0m |
| [0m 3       [0m | [0m 0.9227  [0m | [0m 1.559   [0m | [0m 9.726   [0m | [0m 651.0   [0m |
| [0m 4       [0m | [0m 0.9227  [0m | [0m 1.51    [0m | [0m 9.3     [0m | [0m 880.6   [0m |
| [0m 5       [0m | [0m 0.92    [0m | [0m 1.552   [0m | [0m 9.554   [0m | [0m 587.1   [0m |
| [0m 6       [0m | [0m 0.9228  [0m | [0m 1.247   [0m | [0m 8.547   [0m | [0m 704.2   [0m |
| [0m 7       [0m | [0m 0.9228  [0m | [0m 1.904   [0m | [0m 7.496   [0m | [0m 354.6   [0m |
| [0m 8       [0m | [0m 0.92    [0m | [0m 1.939   [0m | [0m 9.394   [0m | [0m 277.8   [0m |
| [0m 9       [0m | [0m 0.9227  [0m | [0m 1.914   

In [95]:
model = best_model_rfc
#model = RandomForestClassifier(max_depth=2, random_state=5, n_estimators=10000, class_weight="balanced", max_features=1)

model.fit(train_data, train_labels)
model_predict = model.predict(test_data)
test_predictions = model.predict_proba(test_data)[:, 1]
train_predictions = model.predict_proba(train_data)[:,1]
auc_best_model_train = roc_auc_score(train_labels, train_predictions)
auc_best_model_test = roc_auc_score(test_labels, test_predictions)
overfit = round((auc_best_model_train - auc_best_model_test), 3)

print(auc_best_model_train, auc_best_model_test, overfit)

0.9301197023616953 0.90625 0.024


In [96]:
test_predictions_df = pd.DataFrame(test_predictions)
train_predictions_df = pd.DataFrame(train_predictions)

test_results = pd.concat([test_id, test_datestamp, test_predictions_df, test_labels], axis=1)
test_results = test_results.rename(columns={0: "Score"})
test_results = test_results.sort_values(by='Score', axis=0, ascending=False)

SumTest = pd.DataFrame(np.cumsum(test_results['Label']))
SumTest = SumTest.rename(columns={'Label': 'Sum'})
test_results = pd.concat([test_results, SumTest], axis=1)
test_results.reset_index(inplace = True , drop=True)
X_coordinates = pd.DataFrame(np.arange(1,(test_results.shape[0]+1))/(test_results.shape[0]))
X_coordinates = X_coordinates.rename(columns={0: 'X_coordinates'})
y_coordinates = pd.DataFrame(test_results['Sum']/test_results['Sum'].max())
y_coordinates = y_coordinates.rename(columns={'Sum': 'y_coordinates'})
test_results = pd.concat([test_results, X_coordinates,y_coordinates], axis=1)

# train results
train_results = pd.concat([train_id, train_datestamp, train_predictions_df, train_labels], axis=1)
train_results = train_results.rename(columns={0:"Score"})
train_results = train_results.sort_values(by='Score', axis=0, ascending=False)

SumTrain = pd.DataFrame(np.cumsum(train_results['Label']))
SumTrain = SumTrain.rename(columns={'Label':'Sum'})
train_results = pd.concat([train_results, SumTrain], axis=1)
train_results.reset_index(inplace = True , drop=True)

X_coordinates = pd.DataFrame(np.arange(1,(train_results.shape[0]+1))/(train_results.shape[0]))
X_coordinates = X_coordinates.rename(columns={0: 'X_coordinates'})
y_coordinates = pd.DataFrame(train_results['Sum']/train_results['Sum'].max())
y_coordinates = y_coordinates.rename(columns={'Sum': 'y_coordinates'})
train_results = pd.concat([train_results, X_coordinates,y_coordinates], axis=1)

In [97]:
train_results.to_csv(r'model_training/results/PC/crf/8_FATCA_CRS/rf_train_results.csv')
test_results.to_csv(r'model_training/results/PC/crf/8_FATCA_CRS/rf_test_results.csv')
filename = r'model_training/results/PC/crf/8_FATCA_CRS/rfc_model.sav'
pickle.dump(model, open(filename, 'wb'))
print(list(train_data.columns))

['ScreenedParties', 'OwnershipLayers', 'HourNumeric', 'FirstGroupCase', 'CasesGroupPercProcessed', 'AnalystGroupPercProcessed', 'GroupCases', 'PopulationMatch', 'Cases_last_5_days_of_PC', 'Cases_last_30_days_of_PC', 'Cases_last_5_days_of_DR', 'Cases_last_30_days_of_DR', 'Cases_last_5_days_of_CO', 'Cases_last_30_days_of_CO', 'PCODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Major_last_5_checklists', 'PCODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Critical_last_5_checklists', 'PCODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Major_last_10_checklists', 'PCODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Critical_last_10_checklists', 'DRODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Major_last_5_checklists', 'DRODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Major_last_10_checklists', 'COODD_.Office_Due_Diligence._._local_requirements_.if_applicable._Major_last_5_checklists', 'COODD_.Office