In [74]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE 

# Set random seed for reproducability
np.random.seed(500)

In [75]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [76]:
# TF-IDF vectorise

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["Document"])
train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

In [14]:
# Random Forest Predictions
random_forest = RandomForestClassifier(max_depth=200, random_state=5)
random_forest.fit(train_x,train_y)

predictions_random_forest = random_forest.predict(test_x)

precision = precision_score(test_y, predictions_random_forest)
bac = balanced_accuracy_score(test_y, predictions_random_forest)
f2 = fbeta_score(test_y, predictions_random_forest, beta=2.0)

print(precision,bac,f2)

0.6666666666666666 0.5395546907397365 0.10309278350515463


In [7]:
scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
random_forest = RandomForestClassifier(max_depth=200, random_state=5)

results = model_selection.cross_validate(estimator=random_forest, X=train_x, y=train_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.8914285714285715 0.5325022235884669 0.08105602286956633


In [18]:
over_sampler = RandomOverSampler(random_state=5)
balanced_x, balanced_y = over_sampler.fit_resample(train_x, train_y)

scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
random_forest = RandomForestClassifier(max_depth=200, random_state=5)

results = model_selection.cross_validate(estimator=random_forest, X=balanced_x, y=balanced_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.9366493886230728 0.9932355738080961 0.984544927055703


In [9]:
# -----------------------------------------------------
# -- TRY RANDOM FORREST WITH AN UNDERSAMPLED DATASET -- 
# -----------------------------------------------------
# -- PERFORMS WORSE THAN THE OVERSAMPLED DATASET ------
# -----------------------------------------------------

In [10]:
under_sampler = RandomUnderSampler(random_state=5)
balanced_x, balanced_y = under_sampler.fit_resample(train_x, train_y)

scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
random_forest = RandomForestClassifier(max_depth=200, random_state=5)

results = model_selection.cross_validate(estimator=random_forest, X=balanced_x, y=balanced_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)



0.5324675324675325 0.37274935666875414 0.2681913519490003




In [11]:
over_sampler = RandomOverSampler(random_state=5)
balanced_x, balanced_y = over_sampler.fit_resample(train_x, train_y)

random_forest = RandomForestClassifier(max_depth=200, random_state=5)
predictions = random_forest.fit(balanced_x,balanced_y).predict(test_x)
precision = precision_score(test_y, predictions)
bac = balanced_accuracy_score(test_y, predictions)
f2 = fbeta_score(test_y, predictions, beta=2.0)
print(precision,bac,f2)

0.7619047619047619 0.5813582570416919 0.20151133501259444


In [12]:
under_sampler = RandomUnderSampler(random_state=5)
balanced_x, balanced_y = under_sampler.fit_resample(train_x, train_y)

random_forest = RandomForestClassifier(max_depth=200, random_state=5)
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
kfold = model_selection.KFold(n_splits=5)
grid_search = GridSearchCV(estimator = random_forest, param_grid = param_grid, cv = kfold, n_jobs = -1, verbose = 2)
grid_search.fit(balanced_x, balanced_y)
print(grid_search.best_params_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
{'bootstrap': True, 'max_depth': 80, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}


In [28]:
imba_pipeline = make_pipeline(RandomOverSampler(random_state=42), 
                              RandomForestClassifier(max_depth=200,random_state=5))
                              
kfold = model_selection.KFold(n_splits=5)


results = model_selection.cross_validate(estimator=imba_pipeline, X=train_x, y=train_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.7819047619047619 0.5543095796123796 0.1361373498586968


In [61]:
# BEST BAC SO FAR 

imba_pipeline = make_pipeline(RandomOverSampler(random_state=42), 
                              RandomForestClassifier(max_depth=200,random_state=5))

params = {
    'n_estimators': [50, 100, 200,400,600],
    'max_depth': [4, 6, 10, 12,50,100,200],
    'random_state': [5]
}

new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kfold, scoring='recall',
                        return_train_score=True)
grid_imba.fit(train_x, train_y)
y_test_predict = grid_imba.best_estimator_.named_steps['randomforestclassifier'].predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.32620320855614976 0.7300153114931895 0.541740674955595


In [95]:
imba_pipeline = make_pipeline(RandomOverSampler(random_state=42), 
                              RandomForestClassifier(max_depth=200,random_state=5))

params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10, 12,50,100,200],
    'random_state': [5]
}

new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kfold, scoring='balanced_accuracy',
                        return_train_score=True)
grid_imba.fit(train_x, train_y)
y_test_predict = grid_imba.best_estimator_.named_steps['randomforestclassifier'].predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.2857142857142857 0.6929248141886504 0.4895104895104895


In [78]:
# BEST BASELINE RANDOM FOREST

rf_classifier = make_pipeline(RandomUnderSampler(random_state=42), 
                              RandomForestClassifier(max_depth=4, n_estimators=400,random_state=5))

rf_classifier.fit(train_x, train_y)
y_test_predict = rf_classifier.predict(test_x)
precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.26848249027237353 0.7260917413633609 0.5450236966824644


In [94]:
# Best 5-Fold Cross Validation Undersampled Balanced Dataset

rf_classifier = make_pipeline(RandomUnderSampler(random_state=42), 
                              RandomForestClassifier(max_depth=30, n_estimators=10000,random_state=5))

                             
kfold = model_selection.KFold(n_splits=5)
results = model_selection.cross_validate(estimator=rf_classifier, X=train_x, y=train_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.2647134547458486 0.6998848625521494 0.5240874581709378
