In [12]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
# Set random seed for reproducability
np.random.seed(500)

In [13]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [14]:
# TF-IDF vectorise

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["Document"])
train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

In [15]:
# Random Forest Predictions
random_forest = RandomForestClassifier(max_depth=200, random_state=5)
random_forest.fit(train_x,train_y)

predictions_random_forest = random_forest.predict(test_x)

precision = precision_score(test_y, predictions_random_forest)
bac = balanced_accuracy_score(test_y, predictions_random_forest)
f2 = fbeta_score(test_y, predictions_random_forest, beta=2.0)

print(precision,bac,f2)

0.6666666666666666 0.5395546907397365 0.10309278350515463


In [16]:
scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
random_forest = RandomForestClassifier(max_depth=200, random_state=5)

results = model_selection.cross_validate(estimator=random_forest, X=train_x, y=train_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.8914285714285715 0.5325022235884669 0.08105602286956633


In [17]:
over_sampler = RandomOverSampler(random_state=5)
balanced_x, balanced_y = over_sampler.fit_resample(train_x, train_y)

scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
random_forest = RandomForestClassifier(max_depth=200, random_state=5)

results = model_selection.cross_validate(estimator=random_forest, X=balanced_x, y=balanced_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.9366493886230728 0.9932355738080961 0.984544927055703
