In [1]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler
# Set random seed for reproducability
np.random.seed(500)

In [2]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [3]:
# TF-IDF vectorise

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["Document"])
train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

In [4]:
# Gaussian Naieve Bayes predictions
gnb = GaussianNB()
gnb.fit(train_x.toarray(), train_y)

# Predict the labels on validation dataset
predictions_gnb = gnb.predict(test_x.toarray())

# Get scores

precision = precision_score(test_y, predictions_gnb)
bac = balanced_accuracy_score(test_y, predictions_gnb)
f2 = fbeta_score(test_y, predictions_gnb, beta=2.0)

print(precision,bac,f2)

0.291044776119403 0.63623241570704 0.38235294117647056


In [5]:
scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
gnb = GaussianNB()

results = model_selection.cross_validate(estimator=gnb, X=train_x.toarray(), y=train_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.26015199662347005 0.5811281614494528 0.28521555977384894


In [6]:
over_sampler = RandomOverSampler(random_state=5)
balanced_x, balanced_y = over_sampler.fit_resample(train_x, train_y)

scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
gnb = GaussianNB()

results = model_selection.cross_validate(estimator=gnb, X=balanced_x.toarray(), y=balanced_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)



0.7080733732875848 0.9228613413361024 0.8768366601011562




In [7]:
# Grid search and predict
over_sampler = RandomOverSampler(random_state=5)
balanced_x, balanced_y = over_sampler.fit_resample(train_x, train_y)

kfold = model_selection.KFold(n_splits=5)
gnb = GaussianNB()

gnb.fit(balanced_x.toarray(),balanced_y)
predictions = gnb.predict(test_x.toarray())

precision = precision_score(test_y, predictions)
bac = balanced_accuracy_score(test_y, predictions)
f2 = fbeta_score(test_y, predictions, beta=2.0)
print(precision,bac,f2)

0.28346456692913385 0.6232734696481547 0.3578528827037773


In [6]:
# Best 5-Fold Cross Validation Undersampled Balanced Dataset

scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

gnb_classifier = make_pipeline(RandomUnderSampler(random_state=42), 
                              GaussianNB())

                             
kfold = model_selection.KFold(n_splits=5)
results = model_selection.cross_validate(estimator=gnb_classifier, X=train_x.toarray(), y=train_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.19904335126557948 0.607910025075013 0.4174335392039775
