In [2]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
import math
import random

In [3]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

In [4]:
# TF-IDF vectorise

Tfidf_vect = TfidfVectorizer(max_features=5000000)
Tfidf_vect.fit(data["Document"])
tf_idfs = Tfidf_vect.transform(data["Document"])
tf_idfs = np.array(tf_idfs.todense())

In [None]:
###########################################
## RANDOM RESAMPLING THEN RANDOM SAMPLE  ##
###########################################

formalities_data = pd.read_csv("gensim_formalities.csv")
formalities_data = np.array(formalities_data)
for i in range(len(formalities_data)):
    arr = formalities_data[i]
    non_nans = arr[~np.isnan(arr)]
    formalities_data[i] = [random.choice(non_nans) if math.isnan(x) else x for x in arr]

formalities_data = pd.DataFrame(formalities_data)
random_formalities = np.array(formalities_data.sample(n=10,axis='columns'))

combined_dataset = np.hstack((tf_idfs,random_formalities))
sensitivities = np.array(data["Sensitivity"])
sensitivities = sensitivities.reshape(-1,1)

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(combined_dataset,sensitivities,test_size=0.2,random_state=5)

In [5]:
##############################
## REPEAT AND RANDOM SAMPLE ##
##############################

formalities_data = pd.read_csv("gensim_formalities.csv")
formalities_data = np.array(formalities_data)
non_nan_formalities_data = []
for i in range(len(formalities_data)):
    arr = np.array(formalities_data[i][~np.isnan(formalities_data[i])])
    arr = np.resize(arr, 1337)
    non_nan_formalities_data.append(arr)
    


formalities_data = pd.DataFrame(non_nan_formalities_data)

rand_int = random.randint(0,(1337-100))
columns = np.arange(rand_int,(rand_int+100))
random_formalities = np.array(formalities_data.iloc[:,columns])


combined_dataset = np.hstack((tf_idfs,random_formalities))
sensitivities = np.array(data["Sensitivity"])
sensitivities = sensitivities.reshape(-1,1)

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(combined_dataset,sensitivities,test_size=0.2,random_state=5)

In [6]:
imba_pipeline = make_pipeline(RandomOverSampler(random_state=42), SVC())
imba_pipeline.fit(train_x, train_y)
y_test_predict = imba_pipeline.predict(test_x)
precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.14642857142857144 0.5389246865928738 0.3125


In [7]:
imba_pipeline = make_pipeline(RandomOverSampler(random_state=42), SVC())

params = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

kfold = model_selection.KFold(n_splits=5)

new_params = {'svc__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kfold, scoring='balanced_accuracy', return_train_score=True)

grid_imba.fit(train_x, train_y)
y_test_predict = grid_imba.best_estimator_.named_steps['svc'].predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

In [None]:
scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

results = model_selection.cross_validate(estimator=grid_imba.best_estimator_, X=train_x, y=train_y, cv=kfold, scoring=scoring)

In [None]:
f2 = np.round(np.mean(results['test_f2_score']),4)
precision = np.round(np.mean(results['test_precision']),4)
bac = np.round(np.mean(results['test_bal_acc']),4)
print(precision,bac,f2)

0.2019 0.6193 0.4342


In [None]:
# CURRENTLY RUNNING REPEAT AND RANDOM SAMPLE ON GENSIM FORMALITIES WITH OVERSAMPLER 23/1/22 20:55
grid_imba.best_params_

{'svc__C': 10, 'svc__gamma': 1, 'svc__kernel': 'linear'}