In [2]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Set random seed for reproducability
np.random.seed(500)

In [3]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [4]:
# TF-IDF vectorise

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["Document"])
train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

In [5]:
# SVM predictions
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_x,train_y)
# Predict the labels on validation dataset
predictions_SVM = SVM.predict(test_x)

In [6]:
# Get scores

precision = precision_score(test_y, predictions_SVM)
bac = balanced_accuracy_score(test_y, predictions_SVM)
f2 = fbeta_score(test_y, predictions_SVM, beta=2.0)

print(precision,bac,f2)

0.5 0.5274171424925835 0.07731958762886597


In [7]:
# ---------------------------------------------
# ----- NOW TRYING 5 FOLD CROSS VALIDATION ---- 
# ---------------------------------------------

In [8]:
scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
supp_vec = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

results = model_selection.cross_validate(estimator=supp_vec, X=train_x, y=train_y, cv=kfold, scoring=scoring)

In [9]:
f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

0.9083333333333332 0.5383835304383485 0.09529619813136071


In [10]:
# ---------------------------------------------------
# ----------- NOW TRYING BALANCED DATASET ----------- 
# ---------------------------------------------------

In [5]:
over_sampler = RandomOverSampler(random_state=5)
balanced_x, balanced_y = over_sampler.fit_resample(train_x, train_y)

scoring = {
"f2_score" : make_scorer(fbeta_score, beta=2.0),
"precision" : make_scorer(precision_score),
"bal_acc" : make_scorer(balanced_accuracy_score)
}

kfold = model_selection.KFold(n_splits=5)
supp_vec = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

results = model_selection.cross_validate(estimator=supp_vec, X=balanced_x, y=balanced_y, cv=kfold, scoring=scoring)

f2 = np.mean(results['test_f2_score'])
precision = np.mean(results['test_precision'])
bac = np.mean(results['test_bal_acc'])
print(precision,bac,f2)

In [5]:
imba_pipeline = make_pipeline(RandomUnderSampler(random_state=42), SVC())

params = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

kfold = model_selection.KFold(n_splits=5)

new_params = {'svc__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kfold, scoring='balanced_accuracy', return_train_score=True)

grid_imba.fit(train_x, train_y)
y_test_predict = grid_imba.best_estimator_.named_steps['svc'].predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

KeyboardInterrupt: 

In [13]:
grid_imba.best_params_

{'svc__C': 1, 'svc__gamma': 1, 'svc__kernel': 'rbf'}