In [29]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score

# Set random seed for reproducability
np.random.seed(500)

In [30]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [31]:
# TF-IDF vectorise

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["Document"])
train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

print(test_x)

  (0, 4975)	0.009245690727422677
  (0, 4974)	0.01706343790200572
  (0, 4972)	0.016237620055009852
  (0, 4964)	0.05287056470777404
  (0, 4963)	0.018180594211422933
  (0, 4957)	0.040695834267672315
  (0, 4948)	0.009552527932021843
  (0, 4939)	0.010961585746022141
  (0, 4935)	0.06041900460981615
  (0, 4922)	0.005825905868072858
  (0, 4911)	0.016362642893482365
  (0, 4909)	0.019037242400552566
  (0, 4907)	0.010130514557107253
  (0, 4906)	0.017220478890059107
  (0, 4898)	0.035591904747449304
  (0, 4879)	0.03237454251446588
  (0, 4869)	0.01807430045647964
  (0, 4863)	0.0256722863795706
  (0, 4822)	0.019795040117009527
  (0, 4813)	0.02907881978915265
  (0, 4796)	0.021152453378268637
  (0, 4795)	0.03897407641913441
  (0, 4755)	0.011555002516857335
  (0, 4742)	0.011738778138925546
  (0, 4741)	0.009466337853536635
  :	:
  (760, 744)	0.033818347013271965
  (760, 736)	0.014581795113579113
  (760, 619)	0.024593600440595106
  (760, 543)	0.33399559931260675
  (760, 541)	0.06444599119380902
  (760, 51

In [32]:
# SVM predictions
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_x,train_y)
# Predict the labels on validation dataset
predictions_SVM = SVM.predict(test_x)

In [33]:
# Get scores

precision = precision_score(test_y, predictions_SVM)
bac = balanced_accuracy_score(test_y, predictions_SVM)
f2 = fbeta_score(test_y, predictions_SVM, beta=2.0)

print(precision,bac,f2)

0.5 0.5274171424925835 0.07731958762886597
