In [23]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline, make_pipeline
import tensorflow as tf
import tensorflow_text as text
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Set random seed for reproducability
np.random.seed(500)

In [24]:
data = pd.read_csv("all_formalities_preprocessed.csv")
columns = ["%.0f" % number for number in np.arange(0,1337)]
train_x, test_x, train_y, test_y = model_selection.train_test_split(data[columns],data['Sensitivity'],test_size=0.2,random_state=5)

In [None]:
imba_pipeline = make_pipeline(RandomUnderSampler(random_state=42), SVC())

params = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

kfold = model_selection.KFold(n_splits=5)

new_params = {'svc__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kfold, scoring='balanced_accuracy', return_train_score=True)

grid_imba.fit(train_x, train_y)
y_test_predict = grid_imba.best_estimator_.named_steps['svc'].predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

In [25]:
imba_pipeline = make_pipeline(RandomUnderSampler(random_state=42), SVC(C=10,gamma=0.001,kernel="rbf",probability=True))
imba_pipeline.fit(train_x, train_y)
formalities_probs = imba_pipeline.predict_proba(test_x)

In [26]:
# TF-IDF setup


# Load in dataset
string_data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
string_data = string_data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(string_data['Document'],string_data['Sensitivity'],test_size=0.2,random_state=5)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(string_data["Document"])
train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

imba_pipeline = make_pipeline(RandomUnderSampler(random_state=42), SVC(C=1.0, kernel='rbf', gamma=1, probability=True))

imba_pipeline.fit(train_x,train_y)
tfidf_probs = imba_pipeline.predict_proba(test_x)

In [35]:
combined_predictions = []
for i in range(len(formalities_probs)):
    zero_formalities = formalities_probs[i][0]
    one_formalities = formalities_probs[i][1]
    zero_tfidf = tfidf_probs[i][0]
    one_tfidf = tfidf_probs[i][1]

    zero_prob = zero_formalities + zero_tfidf
    one_prob = one_formalities + one_tfidf

    if zero_prob >=  one_prob:
        combined_predictions.append(0)
    else:
        combined_predictions.append(1)

In [37]:
combined_predictions = []
for i in range(len(formalities_probs)):
    zero_formalities = formalities_probs[i][0]
    one_formalities = formalities_probs[i][1]
    zero_tfidf = tfidf_probs[i][0]
    one_tfidf = tfidf_probs[i][1]

    formalities_sureness =  max(formalities_probs[i])
    tfidf_sureness = max(tfidf_probs[i])

    if formalities_sureness >=  tfidf_sureness:
        if zero_formalities >= one_formalities:
            combined_predictions.append(0)
        else:
            combined_predictions.append(1)
    else:
        if zero_tfidf >= one_tfidf:
            combined_predictions.append(0)
        else:
            combined_predictions.append(1)

In [38]:
precision = precision_score(test_y, combined_predictions)
bac = balanced_accuracy_score(test_y, combined_predictions)
f2 = fbeta_score(test_y, combined_predictions, beta=2.0)
print(precision,bac,f2)

0.26881720430107525 0.7460126319818814 0.5725190839694656
