In [3]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline, make_pipeline
import tensorflow as tf
import tensorflow_text as text
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Set random seed for reproducability
np.random.seed(500)

In [4]:
string_data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
string_data = string_data[["Filename","Date","Document"]]
formalities_data = pd.read_csv("all_formalities_preprocessed.csv")

x = pd.concat([string_data,formalities_data],axis=1)
y = formalities_data["Sensitivity"]

Tfidf_vect = TfidfVectorizer(max_features=5000)
document_feature = Tfidf_vect.fit_transform(string_data["Document"])

text_feature_df = pd.DataFrame(document_feature.todense(), columns = Tfidf_vect.get_feature_names_out())
x = pd.concat([text_feature_df,formalities_data[formalities_data.columns[:-1]]],axis=1)

train_x, test_x, train_y, test_y = model_selection.train_test_split(x,y,test_size=0.2,random_state=5)
train_x = np.array(train_x)
test_x = np.array(test_x)

In [5]:
tfidf_cols = np.arange(0,5000)
formality_cols = np.array(x.columns[5000:],dtype=np.int)

In [44]:
tfidf_pipeline = make_pipeline(ColumnSelector(cols=tfidf_cols),RandomUnderSampler(random_state=42), SVC(C=1.0, kernel='poly', gamma=1, probability=True))
formalities_pipeline = make_pipeline(ColumnSelector(cols=formality_cols), RandomUnderSampler(random_state=42), SVC(C=10,gamma=0.001,kernel="poly",probability=True))
vote_classifier = EnsembleVoteClassifier(clfs=[tfidf_pipeline,formalities_pipeline],voting='soft')

In [45]:
vote_classifier.fit(train_x,train_y)
y_test_predict = vote_classifier.predict(test_x)

In [46]:
precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.25868725868725867 0.7124549427413953 0.5275590551181102


In [7]:
# GRID SEARCH ON VOTING CLASSIFIER

In [8]:
tfidf_pipeline = make_pipeline(ColumnSelector(cols=tfidf_cols),RandomUnderSampler(random_state=42), SVC(probability=True))
formalities_pipeline = make_pipeline(ColumnSelector(cols=formality_cols), RandomUnderSampler(random_state=42), SVC(probability=True))
vote_classifier = EnsembleVoteClassifier(clfs=[tfidf_pipeline,formalities_pipeline],voting='soft')

kfold = model_selection.KFold(n_splits=5)

params = {'pipeline-1__svc__C': [0.1, 1, 10],
              'pipeline-1__svc__gamma': [1, 0.1, 0.01, 0.001],
              'pipeline-1__svc__kernel': ['linear','rbf'],
              'pipeline-2__svc__C': [1, 10, 100],
              'pipeline-2__svc__gamma': [1, 0.1, 0.01, 0.001],
              'pipeline-2__svc__kernel': ['linear', 'rbf']}

grid = GridSearchCV(estimator=vote_classifier,cv=kfold,param_grid = params, scoring="balanced_accuracy",n_jobs=-1)   
grid.fit(train_x,train_y)         

In [None]:
y_test_predict = grid.best_estimator_.predict(test_x)
precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

In [None]:
print(grid.best_params_)

In [47]:
np.argwhere(np.isnan(train_x))

array([], shape=(0, 2), dtype=int64)

In [6]:
tfidf_pipeline = make_pipeline(ColumnSelector(cols=tfidf_cols),StandardScaler(),RandomUnderSampler(random_state=42), SVC(C=1.0, kernel='poly', gamma=1, probability=True))
formalities_pipeline = make_pipeline(ColumnSelector(cols=formality_cols),StandardScaler(), RandomUnderSampler(random_state=42), SVC(C=10,gamma=0.001,kernel="poly",probability=True))
vote_classifier = EnsembleVoteClassifier(clfs=[tfidf_pipeline,formalities_pipeline],voting='soft')

vote_classifier.fit(train_x,train_y)
y_test_predict = vote_classifier.predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.25339366515837103 0.6741841845034929 0.46901172529313234
