In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline, make_pipeline
import tensorflow as tf
import tensorflow_text as text
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Set random seed for reproducability
np.random.seed(500)

In [2]:
string_data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
string_data = string_data[["Filename","Date","Document"]]
formalities_data = pd.read_csv("all_formalities_preprocessed.csv")

In [3]:
x = pd.concat([string_data,formalities_data],axis=1)
y = formalities_data["Sensitivity"]

In [4]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
document_feature = Tfidf_vect.fit_transform(string_data["Document"])

In [5]:
text_feature_df = pd.DataFrame(document_feature.todense(), columns = Tfidf_vect.get_feature_names_out())
text_feature_df

Unnamed: 0,00,000,001,002,003,004,005,007,01,02,...,zambia,zanu,zealand,zero,zi,zimbabwe,zimbabwean,zimbabweans,zone,zones
0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.007028,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.019604,0.000000,0.000000,0.0,0.0,0.0,0.0,0.014141,0.017322,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3797,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3798,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3799,0.0,0.000000,0.028465,0.058807,0.0,0.0,0.0,0.0,0.021513,0.013176,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
x = pd.concat([text_feature_df,formalities_data[formalities_data.columns[:-1]]],axis=1)

In [7]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(x,y,test_size=0.2,random_state=5)
train_x = np.array(train_x)
test_x = np.array(test_x)

In [8]:
imba_pipeline = make_pipeline(RandomUnderSampler(random_state=42), MinMaxScaler(), SVC())

params = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

kfold = model_selection.KFold(n_splits=5)

new_params = {'svc__' + key: params[key] for key in params}
grid_imba = GridSearchCV(imba_pipeline, param_grid=new_params, cv=kfold, scoring='balanced_accuracy', return_train_score=True)

grid_imba.fit(train_x, train_y)
y_test_predict = grid_imba.best_estimator_.named_steps['svc'].predict(test_x)

precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision,bac,f2)

0.0 0.5 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print(grid_imba.best_params_)

{'svc__C': 100, 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}
