In [22]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from imblearn.over_sampling import RandomOverSampler
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV

# Set random seed for reproducibility.
# This seeds NumPy's global RNG; the estimators and splitters below that take a
# `random_state` argument are seeded separately where they are constructed.
np.random.seed(500)

In [11]:
# Read the sensitivity dataset, keeping only the columns this notebook works with
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")[
    ["Filename", "Date", "Sensitivity", "Document"]
]

# Hold out 20% of the documents for testing; the fixed random_state keeps the split repeatable
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data["Document"],
    data["Sensitivity"],
    test_size=0.2,
    random_state=5,
)

In [3]:
# Convert the raw GloVe text file to word2vec text format so that
# KeyedVectors.load_word2vec_format can read it.
# `glove2word2vec` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
#
# The path uses forward slashes: the previous "..\deep_learning\..." literal relied on
# the invalid escape sequences "\d" and "\g" being passed through unchanged and only
# resolved on Windows. Forward slashes work on both Windows and POSIX.
glove_path = "../deep_learning/glove.6B.300d.txt"
word2vec_output_file = "glove.6B.300d.word2vec"

# Left as the last expression so the cell still displays the returned
# (vocabulary size, dimensions) tuple.
glove2word2vec(glove_path, word2vec_output_file)

  glove2word2vec(glove_path, word2vec_output_file)


(400000, 300)

In [4]:
# Load the converted GloVe vectors; the file is in word2vec *text* format, hence binary=False
word2vec_output_file = "glove.6B.300d.word2vec"
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [12]:
class Word2VecVectorizer:
  """Embed documents as the mean of their in-vocabulary word vectors.

  Each document is split on whitespace, every token that the embedding model
  knows is looked up, and the document is represented by the element-wise
  mean of those vectors. Documents with no known token become an all-zero row.

  Parameters
  ----------
  model
    Embedding lookup exposing ``get_vector(word)`` that raises ``KeyError``
    for unknown words (e.g. a gensim ``KeyedVectors``). If it also exposes a
    ``vector_size`` attribute, that is used as the embedding dimensionality.
  """

  def __init__(self, model):
    print("Loading in word vectors...")
    self.word_vectors = model
    print("Finished loading in word vectors")

  def fit(self, data, y=None):
    """No-op: the embeddings are pre-trained, so there is nothing to learn.

    Returns ``self`` so that calls can be chained, as scikit-learn
    transformers do. ``y`` is accepted and ignored.
    """
    return self

  def _embedding_dim(self):
    """Return the dimensionality of the word vectors."""
    size = getattr(self.word_vectors, "vector_size", None)
    if size is not None:
      return int(size)
    # Fallback for lookups that do not advertise their size: probe a common
    # English word (raises KeyError if the vocabulary does not contain it).
    return self.word_vectors.get_vector('king').shape[0]

  def transform(self, data):
    """Return a ``(len(data), D)`` array of mean word vectors.

    Parameters
    ----------
    data
      Sized iterable of strings (one document per element).

    Returns
    -------
    numpy.ndarray
      Row ``i`` is the mean vector of document ``i``, or zeros when none of
      its tokens are in the vocabulary.
    """
    self.D = self._embedding_dim()

    X = np.zeros((len(data), self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      vecs = []
      for word in sentence.split():
        try:
          # throws KeyError if word not found
          vecs.append(self.word_vectors.get_vector(word))
        except KeyError:
          # Out-of-vocabulary tokens are deliberately skipped.
          continue
      if vecs:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data, y=None):
    """Equivalent to ``fit(data).transform(data)``; ``y`` is ignored."""
    return self.fit(data, y).transform(data)

In [13]:
# Embed both splits with the pre-trained word vectors.
# NOTE: this cell overwrites train_x / test_x with numeric arrays, so re-running it
# requires re-running the train/test split cell first.
vectorizer = Word2VecVectorizer(model)
train_x = vectorizer.fit_transform(train_x)
# The held-out set is only transformed: a vectorizer must never be fitted on test
# data, even though fitting is currently a no-op for pre-trained embeddings.
test_x = vectorizer.transform(test_x)

Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 4 / 3040
Numer of samples with no words found: 0 / 761


In [21]:
# Baseline model: randomly oversample the training data, then fit Gaussian Naive Bayes
imba_pipeline = make_pipeline(
    RandomOverSampler(random_state=42),
    GaussianNB(),
)

# 5-fold splitter (reused by the grid search further down) and a dictionary of metric scorers
kfold = model_selection.KFold(n_splits=5)
scoring = dict(
    f2_score=make_scorer(fbeta_score, beta=2.0),
    precision=make_scorer(precision_score),
    bal_acc=make_scorer(balanced_accuracy_score),
)

# Fit on the training split and predict the held-out split
imba_pipeline.fit(train_x, train_y)
y_test_predict = imba_pipeline.predict(test_x)

# Report precision, balanced accuracy and F2 on the held-out split
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
bac = balanced_accuracy_score(test_y, y_test_predict)
precision = precision_score(test_y, y_test_predict)
print(precision, bac, f2)

0.2789115646258503 0.6386248365179112 0.3919694072657744


In [24]:
# Random oversampling followed by a random forest, tuned with a grid search
imba_pipeline = make_pipeline(
    RandomOverSampler(random_state=42),
    RandomForestClassifier(random_state=5),
)

params = {
    'n_estimators': [50, 100, 200, 400, 800],
    'max_depth': [4, 6, 10, 12, 50, 100, 200],
    'random_state': [5]
}

# make_pipeline names each step after its lower-cased class, so the forest's
# hyper-parameters are addressed with the 'randomforestclassifier__' prefix.
new_params = {'randomforestclassifier__' + key: values for key, values in params.items()}

# Candidates are ranked by recall; `kfold` is the splitter defined in the baseline cell.
grid_imba = GridSearchCV(
    imba_pipeline,
    param_grid=new_params,
    cv=kfold,
    scoring='recall',
    return_train_score=True,
)
grid_imba.fit(train_x, train_y)

# Predict through the refitted best pipeline rather than pulling the classifier out of
# `named_steps`: the sampler is only applied while fitting, so predictions are the same
# today, but going through the pipeline keeps any future preprocessing step in the loop.
y_test_predict = grid_imba.predict(test_x)

# Report precision, balanced accuracy and F2 on the held-out split
precision = precision_score(test_y, y_test_predict)
bac = balanced_accuracy_score(test_y, y_test_predict)
f2 = fbeta_score(test_y, y_test_predict, beta=2.0)
print(precision, bac, f2)

0.2573099415204678 0.6388401543908897 0.4021937842778794
