In [3]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
import math
import random
import gensim
import tensorflow as tf
import tensorflow_text as text

In [4]:
# Load the sensitivity dataset and keep only the "Document" and "Sensitivity" columns.
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]
# Convert to a 2-D array so rows can be indexed positionally:
# row[0] is the "Document" value, row[1] the "Sensitivity" value.
data = np.array(data)

# Load the saved Keras model that later cells call `.predict` on to obtain formality scores.
# Forward slashes are used (matching the dataset path above) so the path resolves on
# Linux/macOS as well as Windows; the previous backslash-separated path only worked on Windows.
formalities_model = tf.keras.models.load_model("../deep_learning/bert_small_e20_b32")

In [None]:
# Tokenise every document and cut its token list into `chunk_size` consecutive,
# near-equal slices. When a document has fewer tokens than slices, at least one
# slice is empty.
chunk_size = 17
equal_parts = []
for row in data:
    tokens = gensim.utils.simple_preprocess(row[0])
    n_tokens = len(tokens)
    # Slice k spans tokens[bounds[k]:bounds[k + 1]].
    bounds = [(k * n_tokens) // chunk_size for k in range(chunk_size + 1)]
    equal_parts.append([tokens[lo:hi] for lo, hi in zip(bounds, bounds[1:])])

# Score each slice with the model: one list of scores per document.
corpus_formality = []
total_documents = len(data)
for done, slices in enumerate(equal_parts, start=1):
    scores = []
    for words in slices:
        joined = " ".join(words)
        # predict() receives a one-element batch; [0][0] takes the first value
        # of the first (only) prediction row.
        scores.append(formalities_model.predict([joined])[0][0])
    corpus_formality.append(scores)
    percent_done = round((done / total_documents) * 100, 2)
    print(str(done) + " Documents Processed..." + str(percent_done) + "%")

In [None]:
# Write the per-document score lists to CSV (one row per entry of corpus_formality;
# the default integer index is written as the first column).
df = pd.DataFrame(data=corpus_formality)
df.to_csv(path_or_buf="gensim_17_formalities.csv")

In [19]:
############################################################
## DISCARD DOCUMENTS THAT CANNOT BE SPLIT INTO 100 CHUNKS ##
############################################################

# Tokenise each document once and slice its tokens into `chunk_size` consecutive,
# near-equal chunks. A document with fewer tokens than chunks yields at least one
# empty chunk; such documents are dropped. The chunks of the kept documents are
# stored directly, so kept documents are not tokenised and chunked a second time.
chunk_size = 100

discard_count = 0        # documents dropped because at least one chunk was empty
non_discard_count = 0    # documents kept
discarded_dataset = []   # NOTE: despite the name, this holds the rows that are KEPT
equal_parts = []         # chunk lists of the kept rows, aligned with discarded_dataset
for document in data:
    preprocessed_list = gensim.utils.simple_preprocess(document[0])
    n_tokens = len(preprocessed_list)
    chunks = [preprocessed_list[(i*n_tokens)//chunk_size:((i+1)*n_tokens)//chunk_size] for i in range(chunk_size)]

    # Keep the document only if every chunk contains at least one token.
    if all(len(chunk) >= 1 for chunk in chunks):
        discarded_dataset.append(document)
        equal_parts.append(chunks)
        non_discard_count += 1
    else:
        discard_count += 1

# Sanity check: one chunk list per kept document.
assert len(equal_parts) == len(discarded_dataset) == non_discard_count

# Get the formalities of all of the documents in the filtered dataset:
# one list of `chunk_size` scores per kept document.
corpus_formality = []
total_documents = len(discarded_dataset)
for i, elt in enumerate(equal_parts, start=1):
    document_formality = []
    for l in elt:
        reconstructed = " ".join(l)
        # predict() receives a one-element batch; [0][0] takes the first value
        # of the first (only) prediction row.
        formality = formalities_model.predict([reconstructed])[0][0]
        document_formality.append(formality)
    corpus_formality.append(document_formality)
    print(str(i) + " Documents Processed..." + str( round( (i/total_documents) * 100, 2)) + "%")

1 Documents Processed...0.03%
2 Documents Processed...0.05%
3 Documents Processed...0.08%
4 Documents Processed...0.11%
5 Documents Processed...0.14%
6 Documents Processed...0.16%
7 Documents Processed...0.19%
8 Documents Processed...0.22%
9 Documents Processed...0.25%
10 Documents Processed...0.27%
11 Documents Processed...0.3%
12 Documents Processed...0.33%
13 Documents Processed...0.35%
14 Documents Processed...0.38%
15 Documents Processed...0.41%
16 Documents Processed...0.44%
17 Documents Processed...0.46%
18 Documents Processed...0.49%
19 Documents Processed...0.52%
20 Documents Processed...0.55%
21 Documents Processed...0.57%
22 Documents Processed...0.6%
23 Documents Processed...0.63%
24 Documents Processed...0.65%
25 Documents Processed...0.68%
26 Documents Processed...0.71%
27 Documents Processed...0.74%
28 Documents Processed...0.76%
29 Documents Processed...0.79%
30 Documents Processed...0.82%
31 Documents Processed...0.85%
32 Documents Processed...0.87%
33 Documents Proces

In [None]:
# Write the per-document score lists to CSV (one row per entry of corpus_formality;
# the default integer index is written as the first column).
df = pd.DataFrame(data=corpus_formality)
df.to_csv(path_or_buf="gensim_less_than_100_discarded_formalities.csv")