In [1]:
!pip install transformers

'pip' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings

warnings.filterwarnings('ignore')

In [2]:
#For logistic regression
from sklearn.linear_model import LogisticRegression

In [3]:
#For naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB

In [3]:
#for linear svc
from sklearn import svm

In [4]:
#read dataset into dataframe
df = pd.read_csv("FormattedTrainingDataset.csv", delimiter=',', header=None)

In [8]:
#instantiate DistilBERT model

d_model_class, d_tokenizer_class, d_pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
d_tokenizer = d_tokenizer_class.from_pretrained(d_pretrained_weights)
d_model = d_model_class.from_pretrained(d_pretrained_weights)

In [5]:
#instantiate BERT model
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

model = model_class.from_pretrained(pretrained_weights)

In [6]:
batch_1 = df[:3800]

In [7]:
batch_1[1].value_counts()

value     2182
policy     815
fact       786
Name: 1, dtype: int64

In [8]:
#tokenize words into values in dataframe for BERT
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [69]:
#tokenize words into values in dataframe for DistilBERT
tokenized = batch_1[0].apply((lambda x: d_tokenizer.encode(x, add_special_tokens=True)))

In [9]:
#pad each line in dataframe to a uniform length
max_len = 0
for i in tokenized.values:
  if len(i) > max_len:
    max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [10]:
np.array(padded).shape

(3783, 116)

In [11]:
#create attention mask of the same shape as padded dataframe
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape


(3783, 116)

In [12]:
#run data through BERT model
input_ids = torch.LongTensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [73]:
#run data through DistilBERT model
input_ids = torch.LongTensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_states = d_model(input_ids, attention_mask=attention_mask)

In [13]:
#create list of processed statements
features = last_hidden_states[0][:,0,:].numpy()

In [14]:
#create list of labels
labels = batch_1[1]

In [15]:
#necessary imports for stratified k-fold validation
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn import datasets

In [16]:
#instantiate variables for stratified train test split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=1, stratify=labels)

In [122]:
#instantiate variables for stratified k-fold validation
from sklearn.metrics import f1_score
scaler = preprocessing.MinMaxScaler()
features_scaled = scaler.fit_transform(features)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []
lst_f1_stratified = []



In [131]:
#find accuracy and f1 score from stratified k-fold validation
print(features.shape)
print(labels.shape)

for train_index, test_index in skf.split(features, labels):
    x_train_fold, x_test_fold = features_scaled[train_index], features_scaled[test_index]
    y_train_fold, y_test_fold = labels[train_index], labels[test_index]
    lr_clf.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))
    lst_f1_stratified.append(f1_score(y_test_fold, lr.predict(x_test_fold), average='weighted'))
   
# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

print('List of possible f1 scores:', lst_f1_stratified)
print('\nMaximum f1 score That can be obtained from this model is:',
      max(lst_f1_stratified)*100, '%')
print('\nMinimum f1 score:',
      min(lst_f1_stratified)*100, '%')
print('\nOverall f1 score:',
      mean(lst_f1_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_f1_stratified))

(1, 768)
(3783,)


ValueError: Found input variables with inconsistent numbers of samples: [1, 3783]

In [36]:
#grid search for best SVC
from sklearn.svm import SVC
parameters = {'C': np.linspace(0.0001, 100, 20), 'kernel': ['poly', 'rbf', 'sigmoid'], 'gamma': ['scale']}
grid_search = GridSearchCV(svm.SVC(), parameters)
grid_search.fit(train_features, train_labels)



print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
best scrores:  0.7835711801674243


In [22]:
#SVC score
svc_clf = svm.SVC(C= 10, gamma= 'scale', kernel= 'rbf')
svc_clf.fit(train_features, train_labels)

svc_predicted = svc_clf.predict(test_features)
from sklearn.metrics import classification_report
print(classification_report(test_labels, svc_predicted))

              precision    recall  f1-score   support

        fact       0.57      0.52      0.54       196
      policy       0.88      0.85      0.86       204
       value       0.80      0.84      0.82       546

    accuracy                           0.77       946
   macro avg       0.75      0.73      0.74       946
weighted avg       0.77      0.77      0.77       946



In [20]:
#linearSVC grid search
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(svm.LinearSVC(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)



best parameters:  {'C': 5.263252631578947}
best scrores:  0.7261215440792904


In [18]:
#linear SVC score
svc_clf = svm.SVC(C= 15.789557894736841)
svc_clf.fit(train_features, train_labels)

svc_predicted = svc_clf.predict(test_features)
from sklearn.metrics import classification_report
print(classification_report(test_labels, svc_predicted))

              precision    recall  f1-score   support

        fact       0.63      0.57      0.60       196
      policy       0.88      0.83      0.86       204
       value       0.80      0.85      0.82       546

    accuracy                           0.79       946
   macro avg       0.77      0.75      0.76       946
weighted avg       0.78      0.79      0.78       946



In [None]:
#print classification report from stratified k-fold validation
lr_predicted = lr.predict(test_features)
print(classification_report(test_labels, lr_predicted))

In [38]:
#Logistic regression grid search
parameters = {'C': np.linspace(0.0001, 100, 20), "multi_class": ['multinomial'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'penalty': ['l1', 'l2', 'elasticnet', 'none']}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}
best scrores:  0.7680608341406463


In [20]:
#Logistic regression score
lr_clf = LogisticRegression(C=10.526405263157894, multi_class='multinomial', solver='saga', penalty='l2')
lr_clf.fit(train_features, train_labels)


lr_predicted = lr_clf.predict(test_features)
print(classification_report(test_labels, lr_predicted))

              precision    recall  f1-score   support

        fact       0.61      0.55      0.58       196
      policy       0.84      0.85      0.85       204
       value       0.81      0.83      0.82       546

    accuracy                           0.78       946
   macro avg       0.75      0.75      0.75       946
weighted avg       0.77      0.78      0.78       946



In [106]:
#naive bayes score
nb_clf = GaussianNB()
nb_clf.fit(train_features, train_labels)
nb_predicted = nb_clf.predict(test_features)
nb_clf.score(test_features, test_labels)

0.642706131078224

In [108]:
#print classification report for all models
from sklearn.metrics import classification_report

print("Naive Bayes Score:")
print(classification_report(test_labels, nb_predicted))
print("Logistic regression Score:")
print(classification_report(test_labels, lr_predicted))
print("LinearSVC Score:")
print(classification_report(test_labels, svc_predicted))


Naive Bayes Score:
              precision    recall  f1-score   support

        fact       0.46      0.51      0.48       196
      policy       0.60      0.63      0.62       204
       value       0.74      0.69      0.72       546

    accuracy                           0.64       946
   macro avg       0.60      0.61      0.60       946
weighted avg       0.65      0.64      0.65       946

Logistic regression Score:
              precision    recall  f1-score   support

        fact       0.61      0.53      0.56       196
      policy       0.84      0.80      0.82       204
       value       0.80      0.85      0.82       546

    accuracy                           0.77       946
   macro avg       0.75      0.73      0.74       946
weighted avg       0.77      0.77      0.77       946

LinearSVC Score:
              precision    recall  f1-score   support

        fact       0.69      0.47      0.56       196
      policy       0.90      0.77      0.83       204
       value

In [44]:
domainTestdf = pd.read_csv("DomainSpecTest.csv", delimiter=',', header=None)

#tokenize words into values in dataframe for BERT
tokenized = domainTestdf[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

#pad each line in dataframe to a uniform length
max_len = 0
for i in tokenized.values:
  if len(i) > max_len:
    max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

#create attention mask of the same shape as padded dataframe
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

#run data through BERT model
input_ids = torch.LongTensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_states = model(input_ids, attention_mask=attention_mask)

#create list of processed statements
dom_spec_features = last_hidden_states[0][:,0,:].numpy()
dom_spec_labels = domainTestdf[1]

In [47]:
dom_spec_predicted = lr_clf.predict(dom_spec_features)
print(classification_report(dom_spec_labels, dom_spec_predicted))

              precision    recall  f1-score   support

        fact       0.84      0.42      0.56        64
      policy       0.90      0.99      0.94        87
       value       0.39      0.65      0.48        34

    accuracy                           0.73       185
   macro avg       0.71      0.69      0.66       185
weighted avg       0.78      0.73      0.73       185



In [59]:
#print confusion report
from sklearn.metrics import confusion_matrix

confusion_matrix(test_labels, svc_predicted)

array([[110,   2,  84],
       [  3, 169,  32],
       [ 56,  14, 476]], dtype=int64)

In [68]:
#random guess score
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, features, labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.577 (+/- 0.00)


In [120]:
import time
time_test = "This is a test to show how long it takes to process a sentence"
start_time = time.time()
untokenized_input = pd.DataFrame(["This is a test to show how long it takes to process a sentence"], dtype="string")
#tokenize words into values in dataframe for BERT
tokenized = untokenized_input[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
#pad each line in dataframe to a uniform length
max_len = 0
for i in tokenized.values:
  if len(i) > max_len:
    max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
#run data through DistilBERT model
input_ids = torch.LongTensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_states = model(input_ids, attention_mask=attention_mask)

features = last_hidden_states[0][:,0,:].numpy()

lr_clf.predict(features)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.7481307983398438 seconds ---


In [17]:
import pickle

#with open("lr_model.sav", "wb") as handle:
   # pickle.dump(lr_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open("SVC_model.sav", "wb") as handle:
    pickle.dump(svc_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [90]:
untokenized_features = batch_1[0]
train_untokenized, test_untokenized, train_labels, test_labels = train_test_split(untokenized_features, labels, random_state = 1, stratify=labels)


In [91]:
list_of_statements = []
for statement in enumerate(test_untokenized):
    list_of_statements.append(statement[1])
print(list_of_statements)



In [92]:
list_of_true_labels = []
for label in enumerate(test_labels):
    list_of_true_labels.append(label[1])
print(list_of_true_labels)

['value', 'value', 'value', 'value', 'policy', 'value', 'value', 'policy', 'value', 'value', 'fact', 'value', 'value', 'policy', 'value', 'value', 'value', 'value', 'value', 'fact', 'policy', 'value', 'fact', 'value', 'value', 'value', 'fact', 'fact', 'fact', 'value', 'policy', 'value', 'policy', 'policy', 'value', 'value', 'policy', 'fact', 'value', 'value', 'fact', 'fact', 'value', 'value', 'value', 'policy', 'value', 'fact', 'value', 'value', 'value', 'value', 'fact', 'policy', 'value', 'policy', 'value', 'value', 'value', 'fact', 'value', 'value', 'value', 'policy', 'value', 'value', 'fact', 'value', 'value', 'value', 'policy', 'fact', 'value', 'value', 'value', 'value', 'fact', 'value', 'value', 'policy', 'value', 'value', 'value', 'fact', 'value', 'policy', 'value', 'value', 'fact', 'value', 'fact', 'fact', 'policy', 'value', 'value', 'policy', 'fact', 'policy', 'value', 'value', 'policy', 'value', 'policy', 'value', 'value', 'value', 'value', 'policy', 'value', 'fact', 'fact', '

In [93]:
list_of_predicted_labels = []
for label in enumerate(svc_predicted):
    list_of_predicted_labels.append(label[1])
print(list_of_predicted_labels)

['value', 'value', 'value', 'value', 'policy', 'value', 'value', 'policy', 'value', 'value', 'fact', 'value', 'value', 'policy', 'value', 'value', 'value', 'value', 'value', 'value', 'policy', 'value', 'fact', 'value', 'fact', 'value', 'fact', 'fact', 'value', 'value', 'policy', 'fact', 'policy', 'policy', 'value', 'value', 'policy', 'fact', 'policy', 'value', 'fact', 'fact', 'value', 'fact', 'fact', 'policy', 'fact', 'value', 'value', 'value', 'value', 'value', 'fact', 'policy', 'value', 'policy', 'value', 'value', 'value', 'fact', 'fact', 'fact', 'value', 'value', 'policy', 'value', 'fact', 'value', 'value', 'value', 'policy', 'fact', 'value', 'fact', 'fact', 'value', 'fact', 'value', 'value', 'policy', 'value', 'value', 'value', 'value', 'value', 'policy', 'value', 'value', 'fact', 'fact', 'fact', 'fact', 'policy', 'value', 'value', 'policy', 'fact', 'policy', 'value', 'value', 'policy', 'value', 'policy', 'value', 'value', 'value', 'value', 'value', 'fact', 'fact', 'value', 'policy

In [95]:
list_of_statements[10]

'After my marriage, the Dept of Ed (Higher Ed) offset two years of joint tax returns, private collection agencies constantly called.'

In [97]:
error_analysis_list = []
for index, label in enumerate(test_labels):
    error_analysis_list.append('"' + list_of_statements[index] + '",' + list_of_true_labels[index] + ',' + list_of_predicted_labels[index])
print(error_analysis_list)



In [98]:
f = open('error_analysis_quotes.txt', 'a')

for statements in error_analysis_list:
    f.write(statements + '\n')
f.close

<function TextIOWrapper.close()>

In [80]:
#instantiate variables for stratified train test split
train_features, test_features, train_labels, test_labels train_statements, test_statements = train_test_split(features, labels, statements, stratify=labels)

SyntaxError: invalid syntax (<ipython-input-80-0629c2a967c1>, line 2)