In [1]:
import csv
import json

import numpy as np
import pandas as pd
from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification

import torch

from tqdm import tqdm, trange
from dataset_loader import load

from eli5 import show_weights
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_name = 'NarrativeQA'
dataset = load(dataset_name, 'datasets/')

parsing data: 100%|██████████| 758/758 [00:00<00:00, 108282.50it/s]


In [3]:
train_df  = pd.DataFrame.from_dict(dataset['train'])
train_df.to_csv("squad_train.csv")
dev_df = pd.DataFrame.from_dict(dataset['test'])
dev_df.to_csv("squad_test.csv")

In [4]:
train_df.head()

Unnamed: 0,text,label
0,With a gun.,0
1,Drexl was killed by Clarence Worley.,1
2,Voyager 2 Disk,0
3,The disk used by Starman to understand English...,1
4,After the farmer died,0


In [5]:
import stanza
from collections import defaultdict

def deprel_func(sentences, nlp):
    doc = nlp(sentences)
    deprel_dist = defaultdict(int)
    for sent in doc.sentences:
        for word in sent.words:
            deprel_dist[word.deprel] += 1
    return deprel_dist

nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

2023-04-18 15:50:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 200kB [00:00, 3.67MB/s]                    
2023-04-18 15:50:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2023-04-18 15:51:00 INFO: Using device: cuda
2023-04-18 15:51:00 INFO: Loading: tokenize
2023-04-18 15:51:02 INFO: Loading: pos
2023-04-18 15:51:03 INFO: Loading: lemma
2023-04-18 15:51:03 INFO: Loading: depparse
2023-04-18 15:51:03 INFO: Done loading processors!


In [6]:
deprel_dist = [deprel_func(sent, nlp) for sent in tqdm(train_df['text'])]
keys = set().union(*(d.keys() for d in deprel_dist))

100%|██████████| 1214/1214 [04:12<00:00,  4.81it/s]


In [7]:
deprel_dist_dev = [deprel_func(sent, nlp) for sent in tqdm(dev_df['text'])]
dev_keys = set().union(*(d.keys() for d in deprel_dist))

100%|██████████| 302/302 [01:05<00:00,  4.58it/s]


In [8]:
all_keys = keys.union(dev_keys)

train_list = [{k: d.get(k, 0) for k in all_keys} for d in deprel_dist]
dev_list = [{k: d.get(k, 0) for k in all_keys} for d in deprel_dist_dev]

In [9]:
train_syn = pd.DataFrame(train_list)
train_mean = train_syn.mean(axis=0)
train_std = train_syn.std(axis=0)
dev_syn = pd.DataFrame(dev_list)

train_syn = (train_syn - train_mean) / train_std
dev_syn = (dev_syn - train_mean) / train_std

### Dummy Classifier

In [10]:
mf_dummy_clf = DummyClassifier(strategy="most_frequent")
mf_dummy_clf.fit(train_df.text, train_df.label)
mf_dummy_test_preds = mf_dummy_clf.predict(dev_df.text)
precision_recall_fscore_support(dev_df.label, mf_dummy_test_preds, average='binary')

  _warn_prf(average, modifier, msg_start, len(result))


(0.0, 0.0, 0.0, None)

In [11]:
random_dummy_clf = DummyClassifier(strategy="stratified")
random_dummy_clf.fit(train_df.text, train_df.label)
random_dummy_test_preds = random_dummy_clf.predict(dev_df.text)
precision_recall_fscore_support(dev_df.label, random_dummy_test_preds, average='binary')

(0.5144927536231884, 0.47019867549668876, 0.4913494809688581, None)

In [12]:
fpr, tpr, threshold = roc_curve(np.array(dev_df.label) , random_dummy_test_preds)
auc(fpr, tpr)

0.5132450331125827

### Sklearn

In [13]:
# Convert text to features using a simple tf-idf
# min_df = is the minimum occurrences for a word to a feature
vectorizer = TfidfVectorizer(min_df=10)
X_train = vectorizer.fit_transform(train_df.text.values.astype('U'))

# Train a Logisitic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, train_df.label)

# Test the classifier by converting test data to features
X_test = vectorizer.transform(dev_df.text.values.astype('U'))
y_pred = clf.predict(X_test)
y_true = dev_df.label

# We'll using binary F1 to test whether we can detect sarcastic comments
precision_recall_fscore_support(y_true, y_pred, average='binary')


(0.8300653594771242, 0.8410596026490066, 0.8355263157894738, None)

In [14]:
fpr, tpr, threshold = roc_curve(np.array(y_true) , y_pred)
auc(fpr, tpr)

0.8344370860927153

In [15]:
pred_df = pd.DataFrame({'text': dev_df.text, 'label': dev_df.label, 'predict': y_pred})
pd.set_option('display.max_colwidth', None)
not_match_df = pred_df[pred_df.label != pred_df.predict]
not_match_df

Unnamed: 0,text,label,predict
14,In a mansion in Ville-d'Avray.,0,1
15,Mr. and Mrs. Vervelle live in Ville-d'Avray.,1,0
26,to take her and Dain to the mouth of the river where they would find a boat to help them escape,0,1
34,"The political question of if there is a good number or too many Jews hasn't been addressed, even in civilized countries.",0,1
42,In a car bomb meant for Michael,0,1
45,Ernest eventually becomes an author of controversial literature.,1,0
51,"Bumblebee, the Autobot scout, communicates through his car radio since he is mute.",1,0
52,By flirting with Bill.,0,1
64,He finds out that Francis is the son of a mafia boss.,0,1
69,Bateman is a wealthy New York investment banker.,1,0


In [16]:
vocab = vectorizer.vocabulary_
id_to_vocab = {id_: word for word, id_ in vocab.items()}

show_weights(clf,  vec=vectorizer, feature_names=id_to_vocab, top=(10, 10))

Weight?,Feature
+3.314,is
+2.871,context
+2.731,after
+2.647,in
+2.536,to
+2.455,when
+2.301,as
+2.276,with
+1.874,of
+1.782,from


### Sklearn with Syn

In [17]:
vectorizer = TfidfVectorizer(min_df=10)
X_train = vectorizer.fit_transform(train_df.text.values.astype('U'))
X_train = np.concatenate((X_train.toarray(), train_syn.to_numpy()), axis=1)

# Train a Logisitic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, train_df.label)

# Test the classifier by converting test data to features
X_test = vectorizer.transform(dev_df.text.values.astype('U'))
X_test = np.concatenate((X_test.toarray(), dev_syn.to_numpy()), axis=1)
y_pred = clf.predict(X_test)
y_true = dev_df.label

# We'll using binary F1 to test whether we can detect sarcastic comments
precision_recall_fscore_support(y_true, y_pred, average='binary')

(0.9383561643835616, 0.9072847682119205, 0.9225589225589225, None)

In [18]:
fpr, tpr, threshold = roc_curve(np.array(y_true) , y_pred)
auc(fpr, tpr)

0.9238410596026491

In [19]:
pred_df = pd.DataFrame({'text': dev_df.text, 'label': dev_df.label, 'predict': y_pred})
pd.set_option('display.max_colwidth', None)
not_match_df = pred_df[pred_df.label != pred_df.predict]
not_match_df

Unnamed: 0,text,label,predict
14,In a mansion in Ville-d'Avray.,0,1
26,to take her and Dain to the mouth of the river where they would find a boat to help them escape,0,1
34,"The political question of if there is a good number or too many Jews hasn't been addressed, even in civilized countries.",0,1
41,Bobo burns Lilly's hand with a cigar.,1,0
45,Ernest eventually becomes an author of controversial literature.,1,0
64,He finds out that Francis is the son of a mafia boss.,0,1
74,"Socrates reasoned that Crito and his friends should not worry about public opinion, but listen to only wise and expert advice.",0,1
81,"The statement ""don't buy from Jews"" causes much anxiety among Jewish people in the given context.",1,0
88,"A Malayan prince who wanted to talk to Almayer about trading, but ended up falling in love with Almayer's daughter Nina.",0,1
101,Claude marries Christine Hallegrain.,1,0


In [20]:
vocab = vectorizer.vocabulary_
id_to_vocab = {id_: word for word, id_ in vocab.items()}
for i, k in enumerate(all_keys, len(id_to_vocab)):
    id_to_vocab[i] = k

show_weights(clf,  vec=vectorizer, feature_names=id_to_vocab, top=(10, 10))

Weight?,Feature
+3.603,nsubj
+2.767,<BIAS>
+1.853,punct
+1.692,is
+1.382,case
+1.353,obl
+1.177,context
+0.909,obj
+0.901,socrates
+0.838,in


### Only Syn

In [21]:
vectorizer = TfidfVectorizer(min_df=10)
X_train = train_syn.to_numpy()

# Train a Logisitic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, train_df.label)

# Test the classifier by converting test data to features
X_test = dev_syn.to_numpy()
y_pred = clf.predict(X_test)
y_true = dev_df.label

# We'll using binary F1 to test whether we can detect sarcastic comments
precision_recall_fscore_support(y_true, y_pred, average='binary')

(0.9133333333333333, 0.9072847682119205, 0.9102990033222591, None)

In [22]:
fpr, tpr, threshold = roc_curve(np.array(y_true) , y_pred)
auc(fpr, tpr)

0.9105960264900662

In [23]:
pred_df = pd.DataFrame({'text': dev_df.text, 'label': dev_df.label, 'predict': y_pred})
pd.set_option('display.max_colwidth', None)
not_match_df = pred_df[pred_df.label != pred_df.predict]
not_match_df

Unnamed: 0,text,label,predict
14,In a mansion in Ville-d'Avray.,0,1
26,to take her and Dain to the mouth of the river where they would find a boat to help them escape,0,1
34,"The political question of if there is a good number or too many Jews hasn't been addressed, even in civilized countries.",0,1
45,Ernest eventually becomes an author of controversial literature.,1,0
64,He finds out that Francis is the son of a mafia boss.,0,1
73,Cathleen and Nora receive clothing from the drowned corpse that confirms it is their brother Michael.,1,0
74,"Socrates reasoned that Crito and his friends should not worry about public opinion, but listen to only wise and expert advice.",0,1
81,"The statement ""don't buy from Jews"" causes much anxiety among Jewish people in the given context.",1,0
88,"A Malayan prince who wanted to talk to Almayer about trading, but ended up falling in love with Almayer's daughter Nina.",0,1
101,Claude marries Christine Hallegrain.,1,0


In [24]:
id_to_vocab = {}
for i, k in enumerate(all_keys):
    id_to_vocab[i] = k

show_weights(clf,  vec=vectorizer, feature_names=id_to_vocab, top=(10, 10))

Weight?,Feature
+3.260,nsubj
+3.200,<BIAS>
+2.209,punct
+1.598,case
+1.270,obl
+1.039,det
+0.986,flat
+0.771,aux:pass
+0.714,obj
+0.693,cop
