# Rough outline:
## Read in data from CSV
## Train a bag-of-words text classifier
## Explain its predictions with Anchor

In [1]:
%load_ext autoreload
%autoreload 2
import os
import os.path
import sys
import time

import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from anchor import anchor_text

In [38]:
# Load the spaCy 'en_core_web_sm' pipeline; it is passed to AnchorText when the
# explainer is constructed further down.
nlp = spacy.load('en_core_web_sm')

In [11]:
def clean(data: str) -> str:
    """Return ``data`` with leading and trailing whitespace removed."""
    stripped = data.strip()
    return stripped

In [44]:
# def numberify(sent):
#     match sent:
#         case "negative":
#             return -1
#         case "positive":
#             return 1
#         case _:
#             return 0

def numberify(sent):
    """Encode a sentiment string as an integer class label.

    "negative" maps to 0 and "positive" maps to 1; any other value maps to
    the sentinel -1.
    """
    if sent == "negative":
        return 0
    if sent == "positive":
        return 1
    return -1

In [45]:
def load_data(fname="Tweets.csv"):
    """Load the tweet CSV and return ``(texts, labels)`` for binary sentiment.

    Rows whose ``sentiment`` is "neutral" are discarded, as are rows with a
    missing ``selected_text`` or ``sentiment`` value.

    Parameters
    ----------
    fname : str
        Path to a CSV file with ``selected_text`` and ``sentiment`` columns.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The ``selected_text`` column and the ``sentiment`` column encoded
        with ``numberify``; both keep the index of the filtered frame.
    """
    df = pd.read_csv(fname)
    # A missing text cannot be vectorized downstream, and a missing sentiment
    # would be silently encoded as the -1 sentinel, so drop such rows up front.
    df = df.dropna(subset=['selected_text', 'sentiment'])
    df = df[df['sentiment'] != "neutral"]
    return df['selected_text'], df['sentiment'].apply(numberify)

In [28]:
data, labels = load_data()
# Hold out 20% of the rows; the split is seeded so it is identical on every run.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data, labels, test_size=.2, random_state=42)

In [46]:
# Reuses `data` / `labels` loaded in the previous cell.
# Hold out 20% for testing, then 10% of the remainder for validation. Both
# splits are seeded so the partitions are reproducible.
train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
    data, labels, test_size=.2, random_state=42)
train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
    train, train_labels, test_size=.1, random_state=42)
# Convert the label Series to plain NumPy arrays.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
val_labels = np.array(val_labels)

In [31]:
# Inspect the training texts left after the test and validation hold-outs.
train

590                  Unfortunately
10156       very very nervous  bah
19490               What a burden.
15658                         best
16284                       i wish
                   ...            
18112                         Good
5689     oh my... now im offended.
27019                 successfully
7037                        yummmm
18464                       afraid
Name: selected_text, Length: 11781, dtype: object

In [32]:
# Inspect the encoded training labels.
# NOTE(review): the saved output shows -1/1 values, while the current numberify
# maps negative -> 0 and positive -> 1; the output appears to predate that
# change -- re-run this cell to refresh it.
train_labels

array([-1, -1, -1, ...,  1,  1, -1])

In [79]:
# Bag-of-words features: the vocabulary (at most 10000 terms) is learned from
# the training texts only and then reused for the validation and test texts.
vectorizer = CountVectorizer(min_df=1, max_features=10000)
train_vectors = vectorizer.fit_transform(train)
val_vectors = vectorizer.transform(val)
test_vectors = vectorizer.transform(test)

In [80]:
# Fit a logistic-regression classifier on the bag-of-words counts and report
# its accuracy on the validation split.
c = sklearn.linear_model.LogisticRegression()
# c = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)
c.fit(train_vectors, train_labels)
preds = c.predict(val_vectors)
# sklearn.metrics must be imported explicitly: `import sklearn` alone does not
# guarantee that the submodule is loaded.
print('Val accuracy', sklearn.metrics.accuracy_score(val_labels, preds))


def predict_lr(texts):
    """Predict class labels for raw texts with the fitted classifier ``c``.

    ``texts`` is an iterable of strings; it is vectorized with the fitted
    ``vectorizer`` before prediction. Returns whatever ``c.predict`` returns
    for those rows (one label per input text).
    """
    return c.predict(vectorizer.transform(texts))

Val accuracy 0.917494270435447


In [81]:
# Anchor explainer for text. The class-name order lines up with the integer
# labels from numberify (0 -> 'negative', 1 -> 'positive'), because later cells
# index explainer.class_names with the model's predicted label.
# NOTE(review): judging by the saved outputs, use_unk_distribution=True perturbs
# the text by replacing words with "UNK", while False substitutes generated
# words (the "##" word pieces) -- confirm against the anchor documentation.
# explainer = anchor_text.AnchorText(nlp, ['negative', 'positive'], use_unk_distribution=True)
explainer = anchor_text.AnchorText(nlp, ['negative', 'positive'], use_unk_distribution=False)

In [62]:
np.random.seed(1)
text = 'I thought the most recent film I saw was decent .'
# Run the classifier once and reuse the label for both class names.
pred_label = predict_lr([text])[0]
pred = explainer.class_names[pred_label]
# Labels are binary (0/1), so 1 - label selects the opposite class.
alternative = explainer.class_names[1 - pred_label]
print('Prediction: %s' % pred)
# NOTE(review): the saved run of this cell ended in a KeyboardInterrupt, so this
# call can be slow; threshold=0.95 is presumably the target anchor precision.
exp = explainer.explain_instance(text, predict_lr, threshold=0.95)

Prediction: negative


KeyboardInterrupt: 

In [59]:
# Report the anchor stored in `exp`: the rule, its precision, and perturbed
# examples on which the model agrees / disagrees with the original prediction.
anchor_rule = ' AND '.join(exp.names())
print('Anchor: %s' % anchor_rule)
print('Precision: %.2f' % exp.precision())
print()
print('Examples where anchor applies and model predicts %s:' % pred)
print()
agreeing = [example[0] for example in exp.examples(only_same_prediction=True)]
print('\n'.join(agreeing))
print()
print('Examples where anchor applies and model predicts %s:' % alternative)
print()
disagreeing = [
    example[0]
    for example in exp.examples(partial_index=0, only_different_prediction=True)
]
print('\n'.join(disagreeing))

Anchor: saw
Precision: 1.00

Examples where anchor applies and model predicts negative:

I thought the most recent film I saw was UNK UNK
I UNK UNK most UNK film I saw was UNK UNK
I thought the UNK recent UNK I saw was UNK .
UNK thought the UNK UNK film UNK saw was UNK .
UNK thought UNK UNK UNK film UNK saw UNK UNK UNK
I UNK UNK UNK UNK UNK I saw was UNK UNK
I thought the most recent film I saw UNK decent UNK
I UNK UNK most recent UNK UNK saw was UNK UNK
UNK UNK UNK most recent film I saw was decent .
I thought UNK UNK UNK film I saw UNK UNK .

Examples where anchor applies and model predicts positive:




In [82]:
np.random.seed(1)
text = 'This is a long book, but ok and dense.'
# text = 'I thought the most recent film I saw was decent .'
# Run the classifier once and reuse the label for both class names.
pred_label = predict_lr([text])[0]
pred = explainer.class_names[pred_label]
# Labels are binary (0/1), so 1 - label selects the opposite class.
alternative = explainer.class_names[1 - pred_label]
print('Prediction: %s' % pred)
# perf_counter is a monotonic clock meant for measuring elapsed time.
b = time.perf_counter()
exp = explainer.explain_instance(text, predict_lr, threshold=0.95, verbose=False, onepass=True)
print('Time: %s' % (time.perf_counter() - b))

Prediction: negative
Time: 6.406022310256958


In [83]:
# Report the anchor stored in `exp`: the rule, its precision, and perturbed
# examples on which the model keeps / flips its prediction.
rule_terms = exp.names()
print('Anchor: %s' % (' AND '.join(rule_terms)))
anchor_precision = exp.precision()
print('Precision: %.2f' % anchor_precision)
print()
print('Examples where anchor applies and model predicts %s:' % pred)
print()
kept = exp.examples(only_same_prediction=True)
print('\n'.join([row[0] for row in kept]))
print()
print('Examples where anchor applies and model predicts %s:' % alternative)
print()
flipped = exp.examples(only_different_prediction=True)
print('\n'.join([row[0] for row in flipped]))

Anchor: long
Precision: 0.96

Examples where anchor applies and model predicts negative:

place is denoted long beach , f 6 post vowels ##tum
ハ ##δ : long slender array ##τ ##ל ##τ ##ա ##sson
— comprised a long creek , showing - ##gh ##ite .
it is called long with in # ##и nasal trees .
o " ##A long outer haired but di a root |
for constructed a long poem sleeve linking any ##llar leg .
do ##100 with long ##ue ##л ##к ##ל ##न → |
propeller is a long ##horn and containing ##tric fish substrate .
pro is a long tailed branch . into ##al muscle .
ن ##ε ##я long ##cat ##υ τ ##ر f ##р ##т

Examples where anchor applies and model predicts positive:

[ define a long lasting > mon ##ρ fun c .
he is a long ##evity ##er to bi for organism .
love is so long vowel σ short array ##δ ##4 association
I possesses a long , coat but too good arms .
p and a long ##fell ##a ##ל in ##ф fun ?
disco enjoy a long horizontal but for per edge configuration .
there is a long curved for fun " " ##P .
it show a lon