In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [88]:
from collections import Counter
from fastai.text import untar_data, URLs, TextList
import numpy as np
import pandas as pd
import scipy
from sklearn.linear_model import LogisticRegression
# import sklearn.feature_extraction.text as sklearn_text

In [8]:
path = untar_data(URLs.IMDB_SAMPLE)
path

PosixPath('/Users/HenryDashwood/.fastai/data/imdb_sample')

In [11]:
df = pd.read_csv(path/'texts.csv')
df.head()

Unnamed: 0,label,text,is_valid
0,negative,Un-bleeping-believable! Meg Ryan doesn't even ...,False
1,positive,This is a extremely well-made film. The acting...,False
2,negative,Every once in a long while a movie will come a...,False
3,positive,Name just says it all. I watched this movie wi...,False
4,negative,This movie succeeds at being one of the most u...,False


In [14]:
movie_reviews = (TextList.from_csv(path, 'texts.csv', cols='text')
                         .split_from_df(col=2)
                         .label_from_df(cols=0))

In [19]:
print(movie_reviews.train.x[0]) 
print(movie_reviews.train.y[0])

xxbos xxmaj un - xxunk - believable ! xxmaj meg xxmaj ryan does n't even look her usual xxunk lovable self in this , which normally makes me forgive her shallow xxunk acting xxunk . xxmaj hard to believe she was the producer on this dog . xxmaj plus xxmaj kevin xxmaj kline : what kind of suicide trip has his career been on ? xxmaj xxunk ... xxmaj xxunk ! ! ! xxmaj finally this was directed by the guy who did xxmaj big xxmaj xxunk ? xxmaj must be a replay of xxmaj jonestown - hollywood style . xxmaj xxunk !
negative


In [25]:
print(len(movie_reviews.train.x), len(movie_reviews.valid.x))
print(len(movie_reviews.vocab.itos), len(movie_reviews.vocab.stoi))
print(movie_reviews.vocab.stoi['language'], movie_reviews.vocab.itos[917])

800 200
6008 19161
917 language


**See how counter works**

In [32]:
c = Counter([4,2,8,8,4,8])
print(c)
print(c.keys())
print(c.values())

Counter({8: 3, 4: 2, 2: 1})
dict_keys([4, 2, 8])
dict_values([2, 1, 3])


In [34]:
# Counter((movie_reviews.valid.x)[0].data)

In [35]:
def get_term_doc_matrix(label_list, vocab_len):
    j_indices = []
    indptr = []
    values = []
    indptr.append(0)

    for i, doc in enumerate(label_list):
        feature_counter = Counter(doc.data)
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))
        
    return scipy.sparse.csr_matrix(
        (values, j_indices, indptr), 
        shape=(len(indptr) - 1, vocab_len), 
        dtype=int
    )

In [41]:
trn_term_doc = get_term_doc_matrix(movie_reviews.train.x, len(movie_reviews.vocab.itos))
val_term_doc = get_term_doc_matrix(movie_reviews.valid.x, len(movie_reviews.vocab.itos))

**Get positive review with the word hated in it**

In [56]:
a = np.argwhere((x[:,movie_reviews.vocab.stoi['hated']] > 0))[:,0]
b = np.argwhere(y.items==positive)[:,0]
set(a).intersection(set(b))

{393, 612, 695}

In [55]:
review = movie_reviews.train.x[695]
review.text

"xxbos xxmaj xxunk , yeah this episode is extremely underrated . \n \n  xxmaj even though there is a xxup lot of bad writing and acting at parts . i think the good over wins the bad . \n \n  i love the xxunk parts and the big ' twist ' at the end . i absolutely love that scene when xxmaj michelle xxunk xxmaj tony . xxmaj it 's actually one of my favorite scenes of xxmaj season 1 . \n \n  xxmaj for some reason , people have always hated the xxmaj xxunk episodes , yet i have always liked them . xxmaj they 're not the best , in terms of writing . but the theme really does interest me , \n \n  i 'm gon na give it a xxup three star , but if the writing were a little more consistent i 'd give it xxup four ."

# Naive Bayes

In [45]:
x = trn_term_doc
y = movie_reviews.train.y
val_y = movie_reviews.valid.y

In [46]:
positive = y.c2i['positive']
negative = y.c2i['negative']

In [48]:
p1 = np.squeeze(np.asarray(x[y.items==positive].sum(0)))
p0 = np.squeeze(np.asarray(x[y.items==negative].sum(0)))

In [57]:
pr1 = (p1+1) / ((y.items==positive).sum() + 1)
pr0 = (p0+1) / ((y.items==negative).sum() + 1)

In [58]:
r = np.log(pr1/pr0); r

array([-0.015487,  0.084839,  0.      ,  0.084839, ...,  1.471133, -1.301455, -1.301455, -1.301455])

In [59]:
biggest = np.argpartition(r, -10)[-10:]
smallest = np.argpartition(r, 10)[:10]

In [61]:
[movie_reviews.vocab.itos[k] for k in biggest]

['sport',
 'davies',
 'gilliam',
 'fanfan',
 'biko',
 'felix',
 'noir',
 'jabba',
 'astaire',
 'jimmy']

In [63]:
np.argmax(trn_term_doc[:,movie_reviews.vocab.stoi['biko']])

515

In [67]:
# movie_reviews.train.x[515]

In [68]:
(y.items==positive).mean(), (y.items==negative).mean()

(0.47875, 0.52125)

In [73]:
b = np.log((y.items==positive).mean() / (y.items==negative).mean())

In [74]:
preds = (val_term_doc @ r + b) > 0

In [75]:
(preds == val_y.items).mean()

0.645

# Binarised naive bayes

In [77]:
x = trn_term_doc.sign()
y = movie_reviews.train.y

In [78]:
x.todense()[:10,:10]

matrix([[1, 0, 1, 0, ..., 0, 0, 0, 1],
        [1, 0, 1, 0, ..., 1, 0, 0, 1],
        [1, 0, 1, 0, ..., 1, 0, 0, 1],
        [1, 0, 1, 0, ..., 0, 0, 0, 1],
        ...,
        [1, 0, 1, 0, ..., 0, 0, 0, 1],
        [1, 0, 1, 0, ..., 1, 0, 0, 1],
        [1, 0, 1, 0, ..., 0, 0, 0, 1],
        [1, 0, 1, 0, ..., 1, 0, 0, 1]])

In [80]:
positive = y.c2i['positive']
negative = y.c2i['negative']

In [81]:
p1 = np.squeeze(np.asarray(x[y.items==positive].sum(0)))
p0 = np.squeeze(np.asarray(x[y.items==negative].sum(0)))

In [83]:
pr1 = (p1+1) / ((y.items==positive).sum() + 1)
pr0 = (p0+1) / ((y.items==negative).sum() + 1)

In [84]:
r = np.log(pr1/pr0)
b = np.log((y.items==positive).mean() / (y.items==negative).mean())

preds = (val_term_doc.sign() @ r + b) > 0

In [87]:
(preds==val_y.items).mean()

0.695

# Logistic regression

In [90]:
m = LogisticRegression(C=0.1, dual=True)
m.fit(x, y.items.astype(int))
preds = m.predict(val_term_doc)
(preds==val_y.items).mean()

0.685

In [92]:
m = LogisticRegression(C=0.1, dual=True)
m.fit(trn_term_doc.sign(), y.items.astype(int))
preds = m.predict(val_term_doc.sign())
(preds==val_y.items).mean()

0.83

# Trigram with NB features