In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.nlp import *
from sklearn.linear_model import LogisticRegression
import os
import glob

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

## Helper Functions

In [20]:
def texts_labels_from_folders(path, folders):
    texts, labels = [], []
    for idx, label in enumerate(folders):
        for fname in glob(os.path.join(path, label, '*.*')):
            texts.append(open(fname, 'r').read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int64)
    

## IMDB dataset and the sentiment classification task

In [10]:
PATH='data/aclImdb/aclImdb/'
names=['neg', 'pos']

In [11]:
%ls {PATH}

README      imdb.vocab  imdbEr.txt  [34mtest[m[m/       [34mtrain[m[m/


In [12]:
%ls {PATH}train

labeledBow.feat  [34mpos[m[m/             unsupBow.feat    urls_pos.txt
[34mneg[m[m/             [34munsup[m[m/           urls_neg.txt     urls_unsup.txt


In [13]:
%ls {PATH}train/pos | head

0_9.txt
10000_8.txt
10001_10.txt
10002_7.txt
10003_8.txt
10004_8.txt
10005_7.txt
10006_7.txt
10007_7.txt
10008_7.txt


In [21]:
trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)

In [16]:
trn[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [17]:
trn_y[0]

0

In [23]:
veczr = CountVectorizer(tokenizer=tokenize)

In [25]:
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [26]:
trn_term_doc

<25000x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 3749745 stored elements in Compressed Sparse Row format>

In [27]:
trn_term_doc[0]

<1x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 93 stored elements in Compressed Sparse Row format>

In [28]:
vocab = veczr.get_feature_names(); vocab[5000:5005]

['aussie', 'aussies', 'austen', 'austeniana', 'austens']

In [29]:
w0 = set([o.lower() for o in trn[0].split(' ')]); w0

{'a',
 'absurd',
 'an',
 'and',
 'audience',
 'be',
 'better',
 'briefly.',
 'by',
 'can',
 'chantings',
 'cinematography',
 'comedy.',
 'crazy',
 'cryptic',
 'dialogue',
 'easy',
 'era',
 'even',
 'eventually',
 'example',
 'feelings',
 'for',
 'formal',
 'forrest',
 'frederic',
 'from',
 'future',
 'general',
 'good',
 'grader.',
 'great',
 'has',
 'insane,',
 'into',
 'is',
 'it',
 "it's",
 'just',
 'kirkland',
 'level',
 'make',
 'making',
 'man',
 'might',
 'mob',
 'narrative',
 'no',
 'of',
 'off',
 'off.',
 'on',
 'opening',
 'orchestra',
 'out',
 'pig.',
 'putting.',
 'sally',
 'scene',
 'seem',
 'seen',
 'shakespeare',
 'should',
 'singers.',
 'some',
 'stars',
 'starts',
 'stays',
 'story',
 'technical',
 'terrific',
 'than',
 'that',
 'the',
 'think',
 'third',
 'those',
 'time',
 'to',
 'too',
 'turned',
 'unfortunately',
 'unnatural',
 'vilmos',
 'violent',
 'who',
 'whole',
 'with',
 'would',
 'you',
 'zsigmond.'}

In [30]:
veczr.vocabulary_['absurd']

1297

In [31]:
trn_term_doc[0,1297]

2

In [33]:
trn_term_doc[0,5000]

0

## Naive Bayes

In [36]:
x=trn_term_doc
y=trn_y

p = x[y==1].sum(0)+1
q = x[y==0].sum(0)+1
r = np.log((p/p.sum()) / (q/q.sum()))
b = np.log(len(p)/len(q))

In [37]:
pre_preds = val_term_doc @ r.T + b
preds = pre_preds.T > 0
(preds==val_y).mean()

0.8074

In [38]:
pre_preds = val_term_doc.sign() @ r.T + b
preds = pre_preds.T > 0
(preds==val_y).mean()

0.82624

## Logistic Regression

In [42]:
m = LogisticRegression(C=1e8, dual=True)
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()



0.85628

In [44]:
m = LogisticRegression(C=1e8, dual=True)
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()



0.85512

In [45]:
m = LogisticRegression(C=0.1, dual=True)
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()

0.88404

## Trigram with NB features

In [53]:
def pr(y_i):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [46]:
veczr =  CountVectorizer(ngram_range=(1,3), tokenizer=tokenize,
                         max_features=800000)
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [48]:
trn_term_doc.shape

(25000, 800000)

In [49]:
vocab = veczr.get_feature_names()
vocab[200000:200005]

['by vast', 'by vengeance', 'by vengeance .', 'by vera', 'by vera miles']

In [51]:
y=trn_y
x=trn_term_doc.sign()
val_x = val_term_doc.sign()

In [54]:
r = np.log(pr(1) / pr(0))
b = np.log((y==1).mean() / (y==0).mean())

In [55]:
m = LogisticRegression(C=0.1, dual=True)
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()

0.87872

In [56]:

np.exp(r)

matrix([[0.94678, 0.85129, 0.78049, ..., 3.     , 0.5    , 0.5    ]])

In [57]:
x_nb = x.multiply(r)
m = LogisticRegression(dual=True, C=0.1)
m.fit(x_nb, y);

val_x_nb = val_x.multiply(r)
preds = m.predict(val_x_nb)
(preds.T==val_y).mean()

0.91768

## FastAI NBSVM++

In [58]:
sl=2000

In [59]:
md = TextClassifierData.from_bow(trn_term_doc, trn_y, val_term_doc, val_y, sl)

In [60]:
learner = md.dotprod_nb_learner()
learner.fit(0.02, 1, wds=1e-6, cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss   <lambda>   
    0      0.02215    0.118981   0.91712   



[0.11898146554470063, 0.917120000038147]