In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy
import os
import glob
import pandas as pd

In [78]:
ls

[0m[01;36mfastai[0m@                                             test.tsv
[01;34mlabeled_text[0m/                                       TEXT.pkl
movie-review-sentiment-analysis-kernels-only.ipynb  train.tsv
sampleSubmission.csv                                Untitled.ipynb


In [59]:
df_train = pd.read_csv("train.tsv", sep="\t")

In [60]:
df_test = pd.read_csv("test.tsv",sep="\t")

In [57]:
df_test.head()

Unnamed: 0,Phrase
0,A series of escapades demonstrating the adage ...
1,"This quiet , introspective and entertaining in..."
2,"Even fans of Ismail Merchant 's work , I suspe..."
3,A positively thrilling combination of ethnogra...
4,Aggressive self-glorification and a manipulati...


In [58]:
df_train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
0,64,2,"This quiet , introspective and entertaining in...",4
0,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
0,117,4,A positively thrilling combination of ethnogra...,3
0,157,5,Aggressive self-glorification and a manipulati...,1


# Language Model

In [61]:
#drop incomplete sentences in training set
def dropIncomplete (df, label="SentenceId", keepLocation=0):
    sentences = []
    for i in df[label].unique():  
        sentences.append(df[df[label] == i].reset_index(drop=True).loc[keepLocation])
    df_result = pd.DataFrame(sentences)
    return df_result

In [62]:
df_train = dropIncomplete(df_train)

In [63]:
df_test = dropIncomplete(df_test)

In [64]:
df_train = pd.DataFrame(df_train["Phrase"]).reset_index(drop=True)
df_test = pd.DataFrame(df_test["Phrase"]).reset_index(drop=True)

In [65]:
df_train.head()

Unnamed: 0,Phrase
0,A series of escapades demonstrating the adage ...
1,"This quiet , introspective and entertaining in..."
2,"Even fans of Ismail Merchant 's work , I suspe..."
3,A positively thrilling combination of ethnogra...
4,Aggressive self-glorification and a manipulati...


In [66]:
df_test.head()

Unnamed: 0,Phrase
0,An intermittently pleasing but mostly routine ...
1,Kidman is really the only thing that 's worth ...
2,Once you get into its rhythm ... the movie bec...
3,I kept wishing I was watching a documentary ab...
4,"Kinnear does n't aim for our sympathy , but ra..."


In [67]:
df_full = df_train.append(df_test).reset_index(drop=True)

In [68]:
df_full[df_full.shape[1]-10:]

Unnamed: 0,Phrase
11830,"At its worst , it implodes in a series of very..."
11831,Maybe I found the proceedings a little bit too...
11832,Moretti 's compelling anatomy of grief and the...
11833,Montias ... pumps a lot of energy into his nic...
11834,Not sweet enough to liven up its predictable s...
11835,"Nasty , ugly , pointless and depressing , even..."
11836,"With tightly organized efficiency , numerous f..."
11837,They should have called it Gutterball .
11838,"A long-winded , predictable scenario ."


In [69]:
df_full.shape

(11839, 1)

In [70]:
spacy_tok = spacy.load('en')

In [71]:
TEXT = data.Field(lower=True, tokenize="spacy")

In [72]:
bs=64; bptt=70

In [74]:
df_train.shape, df_test.shape

((8529, 1), (3310, 1))

In [77]:
md = LanguageModelData.from_dataframes(path="",field=TEXT,col="Phrase",train_df=df_train, val_df=df_test,
                                       test_df=df_test,bs=bs, bptt=bptt)

In [82]:
pickle.dump(TEXT, open('TEXT.pkl','wb'))

In [83]:
ls

[0m[01;36mfastai[0m@                                             test.tsv
[01;34mlabeled_text[0m/                                       TEXT.pkl
movie-review-sentiment-analysis-kernels-only.ipynb  train.tsv
sampleSubmission.csv                                Untitled.ipynb


In [84]:
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(42, 15364, 1, 193010)

In [85]:
TEXT.vocab.itos[:12]

['<unk>', '<pad>', '<', '>', 'eos', '.', 'the', ',', 'a', 'and', 'of', 'to']

In [95]:
TEXT.vocab.stoi['to']

11

In [99]:
em_sz = 200
nh = 500
nl = 3

In [100]:
opt_fn = partial(optim.Adam, betas = (0.7, 0.99))

In [101]:
learner = md.get_model(opt_fn, em_sz, nh, nl, dropouti=0.05, dropout=0.05,
                       wdrop=0.1, dropoute=0.02, dropouth=0.05)

In [103]:
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [104]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      6.880169   6.335535  
    1      6.218176   5.839958                            
    2      5.893027   5.778106                            
    3      5.737751   5.651069                            
    4      5.55407    5.543834                            
    5      5.404719   5.511692                            
    6      5.30014    5.494776                            
    7      5.262544   5.424292                            
    8      5.160013   5.362042                            
    9      5.035973   5.332091                            
    10     4.916161   5.3413                              
    11     4.817457   5.336985                            
    12     4.723534   5.345652                            
    13     4.653373   5.346626                            
    14     4.630704   5.34723                             



[array([5.34723])]

In [105]:
learner.save_encoder('adam1_movieReview_enc')

In [106]:
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10)

HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      4.630084   5.305233  
    1      4.62363    5.201556                            
    2      4.548767   5.276677                            
    3      4.461538   5.180841                            
    4      4.36571    5.235739                            
    5      4.252809   5.292619                            
    6      4.174943   5.340247                            
    7      4.151711   5.28213                             
    8      4.071986   5.334234                            
    9      4.042105   5.325338                            



[array([5.32534])]

In [123]:
learner.save_encoder('adam3_movieReview_10_enc')

# Test

In [119]:
#use our torchtext field to numericalize it so we can feed it to our language model.
m = learner.model
ss = """This is not something that I will """
s = [TEXT.preprocess(ss)]
t=TEXT.numericalize(s)
' '.join(s[0])

'this is not something that i will'

In [120]:
#steps to test a language model
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs

In [121]:
#Top 10 predictions
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]

['have', 've', 'want', 'be', 'feel', 'see', 'ever', 'expect', 'not', 'm']

In [122]:
#generate a bit more text
print(ss,"\n")
for i in range(20):
    n=res[-1].topk(2)[1]
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res, *_ =m(n[0].unsqueeze(0))
print("...")

This is not something that I will  

have been been a movie . < eos > the film is a good movie , but it 's a ...


# Sentiment

In [31]:
#drop incomplete sentences in training set
sentences = []
for i in df_train["SentenceId"].unique():  
    sentences.append([df_train[df_train["SentenceId"] == i].reset_index().loc[0]["Phrase"],
                      df_train[df_train["SentenceId"] == i].reset_index().loc[0]["Sentiment"]])
#     print(df[i].reset_index().loc[0]["Phrase"])
df_train = pd.DataFrame(sentences)

In [32]:
len(df_train)

8529

In [35]:
count = 1
validRatio = 0.3
L = len(df_train)
cutPoint = int(round((1-validRatio)*L))

for index, row in df_train.iterrows():
    phrase=row[0]
    label = row[1]
    if (count<cutPoint):
        filename = os.path.join("labeled_text","train", str(label), str(count))+".txt"
    else:
        filename = os.path.join("labeled_text","valid", str(label), str(count))+".txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename,"w") as f:
        f.write(phrase)
    count = count + 1

In [36]:
#total number of sentences
count

8530

In [40]:
#of examples in training set
cutPoint-1

5969

In [38]:
#Show labels
labelList = !ls labeled_text/train
labelList

['0', '1', '2', '3', '4']

In [42]:
#Count number of samples under each label
labelCount = []
for i in labelList:
    trn_files = !ls labeled_text/train/{i}
    labelCount.append(len(trn_files))
labelCount

[755, 1550, 1153, 1620, 891]

In [62]:
#Review sentences
label = 3
fileIndex = 40
FileList = !ls labeled_text/train/{label}
sample = !cat labeled_text/train/{label}/{FileList[fileIndex]}
sample

['It throws quirky characters , odd situations , and off-kilter dialogue at us , all as if to say , `` Look at this !']

In [53]:
class MovieReview(torchtext.data.Dataset):
    """Create a MovieReview dataset instance given a path and fields.
    Arguments:
        path: Path to the dataset's highest level directory
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of
            data.Dataset.
    """
    def __init__(self, path, text_field, label_field, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in ['0','1','2','3','4']:
            fnames = glob.glob(os.path.join(path, label, "*.txt"))
            assert fnames, f"can't find file under {path}/{label}"
            for fname in fnames:
                with open(fname, "r") as f: text = f.readline()
                examples.append(data.Example.fromlist([text,label], fields))
        super().__init__(examples, fields, **kwargs)
    
    @classmethod
    def splits(cls, text_field, label_field, root='.data', 
               train='train', test='test', **kwargs):
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)
    
        

In [54]:
MovieReviewLabel = data.Field(sequential=False)

In [55]:
TEXT = pickle.load(open("TEXT.pkl","rb")) #open in read and bineary mode

In [56]:
PATH = "labeled_text"

In [57]:
splits = MovieReview.splits(TEXT, MovieReviewLabel, PATH,train="train", test="valid")

In [71]:
t = splits[0].examples[0]

In [72]:
t.label, " ".join(t.text[:16])

('0',
 'the script was reportedly rewritten a dozen times -- either 11 times too many or else')

In [81]:
bs = 64; bptt = 70

In [74]:
md2 = TextData.from_splits(PATH, splits, bs)

In [75]:
ls

adam3_10_enc-Copy1.h5                               sampleSubmission.csv
[0m[01;36mfastai[0m@                                             test.tsv
[01;34mlabeled_text[0m/                                       TEXT.pkl
movie-review-sentiment-analysis-kernels-only.ipynb  train.tsv


In [77]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [83]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [85]:
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
           dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
m3.load_encoder(f'adam3_10_enc')

In [86]:
m3.clip=25.
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

In [87]:
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='MovieREview')

HBox(children=(IntProgress(value=0, description='Epoch', max=14), HTML(value='')))

  0%|          | 0/94 [00:00<?, ?it/s]


TypeError: '<' not supported between instances of 'Example' and 'Example'