In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy
import os
import glob
import pandas as pd

In [2]:
#Make
ls

[0m[01;36mfastai[0m@                                             test.tsv
[01;34mlabeled_text[0m/                                       TEXT.pkl
[01;34mmodels[0m/                                             [01;34mtmp[0m/
movie-review-sentiment-analysis-kernels-only.ipynb  train.tsv
sampleSubmission.csv                                Untitled.ipynb


# Data prepartion

In [6]:
df_train = pd.read_csv("train.tsv", sep="\t")
df_test = pd.read_csv("test.tsv",sep="\t")

In [7]:
#drop incomplete sentences in training set
def dropIncomplete (df, label="SentenceId", keepLocation=0):
    sentences = []
    for i in df[label].unique():  
        sentences.append(df[df[label] == i].reset_index(drop=True).loc[keepLocation])
    df_result = pd.DataFrame(sentences)
    return df_result

In [8]:
df_train = dropIncomplete(df_train)

In [9]:
df_test = dropIncomplete(df_test)

In [10]:
df_train = pd.DataFrame(df_train["Phrase"]).reset_index(drop=True)
df_test = pd.DataFrame(df_test["Phrase"]).reset_index(drop=True)

In [11]:
df_train.head()

Unnamed: 0,Phrase
0,A series of escapades demonstrating the adage ...
1,"This quiet , introspective and entertaining in..."
2,"Even fans of Ismail Merchant 's work , I suspe..."
3,A positively thrilling combination of ethnogra...
4,Aggressive self-glorification and a manipulati...


In [12]:
df_test.head()

Unnamed: 0,Phrase
0,An intermittently pleasing but mostly routine ...
1,Kidman is really the only thing that 's worth ...
2,Once you get into its rhythm ... the movie bec...
3,I kept wishing I was watching a documentary ab...
4,"Kinnear does n't aim for our sympathy , but ra..."


# Load and test imdb language model

In [25]:
#Preparing to load model
spacy_tok = spacy.load('en')
TEXT = pickle.load(open("TEXT_imdb.pkl","rb")) #TEXT field from imdb data set
bs=64; bptt=70
md = LanguageModelData.from_dataframes(path="labeled_text",field=TEXT,col="Phrase",train_df=df_train, val_df=df_test,
                                       test_df=df_test,bs=bs, bptt=bptt)

In [26]:
em_sz = 200
nh = 500
nl = 3
opt_fn = partial(optim.Adam, betas = (0.7, 0.99))

In [27]:
learner = md.get_model(opt_fn, em_sz, nh, nl, dropouti=0.05, dropout=0.05,
                       wdrop=0.1, dropoute=0.02, dropouth=0.05)

In [28]:
#Load pretrained enconder from imdb dataset
learner.load_encoder('adam3_imdb_10_enc')

In [32]:
#use our torchtext field to numericalize it so we can feed it to our language model.
m = learner.model
ss = """This is not something that I will have watched"""
s = [TEXT.preprocess(ss)]
t=TEXT.numericalize(s)
' '.join(s[0])

'this is not something that i will have watched'

In [33]:
#steps to test a language model
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs

In [34]:
#Top 10 predictions
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]

['before', 'over', '.', 'in', 'again', 'for', 'on', ',', 'since', 'at']

In [35]:
#generate a bit more text
print(ss,"\n")
for i in range(20):
    n=res[-1].topk(2)[1]
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    res, *_ =m(n[0].unsqueeze(0))
print("...")

This is not something that I will have watched 

before the end of the film . the film is a bit of a mess , but it 's a ...


# Sentiment

In [54]:
#Process training set data with sentiment label
df_trainWLabels = pd.read_csv("train.tsv", sep="\t")
df_trainWLabels = dropIncomplete(df_trainWLabels)
df_trainWLabels.drop(["PhraseId", "SentenceId"], axis = 1,inplace=True)
df_trainWLabels.reset_index(inplace=True, drop=True)
df_trainWLabels.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,"This quiet , introspective and entertaining in...",4
2,"Even fans of Ismail Merchant 's work , I suspe...",1
3,A positively thrilling combination of ethnogra...,3
4,Aggressive self-glorification and a manipulati...,1


In [55]:
len(df_trainWLabels)

8529

In [67]:
folderPath = "labeled_text"

In [68]:
def IntoLabeledFolders(df, folderPath, validRatio=0.3, contentIndex=0, labelIndex=1):
    count = 1
    L = len(df)
    cutPoint = int(round((1-validRatio)*L))

    for index, row in df.iterrows():
        phrase=row[contentIndex]
        label = row[labelIndex]
        if (count<cutPoint):
            filename = os.path.join(folderPath,"train", str(label), str(count))+".txt"
        else:
            filename = os.path.join(folderPath,"valid", str(label), str(count))+".txt"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename,"w") as f:
            f.write(phrase)
        count = count + 1

In [69]:
IntoLabeledFolders(df=df_trainWLabels, folderPath = folderPath)

In [70]:
#Show labels ###Start here if already created labeled folders
labelList = !ls labeled_text/train
labelList

['0', '1', '2', '3', '4']

In [71]:
#Count number of samples under each label
labelCount = []
for i in labelList:
    trn_files = !ls labeled_text/train/{i}
    labelCount.append(len(trn_files))
labelCount

[755, 1550, 1153, 1620, 891]

In [72]:
#Review sentences
label = 3
fileIndex = 40
FileList = !ls labeled_text/train/{label}
sample = !cat labeled_text/train/{label}/{FileList[fileIndex]}
sample

['It throws quirky characters , odd situations , and off-kilter dialogue at us , all as if to say , `` Look at this !']

In [73]:
#Function to create torch text object
class MovieReview(torchtext.data.Dataset):
    """Create a MovieReview dataset instance given a path and fields.
    Arguments:
        path: Path to the dataset's highest level directory
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of
            data.Dataset.
    """
    def __init__(self, path, text_field, label_field, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in ['0','1','2','3','4']:
            fnames = glob.glob(os.path.join(path, label, "*.txt"))
            assert fnames, f"can't find file under {path}/{label}"
            for fname in fnames:
                with open(fname, "r") as f: text = f.readline()
                examples.append(data.Example.fromlist([text,label], fields))
        super().__init__(examples, fields, **kwargs)
    
    @staticmethod
    def sort_key(ex): return len(ex.text)
    
    @classmethod
    def splits(cls, text_field, label_field, root='.data', 
               train='train', test='test', **kwargs):
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)
    
        

In [74]:
MovieReviewLabel = data.Field(sequential=False)

In [75]:
TEXT = pickle.load(open("TEXT_imdb.pkl","rb")) #open in read and bineary mode

In [76]:
PATH = "labeled_text"

In [77]:
splits = MovieReview.splits(TEXT, MovieReviewLabel, PATH,train="train", test="valid")

In [78]:
t = splits[0].examples[5]

In [79]:
t.label, " ".join(t.text[:30])

('0',
 "godard 's ode to tackling life 's wonderment is a rambling and incoherent manifesto about the vagueness of topical excess ... in praise of love remains a ponderous and pretentious")

In [80]:
bs = 64; bptt = 70

In [81]:
md2 = TextData.from_splits(PATH, splits, bs)

In [82]:
ls

[0m[01;36mfastai[0m@
[01;34mlabeled_text[0m/
[01;34mmodels[0m/
movie-review-sentiment-analysis-kernels-only.ipynb
movie-review-sentiment-analysis-kernels-only-transferred-model.ipynb
sampleSubmission.csv
test.tsv
TEXT_imdb.pkl
TEXT.pkl
[01;34mtmp[0m/
train.tsv


In [83]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [84]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [85]:
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
           dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
m3.load_encoder('adam3_imdb_10_enc')

In [86]:
m3.clip=25.
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])

In [87]:
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='MovieREview')

HBox(children=(IntProgress(value=0, description='Epoch', max=14), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                 
    0      1.47961    1.324243   0.426232  
    1      1.387777   1.273014   0.448345                 
    2      1.381538   1.287076   0.429342                 
    3      1.339347   1.249393   0.448537                 
    4      1.355395   1.232047   0.455236                 
    5      1.314312   1.228462   0.458077                 
    6      1.336506   1.226022   0.457905                 
    7      1.303152   1.228552   0.458711                 
    8      1.320887   1.236427   0.448844                 
    9      1.288927   1.231241   0.449497                 
    10     1.290093   1.24058    0.449958                 
    11     1.27969    1.215563   0.461302                 
    12     1.287484   1.255894   0.443643                 
    13     1.272589   1.225182   0.455505                 



[array([1.22518]), 0.4555052211302211]