This IPython Notebook is written to demonstrate a simple RNN for sentiment classification using Dynamic Neural Network (DyNet) Toolkit.  You can refer this <a href="https://arxiv.org/pdf/1701.03980.pdf">arXiv Paper</a> to understand the architecture behind the framework.

<h3> Import Necessary Libraries </h3>

In [10]:
import dynet as dy
import pandas as pd
import numpy as np
import string
import re
import time
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

<h3> Import Data </h3>

In [11]:
reviews = pd.read_csv("../data/reviews.txt",header=None)
labels = pd.read_csv("../data/labels.txt",header=None)
word2int = defaultdict(lambda: len(word2int))
labels2int = defaultdict(lambda: len(labels2int))
print (reviews.shape)
print (labels.shape)
print (reviews.head())

(25000, 1)
(25000, 1)
                                                   0
0  bromwell high is a cartoon comedy . it ran at ...
1  story of a man who has unnatural feelings for ...
2  homelessness  or houselessness as george carli...
3  airport    starts as a brand new luxury    pla...
4  brilliant over  acting by lesley ann warren . ...


<h3> Preprocess and Encode Data </h3>

In [12]:
stopwords = stopwords.words('english')
punctuations = string.punctuation
def clean(data):
    new_data = []
    wordnet_lemmatizer = WordNetLemmatizer()
    for doc in data:
        tokens = [tok.strip().lower() for tok in doc[0].split(' ')]
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = [wordnet_lemmatizer.lemmatize(token,pos='v') for token in tokens]
        tokens = [word2int[tok] for tok in tokens]
        new_data.append(tokens)
    return new_data
reviews = clean(reviews.values)
labels = [labels2int[label[0]] for label in labels.values]
print (reviews[0])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 15, 16, 17, 18, 9, 19, 20, 21, 22, 23, 24, 25, 26, 9, 27, 28, 29, 30, 31, 7, 32, 23, 33, 34, 35, 36, 37, 38, 7, 39, 40, 1, 41, 42, 43, 44, 45, 9, 35, 46, 0, 1, 47, 48, 49, 50, 51, 0, 1, 52, 53, 54]


<h3> Split Data as Train, Dev and Test </h3>

In [13]:
nwords = len(word2int)
nlabels = len(labels2int)
train_x, test_x = reviews[0:int(0.8*len(reviews))],reviews[int(0.8*len(reviews)):len(reviews)]
train_y, test_y = labels[0:int(0.8*len(labels))],labels[int(0.8*len(labels)):len(labels)]
train_x, dev_x = train_x[0:int(0.8*len(train_x))],train_x[int(0.8*len(train_x)):len(train_x)]
train_y, dev_y = train_y[0:int(0.8*len(train_y))],train_y[int(0.8*len(train_y)):len(train_y)]

print ("Train reviews : %d" %(len(train_x)))
print ("Train labels : %d" %(len(train_y)))
print ("Dev reviews : %d" %(len(dev_x)))
print ("Dev labels : %d" %(len(dev_y)))
print ("Test reviews : %d" %(len(test_x)))
print ("Test labels : %d" %(len(test_y)))

Train reviews : 16000
Train labels : 16000
Dev reviews : 4000
Dev labels : 4000
Test reviews : 5000
Test labels : 5000


<h3> Define Model and Hyperparameters </h3>

In [14]:
model = dy.Model()
trainer = dy.AdamTrainer(model)

EMBED_DIM = 50
HIDDEN_DIM = 50
Word_emb = model.add_lookup_parameters((nwords, EMBED_DIM))
RNN = dy.SimpleRNNBuilder(1, EMBED_DIM, HIDDEN_DIM, model)
W_sm = model.add_parameters((nlabels, HIDDEN_DIM))
b_sm = model.add_parameters((nlabels))

<h3> Define Feedforward Manipulation </h3>

In [15]:
def feed_forward(words):
    dy.renew_cg()
    word_embs = [dy.lookup(Word_emb, x) for x in words]
    rnn_init = RNN.initial_state()
    act_embs = rnn_init.transduce(word_embs)
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    return W_sm_exp * act_embs[-1] + b_sm_exp

<h3> Train and Validate the Model</h3>

In [16]:
for ITER in range(15):
    train = list(zip(train_x, train_y))
    random.shuffle(train)
    train_x, train_y = zip(*train)
    train_loss = 0.0
    start = time.time()
    for index in range(len(train_x)):
        m_loss = dy.pickneglogsoftmax(feed_forward(train_x[index]), train_y[index])
        train_loss += m_loss.value()
        m_loss.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.2f, time=%.2fs" % (ITER, train_loss / len(train_x), time.time() - start))
    
    dev_correct = 0.0
    for index in range(len(dev_x)):
        scores = feed_forward(dev_x[index])
        predict = np.argmax(scores.value())
        if predict == dev_y[index]:
            dev_correct += 1
    print("iter %r: dev acc=%.2f" % (ITER, dev_correct / len(dev_y)))

The dy.parameter(...) call is now DEPRECATED.
        There is no longer need to explicitly add parameters to the computation graph.
        Any used parameter will be added automatically.
iter 0: train loss/sent=0.62, time=25.26s
iter 0: dev acc=0.77
iter 1: train loss/sent=0.47, time=23.86s
iter 1: dev acc=0.76
iter 2: train loss/sent=0.41, time=22.96s
iter 2: dev acc=0.73
iter 3: train loss/sent=0.34, time=24.13s
iter 3: dev acc=0.73
iter 4: train loss/sent=0.31, time=25.73s
iter 4: dev acc=0.75
iter 5: train loss/sent=0.27, time=26.43s
iter 5: dev acc=0.75
iter 6: train loss/sent=0.26, time=24.77s
iter 6: dev acc=0.71
iter 7: train loss/sent=0.25, time=23.74s
iter 7: dev acc=0.73
iter 8: train loss/sent=0.23, time=23.52s
iter 8: dev acc=0.71
iter 9: train loss/sent=0.23, time=22.66s
iter 9: dev acc=0.74
iter 10: train loss/sent=0.22, time=23.15s
iter 10: dev acc=0.74
iter 11: train loss/sent=0.22, time=22.91s
iter 11: dev acc=0.71
iter 12: train loss/sent=0.22, time=23.14s
iter 12:

<h3> Test the Model </h3>

In [17]:
test_correct = 0.0
for index in range(len(test_x)):
    scores = feed_forward(test_x[index])
    predict = np.argmax(scores.value())
    if predict == test_y[index]:
        test_correct += 1
print("Test acc=%.2f" % ( test_correct / len(test_y)))

Test acc=0.72
