In [2]:
## IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
import nltk
nltk.download('punkt')
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
#glove = torchtext.vocab.GloVe(name="6B", dim=100, max_vectors=40000)

from google.colab import drive  
drive.mount('/content/drive/')
PATH = '/content/drive/My Drive/Colab/HW4/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Mounted at /content/drive/


In [None]:
print(torch.cuda.device_count())
print(torch.__version__)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
CUDA_LAUNCH_BLOCKING=1

1
1.8.0+cu101
cuda:0


**PROBLEM 2**

In [3]:
# create sequences of length 5 tokens
def create_seq(sent, max_len_seq = 5):
    sequences = []
    if len(sent) > max_len_seq: # if the number of tokens in 'text' is greater than 5
      for i in range(max_len_seq, len(sent)):
        seq = sent[i-max_len_seq:i+1] # select sequence of tokens
        #print("seq: {}".format(seq))
        sequences.append(seq) # add to the list
      return sequences
    else: # if the number of tokens in 'text' is equal to 5
      return [sent]
def split_sequences(text, max_len_seq):
    ## Sentences to sequences of max (sent dimension will be replaced by flattening obtaining only sequences)
    text = [create_seq(sent, max_len_seq) for sent in text]
    text = sum(text, []) # merge list-of-lists into a single list
    return text

def tokenize(text, eof_flag=True):
    ## Build words dictionary count
    words = Counter() #Dictionary that will map a word to the number of times it appeared in all the training sentences
    if eof_flag:
      for i, sentence in enumerate(text):
          text[i] = [] #The sentences will be stored as a list of words/tokens
          for word in nltk.word_tokenize(sentence): #Tokenizing the words
              word = word.lower().strip()
              words.update([word])
              text[i].append(word)
          text[i].insert(0,'<s>')
          text[i].append('</s>')
    else:
      for i, sentence in enumerate(text):
          text[i] = [] #The sentences will be stored as a list of words/tokens
          for word in sentence.split(): #Tokenizing the words
              word = word.lower().strip()
              words.update([word])
              text[i].append(word)
    return text, words


def encode_glove(words, eof_flag=True, file='glove_6B_100d.txt'):#glove.twitter.27B.100d
    ## Create glove structure from file
    glove = {}
    with open(PATH+file) as f: 
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            glove[word] = vector
    if eof_flag:
        to_add = ['_PAD','_UNK','<s>','</s>']
    else:
        to_add = ['_PAD','_UNK']
    for el in to_add:
        glove[el] = np.random.normal(scale=0.6, size=(100, )) #np.random.rand(100)
    ## Create embedding matrix for nn (based on the words in the vocab)
    matrix_len = len(words)
    weights_matrix = np.zeros((matrix_len, 100))
    words_found = 0
    for i, word in enumerate(words):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(100, )) #np.random.rand(100)
    print("Word found in glove: {}/{}".format(words_found,matrix_len))
    return weights_matrix, glove

def encode(text, words, limit_appear=True, eof_flag=True):
    ## Removing the words that only appear once
    if limit_appear:
        words = {k:v for k,v in words.items() if v>1}
    else:
        words = {k:v for k,v in words.items() if v>0}
    words = sorted(words, key=words.get, reverse=True) # Sorting the words according to the number of appearances, with the most common word being first
    if eof_flag:
        to_add = ['_PAD','_UNK','<s>','</s>']
    else:
        to_add = ['_PAD','_UNK']
    words = to_add + words # Adding padding and unknown to our vocabulary so that they will be assigned an index
    word2idx = {o:i for i,o in enumerate(words)} # Dictionaries to store the word to index mappings and vice versa
    idx2word = {i:o for i,o in enumerate(words)}
    ## Encode text
    for i, sequence in enumerate(text):
        text[i] = [word2idx[word] if word in word2idx else word2idx['_UNK'] for word in sequence]
    return text, word2idx, idx2word, words

def pad_sequences(text, pad_token, max_len_seq):
    for i, sequence in enumerate(text): 
        if len(sequence) <= max_len_seq:
            for j in range(len(sequence), max_len_seq+1):
                text[i].insert(0,pad_token) #text[i].insert(0,0) #.append(0) # add to the list
    return text

def create_in_out(text):
    x = []
    y = []
    for seq in text:
      x.append([w for w in seq[:-1]])
      y.append([w for w in seq[1:]])
    text_train = np.asarray([np.asarray(s) for s in x])
    label_train = np.asarray([np.asarray(s) for s in y])
    return text_train, label_train


## Processing group
def test_processing(raw_text, max_len_seq, test_encode):
    text, words = tokenize(raw_text, eof_flag=False) ## Tokenize
    max_len_seq = len(max(text,key=len))
    print("\n---- Result after tokenization")
    print("Length words: {}".format(len(words)))
    print("words: {}".format(words))
    print("text: {}".format(text[0:2]))
    '''text = split_sequences(text, max_len_seq) ## Split in sequences
    print("\n---- Result after reorganizing by sequences")
    print("text: {}".format(text[0:2]))
    print("len(text): {}".format(len(text)))
    print("max_len_seq: {}".format(max_len_seq))'''
    for i, sent in enumerate(text):
        text[i] = [test_encode[word] if word in test_encode else test_encode['_UNK'] for word in sent]
    print("\n---- Result after encoding")
    print("text: {}".format(text[0:2]))
    text = pad_sequences(text, 0, max_len_seq)
    print("\n---- Result after padding")
    print("text_train: {}".format(text[0:2]))
    text_test, label_test = create_in_out(text) ## Create input output for the model
    print("\n---- Result after creating x and y")
    print("text_train: {}".format(text_test[0:2]))
    print("label_train: {}".format(label_test[0:2]))

    return text_test, label_test

def processing(raw_text, max_len_seq, eof_flag, glove_flag):
    text, words = tokenize(raw_text, eof_flag=eof_flag) ## Tokenize
    print("\n---- Result after tokenization")
    print("Length words: {}".format(len(words)))
    print("words: {}".format(words))
    print("text: {}".format(text[0:2]))
    text = split_sequences(text, max_len_seq) ## Split in sequences
    print("\n---- Result after reorganizing by sequences")
    print("text: {}".format(text[0:2]))
    print("len(text): {}".format(len(text)))
    print("max_len_seq: {}".format(max_len_seq))
    if glove_flag:
        print('ok')
        text, word2idx, idx2word, words = encode(text, words, limit_appear=False, eof_flag=eof_flag) ## Encode
        weights_matrix, glove = encode_glove(words, eof_flag=eof_flag) #create weight matrix
        encod = word2idx
        unencod = idx2word
        vocab_size = len(weights_matrix)
    else:
        text, word2idx, idx2word, words = encode(text, words, eof_flag=eof_flag) ## Encode
        encod = word2idx
        unencod = idx2word
        weights_matrix = []
        vocab_size = len(encod)
    print("\n---- Result after encoding")
    print("text_train: {}".format(text[0:2]))
    text = pad_sequences(text, 0, max_len_seq)
    print("\n---- Result after padding")
    print("text_train: {}".format(text[0:2]))
    text_train, label_train = create_in_out(text) ## Create input output for the model
    print("\n---- Result after creating x and y")
    print("text_train: {}".format(text_train[0:2]))
    print("label_train: {}".format(label_train[0:2]))
    print("vocab size: {}".format(vocab_size))
    return text_train, label_train, vocab_size, encod, unencod, weights_matrix, words

In [4]:
class WordLSTM(nn.Module):
    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001, mode='', weights_matrix=[]):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        if mode=='glove':
            num_embeddings=weights_matrix.shape[0]
            embedding_dim=weights_matrix.shape[1]
            ## init lookup table (int index --> vector embedding)
            self.emb_layer = nn.Embedding(num_embeddings, embedding_dim)
            ## load the content into the lookup table
            self.emb_layer.weight = nn.Parameter(torch.tensor(weights_matrix,dtype=torch.float32))
            #self.emb_layer.load_state_dict({'weight': weights_matrix})
            self.emb_layer.weight.requires_grad = False
        else:
            embedding_dim = 200
            self.emb_layer = nn.Embedding(vocab_size, embedding_dim)
        print('vocab_size ',vocab_size)
        self.lstm = nn.LSTM(embedding_dim, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, vocab_size)  # output: 1 out for each word in the vocab

    def forward(self, x, hidden):
        ## pass input through embedding 
        embedded = self.emb_layer(x)#torch.LongTensor(x)) 
        '''embedding_vec=embedding(torch.LongTensor(text_token))
        print(embedding)
        print(embedding_vec.shape)'''
        lstm_output, hidden = self.lstm(embedded, hidden) ## Get the outputs and the new hidden state from the lstm
        out = self.dropout(lstm_output) ## pass through a dropout layer
        out = out.reshape(-1, self.n_hidden) 
        out = self.fc(out)
        return out, hidden # return the final output and the hidden state
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        return hidden

# Build batches
def get_batches(arr_x, arr_y, batch_size):
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
      x = arr_x[prv:n,:]
      y = arr_y[prv:n,:]
      prv = n
      yield x, y

# predict next token
def predict(net, tkn, encod, unencod, h=None):      
    # tensor inputs
    x = np.array([[encod[tkn]]])
    inputs = torch.from_numpy(x)
    # push to GPU
    inputs = inputs.cuda()
    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model
    out, h = net(inputs, h)
    # get the token probabilities
    p = F.softmax(out, dim=1).data
    p = p.cpu()
    p = p.numpy()
    p = p.reshape(p.shape[1],)
    # get indices of top 3 values
    top_n_idx = p.argsort()[-3:][::-1]
    # randomly select one of the three indices
    sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]
    # return the encoded value of the predicted char and the hidden state
    return unencod[sampled_token_index], h
# function to generate text
def sample(net, size, encod, unencod, prime='<s>'):  
    # push to GPU
    net.cuda()
    net.eval()
    # batch size is 1
    h = net.init_hidden(1)
    toks = prime.split()
    # predict next token
    for t in prime.split():
      token, h = predict(net, t, encod, unencod, h)
    toks.append(token)
    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], encod, unencod, h)
        toks.append(token)
        if token == '</s>':
            break
    return ' '.join(toks)

def train(net, x, y, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    # loss
    criterion = nn.CrossEntropyLoss()
    # push model to GPU
    net.cuda()
    net.train()
    for e in range(0, epochs):
        counter = 0
        # initialize hidden state
        h = net.init_hidden(batch_size)
        for x, y in get_batches(text_train, label_train, batch_size):
            counter+= 1
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            # push tensors to GPU
            #inputs, targets = inputs.cuda(), targets.cuda()
            inputs, targets = inputs.cuda().long(), targets.cuda().long()
            # detach hidden states
            h = tuple([each.data for each in h])
            # zero accumulated gradients
            net.zero_grad()
            # get the output from the model
            output, h = net(inputs, h)
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))
            # back-propagate error
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            # update weigths
            opt.step()            
            #if counter % print_every == 0:
              #print("Epoch: {}/{}...".format(e+1, epochs),"Step: {}...".format(counter))
        print("Epoch: {}/{}...".format(e+1, epochs), "Loss: {:.6f}...".format(loss.item()))

import math
criterion = nn.CrossEntropyLoss() 
def getloss(net, tkn, tkn2, h=None):      
  # tensor inputs
  with torch.no_grad():
    x = np.array([[encod[tkn]]])
    y = np.array([[encod[tkn2]]])
    inputs = torch.from_numpy(x)
    target = torch.from_numpy(y)
    # push to GPU
    inputs = inputs.cuda()
    target = target.cuda()
    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model
    out, h = net(inputs, h)
    criterion = nn.CrossEntropyLoss() 
    test_loss = criterion(out, target.view(-1))
  return test_loss, h
def sent_perp(sent, net):
    l = sent.split(' ')
    log_prob = 0
    h = net.init_hidden(1)	
    #for word in l[0:-1]:
    for i in range(len(l)-1):
      word = l[i]
      word2 = l[i+1]
      if word not in words:
        word = '_UNK'
      if word2 not in words:
        word2 = '_UNK'
      test_loss, h = getloss(net, word, word2, h)
      log_prob = log_prob + test_loss
    return log_prob 

def test(net, text_test):
    with open(PATH + text_test) as f:#'test_1.txt'
      test_data = f.read()

    test_data = test_data.split('\n')
    N = 0
    P = 0
    for sent in test_data:
      N = N + len(sent.split(' '))
      P = P + sent_perp(sent, net)
    temp = P/N
    perplexity = torch.exp(temp)
    return temp, perplexity

In [None]:
## LOAD DATA
with open(PATH+'prideAndPrejudice.txt') as f:
    raw_text = f.readlines()
with open(PATH+'test_1.txt') as f:
    raw_test_1 = f.readlines()
with open(PATH+'test_2.txt') as f:
    raw_test_2 = f.readlines()
with open(PATH+'tweet.txt') as f:
    raw_tweet = f.readlines()
#raw_text = [re.sub("[^a-zA-Z\.\,\!\?\' ]", "", i) for i in raw_text]
#raw_tweet = [re.sub("[^a-zA-Z\.\,\!\?\'\<\/\> ]", "", i) for i in raw_tweet]

#text = text[0:2]
print("\n---- Preliminary text analysis")
print("Length text: {}".format(len(raw_text)))
print("Length test1: {}".format(len(raw_test_1)))
print("Length test2: {}".format(len(raw_test_2)))
print("Length raw_tweet: {}".format(len(raw_tweet)))


---- Preliminary text analysis
Length text: 2063
Length test1: 924
Length test2: 1000
Length raw_tweet: 6000


In [None]:
## Question 1
max_len_seq = 5
text_train, label_train, vocab_size, encod, unencod, _, words= processing(raw_text.copy(), max_len_seq, True, False) #text_train, label_train, encod, weights_matrix


---- Result after tokenization
Length words: 6553
words: Counter({',': 9129, '.': 5047, 'the': 4331, 'to': 4137, 'of': 3608, 'and': 3577, 'her': 2203, 'i': 2064, 'a': 1947, 'in': 1865, 'was': 1844, '``': 1785, "''": 1747, 'she': 1695, 'that': 1540, ';': 1538, 'it': 1535, 'not': 1533, 'you': 1326, 'he': 1324, 'his': 1258, 'be': 1240, 'as': 1179, 'had': 1172, 'for': 1058, 'with': 1051, 'but': 1002, 'is': 860, 'have': 840, 'at': 788, 'mr.': 764, 'him': 753, 'on': 716, 'my': 703, "'s": 648, 'by': 635, 'elizabeth': 635, 'all': 621, 'they': 597, 'so': 589, 'were': 563, 'which': 538, 'could': 525, 'been': 515, '!': 499, 'from': 493, 'no': 490, 'very': 485, 'what': 478, 'would': 468, '?': 462, 'this': 441, 'their': 441, 'your': 440, 'me': 429, 'them': 429, 'darcy': 417, 'will': 404, 'said': 401, '--': 394, 'such': 386, 'when': 373, 'an': 354, 'if': 350, 'there': 348, 'do': 347, 'mrs.': 341, 'are': 338, 'much': 326, 'bennet': 323, 'more': 323, 'can': 322, 'am': 316, 'must': 305, 'bingley': 305

In [None]:
## Instantiate the model
net = WordLSTM().cuda() # push the model to GPU (avoid it if you are not using the GPU)
print(net)
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000) ## Train the model
torch.save(net, PATH+'model_save_5.pt') ## Save

vocab_size  3970
WordLSTM(
  (emb_layer): Embedding(3970, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=3970, bias=True)
)
Epoch: 1/20... Loss: 5.469995...
Epoch: 2/20... Loss: 5.164315...
Epoch: 3/20... Loss: 4.776531...
Epoch: 4/20... Loss: 4.660214...
Epoch: 5/20... Loss: 4.441027...
Epoch: 6/20... Loss: 4.295032...
Epoch: 7/20... Loss: 4.240712...
Epoch: 8/20... Loss: 4.240494...
Epoch: 9/20... Loss: 4.234482...
Epoch: 10/20... Loss: 3.978645...
Epoch: 11/20... Loss: 3.991458...
Epoch: 12/20... Loss: 4.021789...
Epoch: 13/20... Loss: 4.052073...
Epoch: 14/20... Loss: 3.805040...
Epoch: 15/20... Loss: 3.830575...
Epoch: 16/20... Loss: 3.940037...
Epoch: 17/20... Loss: 3.819937...
Epoch: 18/20... Loss: 3.743596...
Epoch: 19/20... Loss: 3.682145...
Epoch: 20/20... Loss: 3.835780...


In [None]:
net = torch.load(PATH+'model_save_5.pt')
net.eval()
## Generate words 
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> elizabeth , who came on stairs with the familiarity and _UNK , and _UNK of her family ; she was necessarily drawn from her father 's instructions ; for when she was in the room . he bore him , to the _UNK and _UNK , _UNK of her sisters ' , she was in a _UNK , and the others , she had a little less _UNK than the subject of the match , she was _UNK in her marriage . she could not be vulgar . '' and she could have nothing to be sure . he was the _UNK of his wife and elizabeth had been prevailed to town . he was the _UNK of the persons who was a very large woman . the evening were _UNK in her own room , she had the _UNK _UNK of their being not help her , and the evening were in her way to be _UNK . she was not to speak to jane , and the evening were spent from the pursuit of courtship much of her earnest . she was the _UNK , and she had the highest gratitude to the world ; for she had a little _UNK from her husband , she was still acquainted in the room ; she had 

In [None]:
## Question 2
max_len_seq = 0
net = torch.load(PATH+'model_save_5.pt')
net.eval()
test_loss, perplexity = test(net, 'test_1.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  5.04 | test perplexity   154.58


In [None]:
## Question 3
max_len_seq = 25
text_train, label_train, vocab_size, encod, unencod, _, words = processing(raw_text.copy(), max_len_seq, True, False)

In [None]:
net = WordLSTM().cuda() # push the model to GPU (avoid it if you are not using the GPU)
print(net)
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000) ## Train the model
torch.save(net, PATH+'model_save_25.pt') ## Save

vocab_size  3970
WordLSTM(
  (emb_layer): Embedding(3970, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=3970, bias=True)
)
Epoch: 1/20... Loss: 5.165695...
Epoch: 2/20... Loss: 4.619696...
Epoch: 3/20... Loss: 4.176872...
Epoch: 4/20... Loss: 4.002927...
Epoch: 5/20... Loss: 3.786731...
Epoch: 6/20... Loss: 3.692864...
Epoch: 7/20... Loss: 3.560080...
Epoch: 8/20... Loss: 3.469120...
Epoch: 9/20... Loss: 3.320129...
Epoch: 10/20... Loss: 3.286344...
Epoch: 11/20... Loss: 3.186706...
Epoch: 12/20... Loss: 3.088893...
Epoch: 13/20... Loss: 3.017620...
Epoch: 14/20... Loss: 2.962198...
Epoch: 15/20... Loss: 2.852405...
Epoch: 16/20... Loss: 2.858321...
Epoch: 17/20... Loss: 2.886101...
Epoch: 18/20... Loss: 2.827587...
Epoch: 19/20... Loss: 2.734997...
Epoch: 20/20... Loss: 2.692246...


In [None]:
net = torch.load(PATH+'model_save_25.pt')
net.eval()
## Generate words 
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> but i have no reason , but not so many ! -- i am happier even . you ought to have gone on . but i have not observed , by your imagination , you know , that he had always courage for your kindness for the kindness which his affection to the lakes ; but as he was the last , she would be always _UNK by his affection to the efforts and spending all the country ; when he wished for the match . she had always fallen on his affection with _his_ promise to see with her husband . he bore the _UNK , _UNK _UNK . he _UNK , and said that they was going in her private room ; she had always seen it . the former wish of his life , however , was extreme . mrs. gardiner had never seen him in a chaise . she was always of his good temper . she could conquer the place . the ladies were post . she had always seen her joy . she was in her power for an assistance which her estimation was great than a _UNK , _UNK woman . he had been a _UNK in the north which must be _UNK in her marria

In [None]:
## Question 4
net = torch.load(PATH+'model_save_25.pt')
net.eval()
test_loss, perplexity = test(net, 'test_1.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  5.50 | test perplexity   243.61


In [None]:
## Question 5
net = torch.load(PATH+'model_save_5.pt') # best model
net.eval()
test_loss, perplexity = test(net, 'test_2.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  6.22 | test perplexity   504.26


In [None]:
## Question 6
max_len_seq = 5 # best model
text_train, label_train, vocab_size, encod, unencod, weights_matrix, words = processing(raw_text.copy(), max_len_seq, True, True)


---- Result after tokenization
Length words: 6553
words: Counter({',': 9129, '.': 5047, 'the': 4331, 'to': 4137, 'of': 3608, 'and': 3577, 'her': 2203, 'i': 2064, 'a': 1947, 'in': 1865, 'was': 1844, '``': 1785, "''": 1747, 'she': 1695, 'that': 1540, ';': 1538, 'it': 1535, 'not': 1533, 'you': 1326, 'he': 1324, 'his': 1258, 'be': 1240, 'as': 1179, 'had': 1172, 'for': 1058, 'with': 1051, 'but': 1002, 'is': 860, 'have': 840, 'at': 788, 'mr.': 764, 'him': 753, 'on': 716, 'my': 703, "'s": 648, 'by': 635, 'elizabeth': 635, 'all': 621, 'they': 597, 'so': 589, 'were': 563, 'which': 538, 'could': 525, 'been': 515, '!': 499, 'from': 493, 'no': 490, 'very': 485, 'what': 478, 'would': 468, '?': 462, 'this': 441, 'their': 441, 'your': 440, 'me': 429, 'them': 429, 'darcy': 417, 'will': 404, 'said': 401, '--': 394, 'such': 386, 'when': 373, 'an': 354, 'if': 350, 'there': 348, 'do': 347, 'mrs.': 341, 'are': 338, 'much': 326, 'bennet': 323, 'more': 323, 'can': 322, 'am': 316, 'must': 305, 'bingley': 305

In [None]:
net = WordLSTM(mode='glove', weights_matrix=weights_matrix).cuda()
print(net)
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000)
torch.save(net, PATH+'model_glove_5.pt')

vocab_size  6557
WordLSTM(
  (emb_layer): Embedding(6557, 100)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=6557, bias=True)
)
Epoch: 1/20... Loss: 5.633097...
Epoch: 2/20... Loss: 5.360171...
Epoch: 3/20... Loss: 4.998235...
Epoch: 4/20... Loss: 4.839860...
Epoch: 5/20... Loss: 4.524868...
Epoch: 6/20... Loss: 4.585644...
Epoch: 7/20... Loss: 4.572575...
Epoch: 8/20... Loss: 4.415634...
Epoch: 9/20... Loss: 4.346293...
Epoch: 10/20... Loss: 4.271797...
Epoch: 11/20... Loss: 4.151351...
Epoch: 12/20... Loss: 4.199869...
Epoch: 13/20... Loss: 4.229291...
Epoch: 14/20... Loss: 4.095868...
Epoch: 15/20... Loss: 4.047168...
Epoch: 16/20... Loss: 4.127080...
Epoch: 17/20... Loss: 4.079382...
Epoch: 18/20... Loss: 4.009870...
Epoch: 19/20... Loss: 3.988736...
Epoch: 20/20... Loss: 3.884956...


In [None]:
net = torch.load(PATH+'model_glove_5.pt')
net.eval()
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> you will have hated him to be done to be done , i shall not have the grief of his forbearance , she could not be in her power , by the practice of his friend . she could not help him a hint of his friend , he had no longer kept her , she could not be prevailed by him . she was not so happy , but the comfort was not so little , she was in the bingleys ; for the next of her daughters ; for she could not be prevailed , and the evening , he could have been more welcome to her , and her manners were a little kinder , tax to her husband ; but , as she could have been able for her . she could have no compassion for the marriage , and the other of them ; but elizabeth , in the room ; but it would not have relished her to the woods , and the others , and the world of her letter was now gratified to get a twelvemonth to the world ; and though she did not listen , was still relieved with the extreme of his wife , and she was not in a humour to be so happy as to make him 

In [None]:
## Question 7
net = torch.load(PATH+'model_glove_5.pt') # best model
net.eval()
test_loss, perplexity = test(net, 'test_1.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  6.54 | test perplexity   689.15


In [None]:
## Question 8 TWEET
max_len_seq = 5
text_train, label_train, vocab_size, encod, unencod, _, words = processing(raw_tweet.copy(), max_len_seq, False, False)
print(encod)


---- Result after tokenization
Length words: 12630
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys', 'r', 'the', 'best', '!', '!', '</s>'], ['<s>', 'i', 'm', 'meeting', 'up', 'with', 'one', 'of', 'my', 'besties', 'tonight', '!', 'ca', 'nt', 'wait', '!', '!', '-', 'girl', 'talk', '!', '!', '</s>']]

---- Result after reorganizing by sequences
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys'], ['i', 'love', '@health4uandpets', 'u', 'guys', 'r']]
len(text): 78861
max_len_seq: 5

---- Result after encoding
text_train: [[2, 4, 66, 1, 86, 241], [4, 66, 1, 86, 241, 329]]

---- Result after padding
text_train: [[2, 4, 66, 1, 86, 241], [4, 66, 1, 86, 241, 329]]

---- Result after creating x and y
text_train: [[  2   4  66   1  86]
 [  4  66   1  86 241]]
label_train: [[  4  66   1  86 241]
 [ 66   1  86 241 329]]
vocab size: 3997
{'_PAD': 0, '_UNK': 1, '<s>': 2, '</s>': 3, 'i': 4, '!': 5, '.': 6, '"': 7, 'to': 8, 'the': 9, ',': 10, 'a': 11, 'it': 12, 'and': 13, 'you': 14,

In [None]:
net = WordLSTM().cuda()
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000)
torch.save(net, PATH+'model_tweet_5.pt')

vocab_size  3997
Epoch: 1/20... Loss: 5.252578...
Epoch: 2/20... Loss: 4.897115...
Epoch: 3/20... Loss: 4.688882...
Epoch: 4/20... Loss: 4.531646...
Epoch: 5/20... Loss: 4.405295...
Epoch: 6/20... Loss: 4.388572...
Epoch: 7/20... Loss: 4.270437...
Epoch: 8/20... Loss: 4.183468...
Epoch: 9/20... Loss: 4.147174...
Epoch: 10/20... Loss: 4.200684...
Epoch: 11/20... Loss: 4.059154...
Epoch: 12/20... Loss: 3.903972...
Epoch: 13/20... Loss: 3.769385...
Epoch: 14/20... Loss: 3.831891...
Epoch: 15/20... Loss: 3.742058...
Epoch: 16/20... Loss: 3.658604...
Epoch: 17/20... Loss: 3.645605...
Epoch: 18/20... Loss: 3.737691...
Epoch: 19/20... Loss: 3.544748...
Epoch: 20/20... Loss: 3.611055...


In [None]:
net = torch.load(PATH+'model_tweet_5.pt')
net.eval()
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> _UNK _ _UNK , but still not a closet i do not want to get @aubreyoday to be _UNK _UNK , _UNK _UNK _UNK . _UNK arnt * _UNK ) * i do nt get to get to see _UNK . _UNK . nyte feeling a bit _UNK , but i do n't have any idea how a homo . </s>
Example 1 generated: 
<s> " i hate _UNK . </s>
Example 2 generated: 
<s> _UNK _UNK i 'm not going back 2 skool i m so jealous ! i have n't eaten at my _UNK _UNK _UNK . i m not _UNK . _UNK . i m pretty _UNK , but i m not not a closet break at work . i m going back to work tomorrow , i m going to sleep . </s>
Example 3 generated: 
<s> " _UNK oh hun ! ! i 'm a bit _UNK , i do not get a virus ! i have to go back for a sleep </s>
Example 4 generated: 
<s> i 'm not going to be _UNK by the _UNK of my _UNK . i m not _UNK , _UNK . i m pretty sad , i 'm a _UNK _UNK _UNK , i ca n't sleep . " </s>
Example 5 generated: 
<s> i m so tired . i have n't slept awake to go to school tomorrow . </s>
Example 6 generated: 
<s> " just _UNK _UNK _UNK 

In [None]:
## Question 9
net = torch.load(PATH+'model_tweet_5.pt')
net.eval()
test_loss, perplexity = test(net, 'test_2.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  4.78 | test perplexity   119.45


In [None]:
## Question 10
max_len_seq = 15
text_train, label_train, vocab_size, encod, unencod, _, words = processing(raw_tweet.copy(), max_len_seq, False, False)
print(encod)


---- Result after tokenization
Length words: 12630
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys', 'r', 'the', 'best', '!', '!', '</s>'], ['<s>', 'i', 'm', 'meeting', 'up', 'with', 'one', 'of', 'my', 'besties', 'tonight', '!', 'ca', 'nt', 'wait', '!', '!', '-', 'girl', 'talk', '!', '!', '</s>']]

---- Result after reorganizing by sequences
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys', 'r', 'the', 'best', '!', '!', '</s>'], ['<s>', 'i', 'm', 'meeting', 'up', 'with', 'one', 'of', 'my', 'besties', 'tonight', '!', 'ca', 'nt', 'wait', '!']]
len(text): 33931
max_len_seq: 15

---- Result after encoding
text_train: [[2, 4, 66, 1, 86, 241, 329, 9, 152, 5, 5, 3], [2, 4, 84, 606, 43, 38, 67, 21, 15, 1, 214, 5, 63, 65, 159, 5]]

---- Result after padding
text_train: [[0, 0, 0, 0, 2, 4, 66, 1, 86, 241, 329, 9, 152, 5, 5, 3], [2, 4, 84, 606, 43, 38, 67, 21, 15, 1, 214, 5, 63, 65, 159, 5]]

---- Result after creating x and y
text_train: [[  0   0   0   0   2   4  66   1  

In [None]:
net = WordLSTM().cuda()
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000)
torch.save(net, PATH+'model_tweet_15.pt')

vocab_size  3997
Epoch: 1/20... Loss: 5.853838...
Epoch: 2/20... Loss: 5.489911...
Epoch: 3/20... Loss: 5.182996...
Epoch: 4/20... Loss: 5.010383...
Epoch: 5/20... Loss: 4.774088...
Epoch: 6/20... Loss: 4.627366...
Epoch: 7/20... Loss: 4.550300...
Epoch: 8/20... Loss: 4.399261...
Epoch: 9/20... Loss: 4.253626...
Epoch: 10/20... Loss: 4.215551...
Epoch: 11/20... Loss: 4.108526...
Epoch: 12/20... Loss: 3.989191...
Epoch: 13/20... Loss: 3.874573...
Epoch: 14/20... Loss: 3.751858...
Epoch: 15/20... Loss: 3.615877...
Epoch: 16/20... Loss: 3.593357...
Epoch: 17/20... Loss: 3.496465...
Epoch: 18/20... Loss: 3.368101...
Epoch: 19/20... Loss: 3.373318...
Epoch: 20/20... Loss: 3.432022...


In [None]:
net = torch.load(PATH+'model_tweet_15.pt')
net.eval()
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> " my _UNK are _UNK , but it is n't a good day . i m scared , i m going for a good day , and i 'm not thinking _UNK i have no _UNK skin i 'm not going for a mini hour or a nice white i have to go back tomorrow . i m scared . hulk get mad . i have n't slept a _UNK - sleep ! ! _UNK . _UNK _UNK , _UNK _UNK _UNK . i 'm _UNK _UNK and _UNK _UNK , but do not wanna eat out , i do nt have enough fun of _UNK and i have a _UNK of my house , and now i 'm going with my _UNK . </s>
Example 1 generated: 
<s> " i 'm so bad , i 'm scared i ca nt find any this _UNK & lt;3 _UNK ... i 'm scared you were a good night , but i have to get it all over i m _UNK . _UNK _UNK . i 'm going for my house . emotionally in _UNK 's 70 _UNK . _UNK , i m _UNK . my chest is _UNK and it shows up _UNK the _UNK trash to go out of sex wine in _UNK . _UNK i have no idea i ca n't afford to go back to work , but i 'm not thinking by my house .... its like the hills , and now i 'm not too bad ! ! i have t

In [None]:
## Question 11
net = torch.load(PATH+'model_tweet_15.pt')
net.eval()
test_loss, perplexity = test(net, 'test_2.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  5.05 | test perplexity   155.99


In [None]:
## Question 12
net = torch.load(PATH+'model_tweet_5.pt') # best model
net.eval()
test_loss, perplexity = test(net, 'test_1.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  5.61 | test perplexity   273.83


In [None]:
## Question 13
max_len_seq =  5 # best model
text_train, label_train, vocab_size, encod, unencod, weights_matrix, words = processing(raw_tweet.copy(), max_len_seq, False, True)


---- Result after tokenization
Length words: 12630
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys', 'r', 'the', 'best', '!', '!', '</s>'], ['<s>', 'i', 'm', 'meeting', 'up', 'with', 'one', 'of', 'my', 'besties', 'tonight', '!', 'ca', 'nt', 'wait', '!', '!', '-', 'girl', 'talk', '!', '!', '</s>']]

---- Result after reorganizing by sequences
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys'], ['i', 'love', '@health4uandpets', 'u', 'guys', 'r']]
len(text): 78861
max_len_seq: 5
ok
Word found in glove: 8038/12632

---- Result after encoding
text_train: [[2, 4, 66, 3997, 86, 241], [4, 66, 3997, 86, 241, 329]]

---- Result after padding
text_train: [[2, 4, 66, 3997, 86, 241], [4, 66, 3997, 86, 241, 329]]

---- Result after creating x and y
text_train: [[   2    4   66 3997   86]
 [   4   66 3997   86  241]]
label_train: [[   4   66 3997   86  241]
 [  66 3997   86  241  329]]
vocab size: 12632


In [None]:
net = WordLSTM(mode='glove', weights_matrix=weights_matrix).cuda()
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000)
torch.save(net, PATH+'model_tweet_glove_5.pt')

vocab_size  12632
Epoch: 1/20... Loss: 5.974874...
Epoch: 2/20... Loss: 5.545497...
Epoch: 3/20... Loss: 5.121936...
Epoch: 4/20... Loss: 4.787803...
Epoch: 5/20... Loss: 4.673324...
Epoch: 6/20... Loss: 4.471931...
Epoch: 7/20... Loss: 4.407890...
Epoch: 8/20... Loss: 4.326817...
Epoch: 9/20... Loss: 4.154729...
Epoch: 10/20... Loss: 4.160047...
Epoch: 11/20... Loss: 4.273318...
Epoch: 12/20... Loss: 4.265508...
Epoch: 13/20... Loss: 4.197412...
Epoch: 14/20... Loss: 4.208659...
Epoch: 15/20... Loss: 4.113024...
Epoch: 16/20... Loss: 4.008296...
Epoch: 17/20... Loss: 3.929439...
Epoch: 18/20... Loss: 3.944991...
Epoch: 19/20... Loss: 3.946811...
Epoch: 20/20... Loss: 3.855349...


In [None]:
net = torch.load(PATH+'model_tweet_glove_5.pt')
net.eval()
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> i 'm so sad . </s>
Example 1 generated: 
<s> " everybody i m so sore , and it is n't n't get any idea , and i 'm so jealous . " i 'm still going for the last of the washing of the rift valley , speeches ! i have n't seen it , and it 's a sad throat , speeches . ihad is n't n't be a long idea to go home to the house . </s>
Example 2 generated: 
<s> " everybody , it is really good ! i m so sad . </s>
Example 3 generated: 
<s> i 'm not a closet . </s>
Example 4 generated: 
<s> is going to see it ! i 'm so tired . </s>
Example 5 generated: 
<s> i m scared . " i have n't been a virus twilighter . it 's not a great idea to see it ! </s>
Example 6 generated: 
<s> is not a good idea to see it ! ! i m going to go to school tomorrow </s>
Example 7 generated: 
<s> " @darealsunisakim oh , that is n't going for the next one of my soul of the show ! i 'm so tired ! ! i 'm so sad , i have n't slept for the cake . i 'm so lucky ! i m stuck up . </s>
Example 8 generated: 
<s> 

In [None]:
## Question 14
net = torch.load(PATH+'model_tweet_glove_5.pt') # best model
net.eval()
test_loss, perplexity = test(net, 'test_2.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  7.54 | test perplexity  1879.76


In [None]:
## Question 15      GLOVETWITTER
max_len_seq =  5# best model
text_train, label_train, vocab_size, encod, unencod, weights_matrix, words = processing(raw_tweet.copy(), max_len_seq, False, True)


---- Result after tokenization
Length words: 12630
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys', 'r', 'the', 'best', '!', '!', '</s>'], ['<s>', 'i', 'm', 'meeting', 'up', 'with', 'one', 'of', 'my', 'besties', 'tonight', '!', 'ca', 'nt', 'wait', '!', '!', '-', 'girl', 'talk', '!', '!', '</s>']]

---- Result after reorganizing by sequences
text: [['<s>', 'i', 'love', '@health4uandpets', 'u', 'guys'], ['i', 'love', '@health4uandpets', 'u', 'guys', 'r']]
len(text): 78861
max_len_seq: 5
ok
Word found in glove: 8411/12632

---- Result after encoding
text_train: [[2, 4, 66, 3997, 86, 241], [4, 66, 3997, 86, 241, 329]]

---- Result after padding
text_train: [[2, 4, 66, 3997, 86, 241], [4, 66, 3997, 86, 241, 329]]

---- Result after creating x and y
text_train: [[   2    4   66 3997   86]
 [   4   66 3997   86  241]]
label_train: [[   4   66 3997   86  241]
 [  66 3997   86  241  329]]
vocab size: 12632


In [None]:
net = WordLSTM(mode='glove', weights_matrix=weights_matrix).cuda()
train(net, text_train, label_train, batch_size = 32, epochs=20, print_every=4000)
torch.save(net, PATH+'model_tweet_gloveT_5.pt')

vocab_size  12632
Epoch: 1/20... Loss: 5.840675...
Epoch: 2/20... Loss: 5.305589...
Epoch: 3/20... Loss: 5.023337...
Epoch: 4/20... Loss: 4.863178...
Epoch: 5/20... Loss: 4.762218...
Epoch: 6/20... Loss: 4.510095...
Epoch: 7/20... Loss: 4.485887...
Epoch: 8/20... Loss: 4.361248...
Epoch: 9/20... Loss: 4.194154...
Epoch: 10/20... Loss: 4.096000...
Epoch: 11/20... Loss: 4.043387...
Epoch: 12/20... Loss: 3.999984...
Epoch: 13/20... Loss: 3.885160...
Epoch: 14/20... Loss: 3.799375...
Epoch: 15/20... Loss: 3.817178...
Epoch: 16/20... Loss: 3.816291...
Epoch: 17/20... Loss: 3.941351...
Epoch: 18/20... Loss: 3.702725...
Epoch: 19/20... Loss: 3.619230...
Epoch: 20/20... Loss: 3.567654...


In [None]:
net = torch.load(PATH+'model_tweet_gloveT_5.pt')
net.eval()
for i in range(10):
    print("Example {} generated: ".format(i))
    print(sample(net, 500, encod, unencod, prime = "<s>"))

Example 0 generated: 
<s> " @neilhimself yeah i m going back to work , i 'm going back back for school tomorrow , and i m not nominated . otoh i m so lucky i 'm not nominated , but do nt want it , but i 'm f*d ; i 'm not exhausted . i m going to bed . </s>
Example 1 generated: 
<s> " @mussomitchel i know i could be the problem of a hin . </s>
Example 2 generated: 
<s> is so sad , i have a doctors weight of the house . </s>
Example 3 generated: 
<s> " @darealsunisakim yeah , i m so hungry ! " @marsgirl86 @sudeepnayak ! i m going to bed . but ca not comfort coughing . </s>
Example 4 generated: 
<s> " @mussomitchel yeah , it sucks ! ! </s>
Example 5 generated: 
<s> is going to be the last day , but it was not a long of the morning ! </s>
Example 6 generated: 
<s> i hate the boys . </s>
Example 7 generated: 
<s> " @mussomitchel oh , but do i 'm a closet twilighter , i m so sad i 'm gon na be a good morning . </s>
Example 8 generated: 
<s> i 'm so sad i m not a sad day , but i do not have t

In [None]:
## Question 16
net = torch.load(PATH+'model_tweet_gloveT_5.pt') # best model
net.eval()
test_loss, perplexity = test(net, 'test_2.txt')
print('test loss {:5.2f} | test perplexity {:8.2f}'.format(test_loss.item(), perplexity))

test loss  7.65 | test perplexity  2108.56


In [None]:
math.exp(-test_loss)

0.0004742569062105376

**PROBLEM 3**

In [6]:
import re
import csv
import nltk
import numpy as np
nltk.download('punkt')
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
GPU is available


In [19]:
data_train_complete = pd.read_csv(PATH+'sentiment_complete.csv', encoding = "ISO-8859-1", engine='python', header=None)

In [24]:
data_train_complete.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [25]:
## LOAD DATA
data_train = pd.read_csv(PATH+'sentiment-train.csv')
data_test = pd.read_csv(PATH+'sentiment-test.csv')
data_train_label = data_train.sentiment 
data_train = data_train.text
data_test_label = data_test.sentiment 
data_test = data_test.text
data_train_complete_label = data_train_complete[0]
data_train_complete = data_train_complete[5]
print("\n---- Preliminary text analysis")
print("Length data_train: {}".format(len(data_train)))
print("Length data_test: {}".format(len(data_test)))
print("Length data_complete: {}".format(len(data_train_complete)))
print("Length data_train_label: {}".format(len(data_train_label)))
print("Length data_test_label: {}".format(len(data_test_label)))
print("Length data_train_complete_label: {}".format(len(data_train_complete_label)))
print("data_train: {}".format(data_train[0]))
print("data_test: {}".format(data_test[0]))
#print("data_train_complete: {}".format(data_train_complete[0]))


---- Preliminary text analysis
Length data_train: 60000
Length data_test: 359
Length data_complete: 1600000
Length data_train_label: 60000
Length data_test_label: 359
Length data_train_complete_label: 1600000
data_train: I LOVE @Health4UandPets u guys r the best!! 
data_test: @stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.


In [None]:
# Organizing data
train_labels = [x for x in data_train_label]
train_sentences = [x for x in data_train]
test_labels = [x for x in data_test_label]
test_sentences = [x for x in data_test]
# Some simple cleaning of data
for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])
for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

print("\n---- Cleaning")
print("Length train_sentences: {}".format(len(train_sentences)))
print("Length train_labels: {}".format(len(train_labels)))
print("Length test_sentences: {}".format(len(test_sentences)))
print("Length test_labels: {}".format(len(test_labels)))
print("data_train: {}".format(train_sentences[0]))
print("data_test: {}".format(test_sentences[0]))

## Preprocessing
words = Counter() #Dictionary that will map a word to the number of times it appeared in all the training sentences
for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [] #The sentences will be stored as a list of words/tokens
    for word in nltk.word_tokenize(sentence): #Tokenizing the words
        words.update([word.lower()]) #Converting all the words to lower case
        train_sentences[i].append(word)

words = {k:v for k,v in words.items() if v>1} # Removing the words that only appear once
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD','_UNK'] + words # Adding padding and unknown to our vocabulary so that they will be assigned an index
word2idx = {o:i for i,o in enumerate(words)}# Dictionaries to store the word to index mappings and vice versa
idx2word = {i:o for i,o in enumerate(words)}
vocab_size = len(word2idx) + 1

for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [word2idx[word] if word in word2idx else word2idx['_UNK'] for word in sentence]
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(sentence)]

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

seq_len = 200 #The length that the sentences will be padded/shortened to
train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

# Converting our labels into numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

print("\n---- Preprocessing")
print("Length train_sentences: {}".format(len(train_sentences)))
print("Length train_labels: {}".format(len(train_labels)))
print("Length test_sentences: {}".format(len(test_sentences)))
print("Length test_labels: {}".format(len(test_labels)))
#print("data_train: {}".format(train_sentences[0]))
#print("data_test: {}".format(test_sentences[0]))

## Batches
import torch
from torch.utils.data import TensorDataset, DataLoader
train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))
batch_size = 400
batch_size_test = 359
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size_test)
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print(sample_x.shape, sample_y.shape)


---- Cleaning
Length train_sentences: 60000
Length train_labels: 60000
Length test_sentences: 359
Length test_labels: 359
data_train: I LOVE @Health0UandPets u guys r the best!! 
data_test: @stellargirl I loooooooovvvvvveee my Kindle0. Not that the DX is cool, but the 0 is fantastic in its own right.

---- Preprocessing
Length train_sentences: 60000
Length train_labels: 60000
Length test_sentences: 359
Length test_labels: 359
torch.Size([400, 200]) torch.Size([400])


In [16]:
import torch.nn as nn
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5, mode='lstm', weights_matrix=[]):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.mode = mode
        self.bidirectional = False
        if weights_matrix != []:
            num_embeddings=weights_matrix.shape[0]
            embedding_dim=weights_matrix.shape[1]
            self.embedding = nn.Embedding(num_embeddings, embedding_dim)
            self.embedding.weight = nn.Parameter(torch.tensor(weights_matrix,dtype=torch.float32))
            self.embedding.weight.requires_grad = False
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if self.mode=='lstm':
            self.unit = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        elif self.mode=='blstm':
            self.unit = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=True, dropout=drop_prob, batch_first=True)
            self.bidirectional = True
        elif self.mode=='gru':
            self.unit = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        elif self.mode=='bgru':
            self.unit = nn.GRU(embedding_dim, hidden_dim, n_layers, bidirectional=True, dropout=drop_prob, batch_first=True)
            self.bidirectional = True
        self.fc = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        unit_out, hidden = self.unit(embeds, hidden)
        unit_out = unit_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(unit_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if self.bidirectional:
            hidden = (weight.new(self.n_layers*2, batch_size, self.hidden_dim).zero_().to(device),
                    weight.new(self.n_layers*2, batch_size, self.hidden_dim).zero_().to(device))
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                    weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        if 'gru' in self.mode:
            hidden = hidden[0]
        return hidden

In [17]:
def train(model, train_loader, mode):
    lr=0.005
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    epochs = 5
    clip = 5
    model.train()
    for i in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            '''
            you combine the two hidden layers into a tuple. Because of this you cannot use .data. 
            For a GRU you only need one hidden layer not two. So you can either use h1 or h2. 
            You only need two for lstms because it also requires the cell state.
            '''
            if "gru" in mode:
                h = h.data
                h = h.detach()
            else:
                h = tuple([e.data for e in h])
                h = (h[0].detach(),h[1].detach())
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
        print("Epoch: {}/{}...".format(i+1, epochs), "Loss: {:.6f}...".format(loss.item()))
    return model

def test(model, test_loader, batch_size_test=359):
    test_losses = []
    num_correct = 0
    criterion = nn.BCELoss()
    h = model.init_hidden(batch_size_test)
    model.eval()
    for inputs, labels in test_loader:
        if "gru" in mode:
            h = h.data
            h = h.detach()
        else:
            h = tuple([e.data for e in h])
            h = (h[0].detach(),h[1].detach())
        inputs, labels = inputs.to(device), labels.to(device)
        #print(inputs.size())
        output, h = model(inputs, h)
        test_loss = criterion(output.squeeze(), labels.float())
        test_losses.append(test_loss.item())
        pred = torch.round(output.squeeze()) #rounds the output to 0/1
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)
    return test_losses, num_correct

In [None]:
## Question 1
mode = 'lstm'
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(19960, 400)
  (unit): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [None]:
## Train model
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_lstm.pt')

In [None]:
model = torch.load(PATH+'model_class_lstm.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.745
Test accuracy: 74.373%


In [None]:
## Question 2
mode='gru'
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(19960, 400)
  (unit): GRU(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [None]:
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_gru.pt')

In [None]:
model = torch.load(PATH+'model_class_gru.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.555
Test accuracy: 76.880%


In [None]:
## Question 3
mode='blstm'
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(19960, 400)
  (unit): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [None]:
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_blstm.pt')

Epoch: 1/5... Loss: 0.526390...
Epoch: 2/5... Loss: 0.481762...
Epoch: 3/5... Loss: 0.402982...
Epoch: 4/5... Loss: 0.356812...
Epoch: 5/5... Loss: 0.315888...


In [None]:
model = torch.load(PATH+'model_class_blstm.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.624
Test accuracy: 77.716%


In [None]:
## Question 4
mode='bgru'
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(19960, 400)
  (unit): GRU(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [None]:
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_bgru.pt')

Epoch: 1/5... Loss: 0.479517...
Epoch: 2/5... Loss: 0.439087...
Epoch: 3/5... Loss: 0.399345...
Epoch: 4/5... Loss: 0.400972...
Epoch: 5/5... Loss: 0.421475...


In [None]:
model = torch.load(PATH+'model_class_bgru.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.650
Test accuracy: 75.209%


In [35]:
## Question 5
def encode_glove(words):
    ## Create glove structure from file
    glove = {}
    with open(PATH+'glove.twitter.27B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            glove[word] = vector
    for el in ['_PAD','_UNK']:
        glove[el] = np.random.normal(scale=0.6, size=(100, )) #np.random.rand(100)
    ## Create embedding matrix for nn (based on the words in the vocab)
    matrix_len = len(words)
    weights_matrix = np.zeros((matrix_len, 100))
    words_found = 0
    for i, word in enumerate(words):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(100, )) #np.random.rand(100)
    print("Word found in glove: {}/{}".format(words_found,matrix_len))
    return weights_matrix, glove

weights_matrix, glove = encode_glove(words)
vocab_size = len(weights_matrix)

Word found in glove: 86782/247693


In [None]:
mode = 'blstm'
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode, weights_matrix=weights_matrix)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(19959, 100)
  (unit): LSTM(100, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


  


In [None]:
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_blstm_glove.pt')

Epoch: 1/5... Loss: 0.549527...
Epoch: 2/5... Loss: 0.515563...
Epoch: 3/5... Loss: 0.479801...
Epoch: 4/5... Loss: 0.476016...
Epoch: 5/5... Loss: 0.494489...


In [None]:
model = torch.load(PATH+'model_class_blstm_glove.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.452
Test accuracy: 79.666%


In [None]:
## Question 6 
mode = 'blstm'
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True)

In [None]:
# K-fold Cross Validation model evaluation
for emb, hid in zip([100, 100, 400, 400], [128, 512, 128, 512]):
  print(f'------- embedding_dim={embedding_dim}, hidden_dim={hidden_dim}')
  embedding_dim = emb
  hidden_dim = hid
  avg = 0
  for fold, (train_ids, test_ids) in enumerate(kfold.split(train_data)):
    print(f'--- FOLD {fold}')
    # Init
    model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode, weights_matrix=weights_matrix)
    model.to(device)

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    
    # Define data loaders for training and testing data in this fold
    train_sub_loader = torch.utils.data.DataLoader(train_data, batch_size=400, sampler=train_subsampler)
    test_sub_loader = torch.utils.data.DataLoader(train_data, batch_size=400, sampler=test_subsampler)
    
    model = train(model, train_sub_loader, mode)
    test_losses, num_correct = test(model, test_sub_loader, batch_size_test=400)
    test_acc = num_correct/len(test_subsampler)      
    avg += test_acc  
    #print("Test loss: {:.3f}".format(np.mean(test_losses)))
    print("Test accuracy: {:.3f}%".format(test_acc*100))
  print("Test avg fold accuracy: {:.3f}%".format((avg/5)*100))

------- embedding_dim=100, hidden_dim=128
--- FOLD 0


  


Epoch: 1/5... Loss: 0.487306...
Epoch: 2/5... Loss: 0.464413...
Epoch: 3/5... Loss: 0.516506...
Epoch: 4/5... Loss: 0.401988...
Epoch: 5/5... Loss: 0.417240...
Test accuracy: 77.758%
--- FOLD 1
Epoch: 1/5... Loss: 0.534377...
Epoch: 2/5... Loss: 0.431462...
Epoch: 3/5... Loss: 0.466414...
Epoch: 4/5... Loss: 0.446498...
Epoch: 5/5... Loss: 0.424405...
Test accuracy: 77.917%
--- FOLD 2
Epoch: 1/5... Loss: 0.560606...
Epoch: 2/5... Loss: 0.511217...
Epoch: 3/5... Loss: 0.422498...
Epoch: 4/5... Loss: 0.481465...
Epoch: 5/5... Loss: 0.399270...
Test accuracy: 77.433%
--- FOLD 3
Epoch: 1/5... Loss: 0.506984...
Epoch: 2/5... Loss: 0.505783...
Epoch: 3/5... Loss: 0.440371...
Epoch: 4/5... Loss: 0.461741...
Epoch: 5/5... Loss: 0.442439...
Test accuracy: 78.117%
--- FOLD 4
Epoch: 1/5... Loss: 0.566215...
Epoch: 2/5... Loss: 0.501004...
Epoch: 3/5... Loss: 0.457864...
Epoch: 4/5... Loss: 0.469962...
Epoch: 5/5... Loss: 0.397017...
Test accuracy: 77.775%
Test avg fold accuracy: 77.800%
------- e

In [None]:
## Question 7
mode = 'blstm'
embedding_dim = 400
hidden_dim = 128
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode, weights_matrix=weights_matrix)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(19959, 100)
  (unit): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


  


In [None]:
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_best.pt')

Epoch: 1/5... Loss: 0.500725...
Epoch: 2/5... Loss: 0.523367...
Epoch: 3/5... Loss: 0.477262...
Epoch: 4/5... Loss: 0.412091...
Epoch: 5/5... Loss: 0.440028...


In [None]:
model = torch.load(PATH+'model_class_best.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.427
Test accuracy: 80.501%


In [30]:
## Question 8
# Organizing data
train_labels_raw = [x for x in data_train_complete_label]
train_sentences_raw = [x for x in data_train_complete]
test_labels = [x for x in data_test_label]
test_sentences = [x for x in data_test]

train_labels = []
train_sentences = []
# eliminate all neutral labels
for sent, label in zip(train_sentences_raw, train_labels_raw):
    if label == 2:
      continue
    elif label == 4:
      label = 1
    train_labels.append(label)
    train_sentences.append(sent)
    

# Some simple cleaning of data
for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])
for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

print("\n---- Cleaning")
print("Length train_sentences: {}".format(len(train_sentences)))
print("Length train_labels: {}".format(len(train_labels)))
print("Length test_sentences: {}".format(len(test_sentences)))
print("Length test_labels: {}".format(len(test_labels)))
print("data_train: {}".format(train_sentences[0]))
print("data_test: {}".format(test_sentences[0]))

## Preprocessing
words = Counter() #Dictionary that will map a word to the number of times it appeared in all the training sentences
for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [] #The sentences will be stored as a list of words/tokens
    for word in nltk.word_tokenize(sentence): #Tokenizing the words
        words.update([word.lower()]) #Converting all the words to lower case
        train_sentences[i].append(word)

words = {k:v for k,v in words.items() if v>1} # Removing the words that only appear once
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD','_UNK'] + words # Adding padding and unknown to our vocabulary so that they will be assigned an index
word2idx = {o:i for i,o in enumerate(words)}# Dictionaries to store the word to index mappings and vice versa
idx2word = {i:o for i,o in enumerate(words)}
vocab_size = len(word2idx) + 1

for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [word2idx[word] if word in word2idx else word2idx['_UNK'] for word in sentence]
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(sentence)]

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

seq_len = 200 #The length that the sentences will be padded/shortened to
train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

# Converting our labels into numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

print("\n---- Preprocessing")
print("Length train_sentences: {}".format(len(train_sentences)))
print("Length train_labels: {}".format(len(train_labels)))
print("Length test_sentences: {}".format(len(test_sentences)))
print("Length test_labels: {}".format(len(test_labels)))
#print("data_train: {}".format(train_sentences[0]))
#print("data_test: {}".format(test_sentences[0]))

## Batches
import torch
from torch.utils.data import TensorDataset, DataLoader
train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))
batch_size = 400
batch_size_test = 359
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size_test)
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print(sample_x.shape, sample_y.shape)


---- Cleaning
Length train_sentences: 1600000
Length train_labels: 1600000
Length test_sentences: 359
Length test_labels: 359
data_train: @switchfoot <url>/0y0zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
data_test: @stellargirl I loooooooovvvvvveee my Kindle0. Not that the DX is cool, but the 0 is fantastic in its own right.

---- Preprocessing
Length train_sentences: 1600000
Length train_labels: 1600000
Length test_sentences: 359
Length test_labels: 359
torch.Size([400, 200]) torch.Size([400])


In [33]:

for sent, label in zip(train_sentences_raw, train_labels_raw):
    if label == 2:
      print('ok')
      break
      continue
    elif label == 4:
      label = 1

In [36]:
mode = 'blstm'
embedding_dim = 400
hidden_dim = 128
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, mode=mode, weights_matrix=weights_matrix)
model.to(device)
print(model)

  


SentimentNet(
  (embedding): Embedding(247693, 100)
  (unit): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [37]:
model = train(model, train_loader, mode)
torch.save(model, PATH+'model_class_best_complete.pt')

Epoch: 1/5... Loss: 0.424559...
Epoch: 2/5... Loss: 0.431419...


KeyboardInterrupt: ignored

In [38]:
#model = torch.load(PATH+'model_class_best_complete.pt')
test_losses, num_correct = test(model, test_loader)
test_acc = num_correct/len(test_loader.dataset)        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.355
Test accuracy: 83.565%
