In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
%cd 'drive/MyDrive/Colab Notebooks/END/Ass7'

!ls

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks/END/Ass7'
/content/drive/MyDrive/Colab Notebooks/END/Ass7
'Copy of Sentiment Analysis using LSTM RNN.ipynb'   tokenizer.pkl
 saved_weights.pt				    tokenizer_SNLP.pkl
'Stanford Sentiment Analysis.ipynb'		    tweets.csv
 stanfordSentimentTreebank


In [16]:
import os
import sys

import pandas


def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")
    
    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")
    phrase_sentiments = dictionary.join(sentiment_labels)
    
    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive", "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t") 
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits).set_index("sentence")


def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory)
    sentence_partitions = get_sentence_partitions(base_directory)

    # noinspection PyUnresolvedReferences
    data = phrase_sentiments.join(sentence_partitions, on="phrase")
    # set all the ones without split labels into train set(this particularly includes phrases)
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    print('Does any sentence not have sentiment?',pandas.isna(data["sentiment"]).value_counts())
    data["phrase"] = data["phrase"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")


base_directory, output_directory = './stanfordSentimentTreebank','./stanfordSentimentTreebank'
os.makedirs(output_directory, exist_ok=True)
for splitset, partition in partition(base_directory):
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
    # delete the split label column and save to a separate file
    del partition["splitset_label"]
    partition.to_csv(filename)

Does any sentence not have sentiment? False    239245
Name: sentiment, dtype: int64


In [17]:
# print(pandas.read_csv('stanfordSentimentTreebank/stanford-sentiment-treebank.train.csv'))
train_df = pandas.read_csv('stanfordSentimentTreebank/stanford-sentiment-treebank.train.csv', usecols=['phrase','fine'],index_col=False)
print(train_df.head())
print('train shape',train_df.shape)
print('train label count',train_df.fine.value_counts())


val_df = pandas.read_csv('stanfordSentimentTreebank/stanford-sentiment-treebank.dev.csv', usecols=['phrase','fine'],index_col=False)
print(val_df.head())
print('val shape',val_df.shape)
print('val label count',val_df.fine.value_counts())

          phrase           fine
0            ! '        neutral
1           ! ''        neutral
2         ! Alas        neutral
3    ! Brilliant  very positive
4  ! Brilliant !  very positive
train shape (236076, 2)
train label count neutral          118856
positive          49402
negative          42153
very positive     14713
very negative     10952
Name: fine, dtype: int64
                                              phrase      fine
0  ... Brian De Palma is utterly mad : cinema mad...   neutral
1  ... Designed to provide a mix of smiles and te...  negative
2  ... Mafia , rap stars and hood rats butt their...  positive
3  ... a boring parade of talking heads and techn...  negative
4  ... a fun little timewaster , helped especiall...  positive
val shape (1044, 2)
val label count negative         276
positive         259
neutral          219
very positive    158
very negative    132
Name: fine, dtype: int64


In [18]:
import random
import torch, torchtext
from torchtext import data 

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f5fc4da1f30>

In [19]:
Review = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [20]:
fields = [('phrase', Review),('fine',Label)]
fields

[('phrase', <torchtext.data.field.Field at 0x7f5ee63a4c50>),
 ('fine', <torchtext.data.field.LabelField at 0x7f5fc23e84a8>)]

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stopwords = stopwords.words("english")

def remove_stopwords(word_list):
  return [word for word in word_list if word not in stopwords]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
!pip install google-trans-new



In [24]:
import random
import google_trans_new
from google_trans_new import google_translator
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

def get_synonyms(word):
  syns = wordnet.synsets(word) 
  return syns[0].lemmas()[0].name() if syns else word
  

def random_insertion(sentence, n=2): 
    words = sentence
    if len(words)<2:
      return words
    for _ in range(n):
        new_synonym = get_synonyms(random.choice(words))
        sentence.insert(random.randrange(len(sentence)+1), new_synonym) 
    return sentence

def random_deletion(words, p=0.5): 
    if len(words) <2: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words)) 
    if len(remaining) == 0: # if not left, sample a random word
        return [random.choice(words)] 
    else:
        return remaining

def random_swap(sentence, n=5): 
    length = range(len(sentence)) 
    if len(sentence) <2:
      return sentence
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

def google_trans(sentence):
    translator = google_translator()
    # sentence = ['The dog slept on the rug']

    available_langs = list(google_trans_new.LANGUAGES.keys()) 

    trans_lang = random.choice(available_langs) 
    # print(f"Translating to {google_trans_new.LANGUAGES[trans_lang]}")
    translations = translator.translate(sentence,lang_tgt=trans_lang, lang_src='en' )[2:-3]
    # print(translations)

    translations_en_random = translator.translate(translations, lang_src=trans_lang, lang_tgt='en') 
    # print('translate',translations_en_random)
    return translations_en_random if len(translations_en_random) else sentence

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# from torchtext.data.dataset import Dataset
# class StanfordDataset(Dataset):
#     def __init__(self, isValSet_bool, augmentation_dict):
#         self.augmentation_dict = augmentation_dict

#         if isValSet_bool:
#             self.reviews_list = StanfordDataset_val
#             assert self.reviews_list
#         else:
#             self.reviews_list = StanfordDataset_train
#             assert self.reviews_list

#         print("{!r}: {} {} samples".format(
#                 self,
#                 len(self.reviews_list),
#                 "validation" if isValSet_bool else "training"
#         ))

#     def __len__(self):
#           return len(self.reviews_list)

#     def __getitem__(self, ndx):
#         review_item = self.reviews_list[ndx]
#         if self.augmentation_dict:
#             review_item_aug = doAugment(self.augmentation_dict, review_item)
#         else:
#             review_item_aug = review_item
        
#         review_item_aug[0].phrase = [Review.vocab.stoi[w] for w in review_item_aug[0].phrase]
#         # review_item_aug[0].phrase = torch.from_numpy([Review.vocab.stoi[w] for w in review_item_aug[0].phrase]).to(torch.float32)
#         review_item_aug[0].fine =  Label.vocab.stoi[review_item_aug[0].fine]
#         # review_item_aug[0].fine = torch.tensor([Label.vocab.stoi[review_item_aug[0].fine]],
#         #             dtype=torch.long,
#         #         )
#         print('review_item_aug',review_item_aug)
#         return review_item_aug[0]

# train_ds = StanfordDataset(False, augment)
# val_ds = StanfordDataset(True, augment)
# train_iterator, val_iterator = data.BucketIterator.splits((vars(train_ds)['reviews_list'], vars(val_ds)['reviews_list']), batch_size = 32, sort_key = lambda x: len(x.phrase), sort_within_batch=True, device = device)
# train_iterator = data.BucketIterator.splits(train_ds, batch_size = 32, sort_key = lambda x: len(x.reviews_list.phrase), sort_within_batch=True, device = device)
# val_iterator = data.BucketIterator.splits(val_ds, batch_size = 32, sort_key = lambda x: len(x.reviews_list.phrase), sort_within_batch=True, device = device)


In [26]:

import copy

augment={'random_insert':True, 'random_delete':True, 'random_swap':True, 'google_translate':True}

def doAugment(aug_dict, data_to_augment):
    sentence = copy.copy(data_to_augment)
    print(sentence)
    if aug_dict['random_insert']:
      if random.random() > 0.5:
        return random_insertion(sentence)
        # print('inserted sent',sentence)          
    if aug_dict['random_delete']:
      if random.random() > 0.5:
        return random_deletion(sentence)
        # print('delete',sentence)
    if aug_dict['random_swap']: 
      if random.random() > 0.5:
        return random_swap(sentence)
        # print('swap',sentence)
    if aug_dict['google_translate']: 
      if random.random() > 0.8:
        # print('whats the sentence to translate', sentence)
        return google_trans(sentence)
        # print('trans',sentence)
    return sentence
    # vars(make_copy)['phrase'] = sentence
    # print('after change',vars(data_to_augment)['phrase'], vars(make_copy)['phrase'] )
    

def createAugmentedDataset():
  dataset_train = []
  dataset_val = []
  for i in range(train_df.shape[0]):
      text = train_df.phrase.iloc[i]
      label = train_df.fine.iloc[i]
      curr = data.Example.fromlist([text, label], fields)
      without_stopwords = remove_stopwords(curr.phrase)
      if not len(without_stopwords):
        continue
      # dataset_train.append(data.Example.fromlist([without_stopwords, label], fields))
      text = doAugment(augment,without_stopwords)
      dataset_train.append(data.Example.fromlist([text, label], fields))
  
  dataset_val = [data.Example.fromlist([val_df.phrase[i],val_df.fine[i]], fields) for i in range(val_df.shape[0])] 

  return data.Dataset(dataset_train, fields), data.Dataset(dataset_val, fields)

StanfordDataset_train, StanfordDataset_val = createAugmentedDataset()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['Spike', 'Lee', "'s", 'masterful']
['Steven', 'Seagal']
['Stevenson', "'s", 'tale']
['Stevenson', "'s", 'tale', 'well']
['Stevenson', "'s", 'tale', 'well', 'earlier', 'Disney', 'efforts']
['Strangers']
['Suge', 'Knight']
['The', 'Bread', ',', 'My', 'Sweet']
['The', 'Queen', 'Damned']
['The', 'Salton', 'Sea']
['The', 'Shipping', 'News']
['The', 'Tune']
['Tosca']
['Townsend']
['Townsend', '.', 'When', 'speaks']
['Trouble']
['Truckzilla']
['Truckzilla', ',', 'cryin']
['Washington']
['Weaver', "'s", 'sensitive', 'reactions']
['Weaver', "'s", 'sensitive', 'reactions', 'make', 'two', '-', 'actor', 'master', 'class']
['Wendigo']
['Windtalkers']
['Yvan', "'s", 'rambunctious', ',', 'Jewish', 'sister', 'non', '-', 'Jew', 'husband']
['Yvan', 'Charlotte']
['`']
['`', 'issues']
['`', 'issues', "'"]
['`', '`', 'The', 'Bourne', 'Identity', "''"]
['`', '`', 'The', 'Bourne', 'Identity', "''", 'return', 'traditional', 'action', 'genre', '

In [27]:
# for i in StanfordDataset_train:
#   vars(i)['phrase'] = remove_stopwords(vars(i)['phrase'])
#   if not len( vars(i)['phrase']):
#     del i
# Review.build_vocab(StanfordDataset_train)
# Label.build_vocab(StanfordDataset_train)

In [28]:
Review.build_vocab(StanfordDataset_train, vectors = "glove.6B.100d",)
Label.build_vocab(StanfordDataset_train)
print('Size of input vocab : ', len(Review.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Review.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)


Size of input vocab :  25508
Size of label vocab :  5
Top 10 words appreared repeatedly : [(',', 90781), ("'", 44267), ('-', 35872), ('.', 35314), ("'s", 27609), ('movie', 12629), ('film', 9016), ('The', 7387), ('`', 7251), ("''", 7026)]
Labels :  defaultdict(<function _default_unk_index at 0x7f5f71df8c80>, {'neutral': 0, 'positive': 1, 'negative': 2, 'very positive': 3, 'very negative': 4})


In [29]:
train_iterator,val_iterator = data.BucketIterator.splits((StanfordDataset_train, StanfordDataset_val), batch_size = 32, sort_key = lambda x: len(x.phrase), sort_within_batch=True, device = device)


for i in train_iterator:
  print(vars(i))
  break

{'batch_size': 32, 'dataset': <torchtext.data.dataset.Dataset object at 0x7f5eda1f3908>, 'fields': dict_keys(['phrase', 'fine']), 'input_fields': ['phrase'], 'target_fields': ['fine'], 'phrase': (tensor([[   21,    10,    10,  5171,     4,  7654,    11,    17],
        [  469,   327,   113,   327,   113,  1564,   134, 12539],
        [12248,   350,   976, 10210,   795,    72,   258,   157],
        [ 6082,   222,  2828,    53,   785,  6082,    59,  1764],
        [ 1305,     6,  1305,    96,  3839,  4240,  1305,     5],
        [  595,   363,   375,     4,     4,     4,   267,  2599],
        [ 1992,   303,  1040,   511,   222,   948,  5646,     5],
        [  954,   994,    84,   994, 12466,  1058,    48,    48],
        [ 9779,    14,    29,  5281,  9779,   696,  1700,    14],
        [ 1296,    21,    19,    17,   558, 20138,     5,  1296],
        [ 9583,  9383,  9383,     4,   650,     4,   608,   574],
        [ 4730,  3196,  3881,     2,     2,    73,    60,    73],
        [   

In [30]:
import os, pickle
with open('tokenizer_SNLP.pkl', 'wb') as tokens: 
    pickle.dump(Review.vocab.stoi, tokens)

In [40]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)
        # try using nn.GRU or nn.RNN here and compare their performances
        # try bidirectional and compare their performances
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        # print(hidden.shape)
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        # print('out',dense_outputs.shape, dense_outputs[0].shape)
        # Final activation function softmax
        # ????? shouldnt you be taking dense_output[-1] here? 
        # print('check with -1 here')
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output

In [41]:
# Define hyperparameters
size_of_vocab = len(Review.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
dropout = 0.4

# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [42]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(25508, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.4)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
The model has 7,894,505 trainable parameters


In [43]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [1]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        # retrieve text and no. of words
        review, review_lengths = batch.phrase 

        # convert to 1D tensor
        predictions = model(review, review_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.fine)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.fine)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()
        # break
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [36]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            review, review_lengths = batch.phrase
            
            # convert to 1d tensor
            predictions = model(review, review_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.fine)
            acc = binary_accuracy(predictions, batch.fine)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            # break
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [63]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, val_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    print(f'Epoch:{epoch:03}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

Epoch:000
	Train Loss: 1.184 | Train Acc: 71.56%
	 Val. Loss: 1.492 |  Val. Acc: 40.68% 

Epoch:001
	Train Loss: 1.178 | Train Acc: 71.96%
	 Val. Loss: 1.487 |  Val. Acc: 41.59% 

Epoch:002
	Train Loss: 1.175 | Train Acc: 72.44%
	 Val. Loss: 1.489 |  Val. Acc: 41.31% 

Epoch:003
	Train Loss: 1.171 | Train Acc: 72.81%
	 Val. Loss: 1.495 |  Val. Acc: 40.64% 

Epoch:004
	Train Loss: 1.168 | Train Acc: 73.09%
	 Val. Loss: 1.480 |  Val. Acc: 42.29% 

Epoch:005
	Train Loss: 1.165 | Train Acc: 73.40%
	 Val. Loss: 1.487 |  Val. Acc: 41.31% 

Epoch:006
	Train Loss: 1.162 | Train Acc: 73.66%
	 Val. Loss: 1.493 |  Val. Acc: 40.49% 

Epoch:007
	Train Loss: 1.159 | Train Acc: 73.90%
	 Val. Loss: 1.503 |  Val. Acc: 39.39% 

Epoch:008
	Train Loss: 1.158 | Train Acc: 74.16%
	 Val. Loss: 1.501 |  Val. Acc: 39.73% 

Epoch:009
	Train Loss: 1.156 | Train Acc: 74.35%
	 Val. Loss: 1.498 |  Val. Acc: 40.44% 



### Observations:

1. Overfit problem is persistent and the model does not learn well.
2. Data augmentation. Any one technique is applicable to each data sample. Implementation is inefficient. 
3. RNN/LSTMs are good for sequences of the same kind, not sequences of different kind, as in this case. 
4. Recursive Neural Network or similar ideas are more suited for such a dataset.


