In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from torch.utils.data import random_split

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = list()
    for w in seq:
        try:
            idxs.append(to_ix[w])
        except:
            idxs.append(to_ix["unk"])
    return torch.tensor(idxs, dtype=torch.long)

full_dataset = pd.read_csv('/home/jeet/Downloads/work/SEM-VI/DL/Assignment/A1-Q3_Dataset-20190402T184338Z-001/A1-Q3_Dataset/mrdata.csv')

# print(random.shuffle(full_dataset))
training_data, testset = random_split(full_dataset, (int(len(full_dataset)*0.8), int(len(full_dataset))-int(len(full_dataset)*0.8)))

# training_data = full_dataset[:int(len(full_dataset)*0.8)]
# testset = full_dataset[int(len(full_dataset)*0.8):]
training_data = training_data.dataset
testset = testset.dataset
print(len(full_dataset), len(training_data), len(testset))
# training_data, testset = torch.utils.data.random_split(full_dataset, ((int(len(full_dataset)*0.8)), len(full_dataset) - int(len(full_dataset)*0.8)))
# print(training_data.Phrase)

65534 65534 65534


In [15]:
trainset = list()
for phrase, tag in zip(training_data.Phrase, training_data.Sentiment):
    trainset.append((str(phrase).split(), tag))
# print(trainset[0])

In [16]:
training_data = trainset

tages = ['negative', 'somewhat negative', 'neutral', 'somewhat positive', 'positive']

word_to_ix = {'unk':0}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
# print(word_to_ix)
tag_to_ix = {'negative':0, 'somewhat negative':1, 'neutral':2, 'somewhat positive':3, 'positive':4}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

In [17]:
class LSTMModal(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMModal, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [18]:
model = LSTMModal(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [19]:

counter=0
print("Training ....\n")
for epoch in range(5):
    print("Epoch ", epoch)
    for sentence, tags in training_data:
#         print(sentence, tags)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
#         print(sentence_in)
        targets = tags

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)
        tag_scores = torch.sum(tag_scores, dim=0)/len(sentence_in)
#         print(torch.sum(tag_scores, dim=0))
#         print(targets)
#         print(tag_scores)
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        
        loss = loss_function(tag_scores.unsqueeze(0), torch.tensor(targets).unsqueeze(0))
        loss.backward()
        optimizer.step()
        counter+=1
        if counter%1000==0:
            print("Counter ", counter)

print("Training Done....")

Training ....

Epoch  0
Counter  1000
Counter  2000
Counter  3000
Counter  4000
Counter  5000
Counter  6000
Counter  7000
Counter  8000
Counter  9000
Counter  10000
Counter  11000
Counter  12000
Counter  13000
Counter  14000
Counter  15000
Counter  16000
Counter  17000
Counter  18000
Counter  19000
Counter  20000
Counter  21000
Counter  22000
Counter  23000
Counter  24000
Counter  25000
Counter  26000
Counter  27000
Counter  28000
Counter  29000
Counter  30000
Counter  31000
Counter  32000
Counter  33000
Counter  34000
Counter  35000
Counter  36000
Counter  37000
Counter  38000
Counter  39000
Counter  40000
Counter  41000
Counter  42000
Counter  43000
Counter  44000
Counter  45000
Counter  46000
Counter  47000
Counter  48000
Counter  49000
Counter  50000
Counter  51000
Counter  52000
Counter  53000
Counter  54000
Counter  55000
Counter  56000
Counter  57000
Counter  58000
Counter  59000
Counter  60000
Counter  61000
Counter  62000
Counter  63000
Counter  64000
Counter  65000
Epoch  1
C

In [21]:
# See what the scores are after training
total = 0
correct = 0
print("Testing ....\n")
with torch.no_grad():
    for phrase, tag in zip(testset.Phrase, testset.Sentiment):
#         print(phrase, tag)
        inputs = prepare_sequence(str(phrase).split(), word_to_ix)
#         print(phrase, inputs)
        tag_scores = model(inputs)

        tag_scores = torch.sum(tag_scores, dim=0)/len(inputs)
        values, indices = torch.max(tag_scores, 0)
        
#         print(tag_scores)
        if indices.numpy() == tag:
            correct+=1

        #     print(values, indices.numpy(), training_data[0][1])
        total+=1
        
accuracy = (correct*100)/(total*1.0)
print("Accuracy :", accuracy)

Testing ....

Accuracy : 56.21814630573443
