In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchtext.vocab import GloVe
import spacy
import io
import jsonlines
import json
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split


EMBED_DIM = 300
HIDDEN_LAYER_DIM = 10
NUM_EPOCHS = 15

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):    
    words = nlp(text)
    sentence  = "".join([token.text_with_ws for token in words]).strip()
    return sentence

def process_training_data():
    negative_reviews = io.open('./Train.neg',encoding='latin-1').readlines()
    positive_reviews = io.open('./Train.pos',encoding='latin-1').readlines()
    with jsonlines.open('train.jsonl',mode='w') as writer:

        for review in positive_reviews:
            processed_text = preprocess_text(review)
            d = {'text': processed_text , 'sentiment': 1}
            writer.write(d)
        for review in negative_reviews:
            processed_text = preprocess_text(review)
            d = {'text': processed_text , 'sentiment': 0}
            writer.write(d)
process_training_data()


In [34]:
from torch.utils.data import DataLoader,Dataset

class ReviewDataSet(Dataset):

    def __init__(self,file):
        super().__init__()
        
        self.file = file
        self.data = []
        with open(self.file) as f:
            for line in f:
                sample = json.loads(line)
                self.data.append([sample['text'],sample['sentiment']])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

    

dataset = ReviewDataSet('train.jsonl') 

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset,validation_dataset = random_split(dataset,[train_size,test_size])

train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
val_dataloader = DataLoader(validation_dataset,batch_size=32,shuffle=True)
glove = GloVe()


In [24]:
def getWordEmbeddings(batch_data,glove):
    
    reviews = batch_data[0]
    sentiment = batch_data[1]
    
    reviews_tensor = []
    lengths = []

    for review in reviews:
        words = review.split()
        words_tensor = []
        lengths.append(len(words))
        for word in words:
            words_tensor.append(glove.__getitem__(word))
        reviews_tensor.append(torch.stack(words_tensor,dim=0))
    
    mask = torch.zeros((len(lengths),max(lengths)))
    for i in range(len(lengths)):
        mask[i,:lengths[i]] = 1.0
    
    return (pad_sequence(reviews_tensor,batch_first=True),mask,sentiment)




In [26]:
'''from gensim import models

w = models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin',binary=True)
w.get_vector('rat')
'''

"from gensim import models\n\nw = models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin',binary=True)\nw.get_vector('rat')\n"

In [27]:
class DAN(nn.Module):

    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_LAYER_DIM):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.embed_dim,self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid  = nn.Sigmoid()

    def forward(self,inp,inp_mask):
        
        inp_lengths = torch.sum(inp_mask,-1,keepdim=True)
        total = torch.sum(inp*(inp_mask.unsqueeze(2)),axis=1)
        vector_average = total / inp_lengths
        ans = F.relu(self.fc1(vector_average))
        ans = self.sigmoid(self.fc2(ans))
        return ans
    


In [61]:
'''inp = torch.randn((2,3,5))
mask = torch.tensor([[1.0,0.0,0.0],[1.0,0.0,1.0]])

print(inp)'''

tensor([[[-0.7277, -0.3738,  0.6601, -0.8360,  0.4285],
         [-0.3967, -0.9428,  1.2575,  0.3563, -2.6444],
         [-2.6430,  0.6987, -0.0439,  0.9138,  0.2052]],

        [[-0.2515, -0.5871,  0.6125, -0.2867, -0.4073],
         [ 1.2012,  1.6067,  0.8382,  0.4374, -0.9485],
         [ 0.6081, -0.4818,  1.3801, -0.3231, -2.0356]]])


In [28]:
# Training loop
dan = DAN(EMBED_DIM,HIDDEN_LAYER_DIM)
criterion = nn.BCELoss()
optimizer = optim.Adam(dan.parameters(),lr=0.01)

for e in range(NUM_EPOCHS):
    training_loss = 0.0
    size = 0
    dan.train()

    for i,data in enumerate(train_dataloader,0):
        
        optimizer.zero_grad()
        
        input_reviews , input_mask , output_labels = getWordEmbeddings(data,g)
        output = dan(input_reviews,input_mask).squeeze()

        loss = criterion(output,output_labels.float())
        training_loss += loss.item()
        loss.backward()
        optimizer.step()
        size = max(size,i+1)

    dan.eval()
    validation_loss = 0.0

    val_size = 0
    for i,data in enumerate(val_dataloader,0):
        input_reviews,input_mask,output_labels = getWordEmbeddings(data,g)
        ouput = dan(input_reviews,input_mask).squeeze()
        loss = criterion(ouput,output_labels.float())
        validation_loss += loss.item()
        val_size = max(size,i+1)

    print(str(training_loss/size )+ "   " + str(validation_loss/val_size))


0.527477594766211   0.3108069509901899
0.4702962192290641   0.3055168257114735
0.4550522862279669   0.31075330902921394
0.457102488805639   0.3142033854222044
0.4444233176714562   0.30292312658213555
0.4376748577552907   0.32122663273773294
0.43630648816519596   0.3029258468366684
0.4278222356546432   0.3055676274160121
0.4250665230161332   0.3083383796062875
0.4229727272499115   0.31551304911362366
0.41623802538564864   0.32657083282445337
0.4125348765482294   0.30959973072117947
0.40747506226947966   0.31230205614516077
0.40188099293315666   0.3155527620556507
0.3991619037028323   0.32496913054839094
