In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchtext.vocab import GloVe,FastText
import spacy
import io
import jsonlines
import json
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split
import fasttext

cpu = torch.device('cpu')

if torch.has_mps:
    device = torch.device('mps')
else:
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')


EMBED_DIM = 300
HIDDEN_LAYER_DIM = 100
NUM_EPOCHS = 40

In [3]:
model = fasttext.load_model('./crawl-300d-2M-subword/crawl-300d-2M-subword.bin')



In [61]:
model.get_word_vector('asfkskfjkeglaekjghgidfg').shape

(300,)

In [19]:
f = FastText()

In [49]:
f.__getitem__('it\'s')

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

tensor([ 0.3640, -0.4614, -0.2887,  0.2210,  0.3395,  0.2251, -0.2298,  0.2102,
        -0.1499, -1.2309,  0.1906, -0.6621,  0.2563,  0.1245, -0.0430, -0.4992,
        -0.1759, -1.3224,  0.1590, -0.0765, -0.3088,  0.0463, -0.1286,  0.1315,
        -0.1142, -0.4390,  0.5309, -0.2725,  0.1481, -0.3749,  0.2236, -0.4525,
         0.8860, -0.0410, -0.4543, -0.0826,  0.4276, -0.3297,  0.2673,  0.3213,
        -0.3717,  0.6772, -0.4830,  0.0100,  0.3571, -0.0447,  0.3538, -0.9765,
        -0.1465,  0.0931, -0.4583,  0.0137, -0.0181,  0.0088,  0.7065, -0.4261,
         0.1341, -0.5798,  0.2476, -0.1317, -0.1288, -0.6861, -0.5583, -0.0515,
        -0.0277,  0.1584,  0.1007,  0.1715, -0.4610, -0.3673, -0.0540,  0.0867,
        -0.2057, -0.0238,  0.0137,  0.1134,  0.1019,  0.1092, -0.6096,  0.0876,
         0.1344, -0.0335, -0.6103,  0.5123,  0.6763,  0.3822,  0.7131, -1.2000,
         0.1577,  0.2205, -1.1074, -0.0709, -0.3897,  0.6892,  0.1607,  0.8657,
        -0.1863, -0.3489,  0.1519,  0.20

In [4]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):    
    words = nlp(text)
    #sentence = [token.text_with_ws for token in words]
    sentence  = "".join([token.text_with_ws for token in words]).strip()
    sentence = sentence.lower()
    return sentence

def process_training_data():
    negative_reviews = io.open('./Train.neg',encoding='latin-1').readlines()
    positive_reviews = io.open('./Train.pos',encoding='latin-1').readlines()
    with jsonlines.open('train.jsonl',mode='w') as writer:

        for review in positive_reviews:
            processed_text = preprocess_text(review)
            d = {'text': processed_text , 'sentiment': 1}
            writer.write(d)
        for review in negative_reviews:
            processed_text = preprocess_text(review)
            d = {'text': processed_text , 'sentiment': 0}
            writer.write(d)
process_training_data()


In [5]:
from torch.utils.data import DataLoader,Dataset

class ReviewDataSet(Dataset):

    def __init__(self,file):
        super().__init__()
        
        self.file = file
        self.data = []
        with open(self.file) as f:
            for line in f:
                sample = json.loads(line)
                self.data.append([sample['text'],sample['sentiment']])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

    

dataset = ReviewDataSet('train.jsonl') 

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
print(train_size)
print(test_size)

train_dataset,validation_dataset = random_split(dataset,[train_size,test_size])

train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
val_dataloader = DataLoader(validation_dataset,batch_size=32,shuffle=True)
glove = GloVe()


8000
2000


In [9]:
def getWordEmbeddings(batch_data,glove):
    
    
    if(len(batch_data)==2):
        reviews = batch_data[0]
        sentiment = batch_data[1]
    else:
        reviews = batch_data[0]
        sentiment = None
    
    reviews_tensor = []
    lengths = []
    for review in reviews:
        words = review.split()
        words_tensor = []
        lengths.append(len(words))
        for word in words:
            #words_tensor.append(glove.__getitem__(word))
            words_tensor.append(torch.tensor(glove.get_word_vector(word)))
        reviews_tensor.append(torch.stack(words_tensor,dim=0))
    
    mask = torch.zeros((len(lengths),max(lengths)))
    for i in range(len(lengths)):
        mask[i,:lengths[i]] = 1.0
    
    return (pad_sequence(reviews_tensor,batch_first=True),mask,sentiment)




In [19]:
'''from gensim import models

w = models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin',binary=True)
w.get_vector('rat')
'''

"from gensim import models\n\nw = models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin',binary=True)\nw.get_vector('rat')\n"

In [7]:
class DAN(nn.Module):

    def __init__(self,embed_dim=EMBED_DIM,hidden_dim = HIDDEN_LAYER_DIM):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.embed_dim,self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim,1)
        self.sigmoid  = nn.Sigmoid()

    def forward(self,inp,inp_mask):
        
        inp_lengths = torch.sum(inp_mask,-1,keepdim=True)
        total = torch.sum(inp*(inp_mask.unsqueeze(2)),axis=1)
        vector_average = total / inp_lengths
        ans = F.relu(self.fc1(vector_average))
        ans = self.sigmoid(self.fc2(ans))
        return ans
    


In [21]:
'''inp = torch.randn((2,3,5))
mask = torch.tensor([[1.0,0.0,0.0],[1.0,0.0,1.0]])

print(inp)'''

'inp = torch.randn((2,3,5))\nmask = torch.tensor([[1.0,0.0,0.0],[1.0,0.0,1.0]])\n\nprint(inp)'

In [10]:
# Training loop
dan = DAN(EMBED_DIM,HIDDEN_LAYER_DIM)

criterion = nn.BCELoss()
optimizer = optim.Adam(dan.parameters(),lr=0.01)

for e in range(NUM_EPOCHS):
    training_loss = 0.0
    size = 0
    dan.train()

    for i,data in enumerate(train_dataloader,0):
        
        optimizer.zero_grad()
        
        input_reviews , input_mask , output_labels = getWordEmbeddings(data,model)

        output = dan(input_reviews,input_mask).squeeze()
        
     

        loss = criterion(output,output_labels.float())
       
        
        
        training_loss += loss.item()
        loss.backward()
        optimizer.step()
        size = max(size,i+1)

    dan.eval()
    validation_loss = 0

    val_size = 0
    for i,data in enumerate(val_dataloader,0):
        input_reviews,input_mask,output_labels = getWordEmbeddings(data,model)
        output = dan(input_reviews,input_mask).squeeze()
        nearest_class = torch.round(output)
        correct = (nearest_class == output_labels.float()).float()
        validation_loss += correct.sum()
  
    print(str(training_loss/size )+ "   " + str(validation_loss/len(validation_dataset)))


0.580641510605812   tensor(0.7480)
0.4952510052919388   tensor(0.7520)
0.4796864761114121   tensor(0.7635)
0.46994533884525297   tensor(0.7660)
0.46831831336021423   tensor(0.7780)
0.46359367173910143   tensor(0.7810)
0.46076147639751436   tensor(0.7820)
0.46139667761325837   tensor(0.7500)
0.4522773082256317   tensor(0.7795)
0.4508477953672409   tensor(0.7675)
0.4544805326461792   tensor(0.7520)
0.44381807452440264   tensor(0.7775)
0.44355688804388044   tensor(0.7730)
0.43802326601743696   tensor(0.7765)
0.43508167409896853   tensor(0.7800)
0.43086389738321307   tensor(0.7755)
0.4289894720315933   tensor(0.7760)
0.4263450213670731   tensor(0.7660)
0.42774930596351624   tensor(0.7765)
0.42039433801174164   tensor(0.7740)
0.4186685197353363   tensor(0.7790)
0.41415614742040635   tensor(0.7755)
0.41782544887065887   tensor(0.7605)
0.4121598373353481   tensor(0.7755)
0.4091121007800102   tensor(0.7670)
0.40661845916509626   tensor(0.7640)
0.4075793016552925   tensor(0.7785)
0.395162081003

In [43]:
def test(filename):
    reviews = open(filename,'r').readlines()
    for i in range(len(reviews)):
        r = reviews[i]
        reviews[i] = preprocess_text(r)
    
    reviews,reviews_mask,labels = getWordEmbeddings([reviews],glove)
    dan.eval()

    output = dan(reviews,reviews_mask)
    print(output)


test('test.txt')


tensor([[0.0000],
        [1.0000],
        [1.0000]], grad_fn=<SigmoidBackward0>)
