### Imports

In [1]:
import pickle as pkl
import json
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.utils.data
from torch import nn
from torch.autograd import Variable
from tqdm import tqdm
import re
import numpy as np

### Glove dictionary

In [2]:
def get_glove_dictionary(path):
    with open(path,encoding="utf-8") as f:
        glove = f.readlines()
    glove_dictionary = {}
    for idx in tqdm(range(len(glove))):
        glove_vec = []
        for num in glove[idx].split()[1:len(glove[idx])]:
            glove_vec.append(float(num))
        glove_dictionary[glove[idx].split()[0]] = np.array([glove_vec])
    return glove_dictionary

In [3]:
glove_dictionary = get_glove_dictionary("../2_data/wv_50d.txt")

100%|██████████| 400000/400000 [00:07<00:00, 51397.77it/s]


### Loading review data

In [4]:
def load_review(path):
    with open(path,encoding="utf-8") as f:
        content_whole = f.readlines()
    content_list = []
    for idx in range(len(content_whole)):
        content_each = []
        contents_each = []
        for word in re.sub("[-,)(^.!:?;*$%&/]"," ", content_whole[idx]).split():
            content_each.append(word.lower())
        contents_each.append(content_each[-1])
        content_each.remove(content_each[-1])
        contents_each.append(content_each)
        content_list.append(contents_each)
    return content_list

In [5]:
content_list = load_review("../2_data/senti_binary_train.txt")

In [41]:
content_list[0][]

['0', ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units']]

### Vectorizing the review data

In [6]:
def vectorize_review(content_list,dictionary):
    set_data = []
    for idx in tqdm(range(len(content_list))):
        if content_list[idx][1][0] not in dictionary.keys():
            continue
        glove_value = dictionary[content_list[idx][1][0]]
        for idx_1 in range(1,len(content_list[idx][1])):
            if content_list[idx][1][idx_1] not in dictionary.keys():
                continue
            glove_value = np.append(glove_value,dictionary[content_list[idx][1][idx_1]],axis=0)
        sets = []
        x = glove_value.mean(axis=0)
        x = torch.tensor(x,dtype=torch.float32)
        x = x.type(torch.DoubleTensor)
        sets.append(x)
        y = np.array(content_list[idx][0],dtype='int64')
        y = torch.tensor(y,dtype=torch.int64)
        sets.append(y)
        set_data.append(tuple(sets))
    return set_data

In [7]:
set_data = vectorize_review(content_list,glove_dictionary)

100%|██████████| 67349/67349 [00:03<00:00, 17371.61it/s]


### Model

In [8]:
class Model(nn.Module):
    
    def __init__(self, input_dim, hidden1_dim, hidden2_dim, output_dim):
        
        super(Model, self).__init__()
        
        self.hl1 = nn.Linear(input_dim, hidden1_dim)
        self.hl1a = nn.ReLU()
        self.layer1 = [self.hl1, self.hl1a]
        
        self.hl2 = nn.Linear(hidden1_dim, hidden2_dim)
        self.hl2a = nn.ReLU()
        self.layer2 = [self.hl2, self.hl2a]
        
        self.ol = nn.Linear(hidden2_dim, output_dim)
        self.ola = (lambda x: x)
        self.layer3 = [self.ol, self.ola]
        
        self.layers = [self.layer1, self.layer2, self.layer3]
        
    def forward(self, x):
        
        out = x
        
        for pa, a in self.layers:
            
            out = a(pa(out))
        
        return out

In [9]:
model = Model(50, 20, 20, 2)
model.double()

Model(
  (hl1): Linear(in_features=50, out_features=20, bias=True)
  (hl1a): ReLU()
  (hl2): Linear(in_features=20, out_features=20, bias=True)
  (hl2a): ReLU()
  (ol): Linear(in_features=20, out_features=2, bias=True)
)

### Trainer

In [10]:
class Trainer():
    
    def __init__(self, model, data):
        
        self.model = model
        self.data = data
        
        self.train_loader = torch.utils.data.DataLoader(dataset=self.data, batch_size=8, shuffle=True)
        
    def train(self, lr, ne):
        
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        self.model.train()
        
        self.costs = []
        
        for e in range(ne):
            
            print('training epoch %d / %d ...' %(e+1, ne))
            
            train_cost = 0
        
            for batch_idx, (inputs, targets) in enumerate(self.train_loader):

                inputs = Variable(inputs)
                targets = Variable(targets)
                
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, targets)
                train_cost += loss
                loss.backward()
                optimizer.step()
                
            self.costs.append(train_cost/len(self.data))
            print('cost: %f' %(self.costs[-1]))

In [11]:
trainer = Trainer(model, set_data)

In [12]:
trainer.train(0.005, 100)

training epoch 1 / 100 ...
cost: 0.061580
training epoch 2 / 100 ...
cost: 0.058905
training epoch 3 / 100 ...
cost: 0.057841
training epoch 4 / 100 ...
cost: 0.057267
training epoch 5 / 100 ...
cost: 0.056932
training epoch 6 / 100 ...
cost: 0.056475
training epoch 7 / 100 ...
cost: 0.056203
training epoch 8 / 100 ...
cost: 0.055992
training epoch 9 / 100 ...
cost: 0.055746
training epoch 10 / 100 ...
cost: 0.055554
training epoch 11 / 100 ...
cost: 0.055269
training epoch 12 / 100 ...
cost: 0.055348
training epoch 13 / 100 ...
cost: 0.055171
training epoch 14 / 100 ...
cost: 0.054991
training epoch 15 / 100 ...
cost: 0.054914
training epoch 16 / 100 ...
cost: 0.054835
training epoch 17 / 100 ...
cost: 0.054763
training epoch 18 / 100 ...
cost: 0.054679
training epoch 19 / 100 ...
cost: 0.054632
training epoch 20 / 100 ...
cost: 0.054533
training epoch 21 / 100 ...
cost: 0.054416
training epoch 22 / 100 ...
cost: 0.054422
training epoch 23 / 100 ...
cost: 0.054475
training epoch 24 / 

### testing (for the other data)

In [29]:
def test_data(path):
    content_list2 = load_review(path)
    set_data2 = vectorize_review(content_list2,glove_dictionary) 
    x_test = []
    y_test = []
    for idx in range(len(set_data2)):
        x_test.append(set_data2[idx][0])
        y_test.append(set_data2[idx][1])
    pred = []
    for idx in range(len(x_test)):
        _, y_pred = torch.max(model(x_test[idx]),0)
        pred.append(y_pred)
    accuracy = 1-(abs(np.array(pred)-np.array(y_test)).sum()/len(y_test))
    return accuracy

In [40]:
test_data("../2_data/senti_binary_test.txt")

100%|██████████| 1821/1821 [00:00<00:00, 11140.32it/s]


0.717439293598234

### testing (for interactive purposes)

In [None]:
"I hate this movie"

In [71]:
def vectorize_a_sentence(sentence,dictionary):
    sentence=sentence.lower().split()
    glove_value = dictionary[sentence[0]]
    for idx in range(1,len(sentence)):
        glove_value = np.append(glove_value,dictionary[sentence[idx]],axis=0)
    x = glove_value.mean(axis=0)
    x = torch.tensor(x,dtype=torch.float32)
    x = x.type(torch.DoubleTensor)
    return x

In [89]:
vectorized=vectorize_a_sentence('I thought the story was not so good',glove_dictionary)
vectorized

tensor([ 0.2914,  0.1153, -0.1855, -0.3239,  0.6791,  0.0631, -0.5332,  0.0085,
        -0.4375,  0.0213, -0.0482,  0.4214, -0.6428, -0.1142,  0.7044,  0.2567,
         0.1647,  0.0497, -0.3210, -0.3732, -0.2914,  0.3986,  0.3583,  0.0261,
         0.6578, -1.9735, -0.8563,  0.2407,  0.4923, -0.4313,  3.4127, -0.0459,
        -0.2465, -0.3642,  0.0103, -0.1358,  0.2113,  0.3361,  0.0683, -0.2789,
        -0.1749,  0.0461, -0.1287,  0.2632, -0.1160,  0.0905, -0.1788, -0.2021,
         0.0378,  0.2649], dtype=torch.float64)

In [90]:
_, y_pred = torch.max(model(vectorized),0)
y_pred

tensor(0)