In [1]:
import torch
import torch.nn as nn
import torch.functional as func
import pandas as pd
import re
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [2]:
def create_weights(path="glove6b/glove.6B/glove.6B.300d.txt"):
    """Load pre-trained word vectors from a GloVe-style text file.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated float components.

    Args:
        path: Location of the vector file. Defaults to the file this
            notebook has always read.

    Returns:
        A ``(words_dict, vectors)`` tuple. ``words_dict`` maps each word to
        its position in ``vectors``; ``vectors`` is a list of lists of floats
        in file order.
    """
    words_dict = {}
    vectors = []
    # The file is opened explicitly as UTF-8 so the result does not depend on
    # the platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for row in file:
            parts = row.split()
            if not parts:
                # A blank (e.g. trailing) line carries no vector.
                continue
            word, *weights = parts
            words_dict[word] = len(vectors)
            vectors.append(list(map(float, weights)))
    return words_dict, vectors

In [18]:
# Width of one embedding vector; create_weights_matrix allocates rows of this size.
DIM_SIZE = 300

In [4]:
# Load the pre-trained vectors: word -> row lookup plus the list of vectors.
words_dict, vectors = create_weights()

In [6]:
# Both files are tab-separated. `data` supplies the `review` and `sentiment`
# columns used for training; `data_predict` supplies the reviews to score.
data = pd.read_csv("labeledTrainData.tsv", sep='\t')
data_predict = pd.read_csv("testData.tsv", sep='\t')

In [5]:
def get_words(text):
    """Split a raw review into lower-case alphabetic words.

    HTML tags and every character outside ``a-z``/``A-Z`` act as separators.

    Args:
        text: Raw review string, possibly containing HTML markup.

    Returns:
        List of lower-cased words in their original order.
    """
    # Tags become a space rather than an empty string; otherwise the words on
    # either side of e.g. "<br />" would be fused into a single token.
    text = re.sub("<[^>]*>", " ", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    return text.lower().split()

In [15]:
def get_all_words_unique(reviews):
    """Collect the vocabulary of a collection of raw reviews.

    Args:
        reviews: Iterable of raw review strings.

    Returns:
        Sorted list of the distinct words that ``get_words`` yields over
        all of ``reviews``.
    """
    all_words = set()
    # Iterate over the argument (not the global `data`), so every review the
    # caller passes in contributes to the vocabulary.
    for text in reviews:
        all_words.update(get_words(text))
    # Sorting gives each word the same position on every run; plain set
    # iteration order for strings can change between interpreter sessions.
    return sorted(all_words)

In [16]:
# Request the vocabulary over the training reviews and the reviews to predict together.
all_words = get_all_words_unique(pd.concat([data.review, data_predict.review]))

In [19]:
def create_weights_matrix(all_words, vectors, words_dict, dim_size=None):
    """Build the embedding matrix and word -> row lookup for a vocabulary.

    Row 0 is reserved for padding and left all-zero, which matches the ``0``
    that ``pad_sentences`` fills with and the ``padding_idx=0`` used by
    ``create_emb_layer``. Real words are numbered from 1.

    Args:
        all_words: Sequence of vocabulary words.
        vectors: List of pre-trained vectors.
        words_dict: Mapping from word to its position in ``vectors``.
        dim_size: Width of each embedding row. Defaults to the notebook-level
            ``DIM_SIZE``.

    Returns:
        A ``(weights_matrix, word_idx)`` tuple: a
        ``(len(all_words) + 1, dim_size)`` float array and a dict mapping
        each word to its row. Words missing from ``words_dict`` get a row
        drawn from a standard normal distribution.
    """
    if dim_size is None:
        dim_size = DIM_SIZE
    weights_matrix = np.zeros((len(all_words) + 1, dim_size))
    word_idx = {}
    for i, word in enumerate(all_words, start=1):
        if word in words_dict:
            weights_matrix[i] = np.array(vectors[words_dict[word]])
        else:
            weights_matrix[i] = np.random.normal(0, 1, dim_size)
        word_idx[word] = i
    return weights_matrix, word_idx

In [20]:
# Embedding matrix for the notebook vocabulary plus the word -> row lookup used for encoding.
weights_matrix, word_idx = create_weights_matrix(all_words, vectors, words_dict)

In [26]:
def conver_review_to_idx(text):
    """Encode a raw review as the list of ``word_idx`` entries of its words.

    Raises:
        KeyError: If a word produced by ``get_words`` is not in ``word_idx``.
    """
    indices = []
    for token in get_words(text):
        indices.append(word_idx[token])
    return indices

In [27]:
# Labels and index-encoded training reviews, kept as two parallel lists.
sentiments = list(data.sentiment)
reviews = [conver_review_to_idx(raw_text) for raw_text in data.review]

In [30]:
# `pad_value` is the common sequence LENGTH (not a filler value):
# the 75th percentile of the encoded review lengths, truncated to an int.
review_lengths = list(map(len, reviews))
pad_value = int(np.percentile(review_lengths, 75))
print(pad_value)

287


In [31]:
def pad_sentences(x, size, pad_idx=0):
    """Right-pad or truncate index sequences to a common length.

    Args:
        x: Sequence of index sequences.
        size: Target length of every row.
        pad_idx: Index written into the unused tail of short rows.

    Returns:
        Integer array of shape ``(len(x), size)``. Row ``i`` holds the first
        ``size`` entries of ``x[i]``, followed by ``pad_idx`` where the
        sequence is shorter than ``size``.
    """
    # The values are embedding indices, so the array is allocated with an
    # integer dtype instead of numpy's float64 default.
    new_x = np.full((len(x), size), pad_idx, dtype=np.int64)
    for i, seq in enumerate(x):
        kept = min(len(seq), size)
        if kept:
            new_x[i, :kept] = seq[:kept]
    return new_x

In [32]:
# Replace the variable-length index lists with one array whose rows all have `pad_value` entries.
reviews = pad_sentences(reviews, pad_value)

In [16]:
def create_emb_layer(emb_matrix):
    """Create a trainable embedding layer initialised from a weight matrix.

    Args:
        emb_matrix: 2-D array with one embedding vector per row.

    Returns:
        ``(layer, emb_size, dim_size)``: the ``torch.nn.Embedding`` (built
        with ``padding_idx=0``) holding a copy of ``emb_matrix``, followed by
        the matrix's row and column counts.
    """
    n_rows, n_cols = emb_matrix.shape
    layer = torch.nn.Embedding(num_embeddings=n_rows, embedding_dim=n_cols, padding_idx=0)
    pretrained = torch.Tensor(emb_matrix)
    # Overwrite the freshly initialised weights in place; this is not an
    # operation autograd should track.
    with torch.no_grad():
        layer.weight.copy_(pretrained)
    return layer, n_rows, n_cols

In [19]:
class ClassifierCNN(torch.nn.Module):
    """Convolutional text classifier producing one logit per review.

    Word embeddings feed three parallel convolutions spanning 3, 4 and 5
    consecutive words (100 filters each). Each feature map is passed through
    ReLU and max-pooled over the sequence; the pooled features are
    concatenated, passed through dropout and projected to a single logit.
    """

    def __init__(self, emb_matrix=None):
        """Build the layers.

        Args:
            emb_matrix: 2-D array of initial embedding weights. When omitted,
                the notebook-level ``weights_matrix`` is used.
        """
        super().__init__()
        if emb_matrix is None:
            # The notebook stores the pre-trained matrix as `weights_matrix`;
            # there is no global called `emb_matrix`.
            emb_matrix = weights_matrix
        self.emb_layer, emb_size, dim_size = create_emb_layer(emb_matrix)
        self.conv_3 = torch.nn.Conv2d(in_channels=1, out_channels=100, kernel_size=[3, dim_size])
        self.conv_4 = torch.nn.Conv2d(in_channels=1, out_channels=100, kernel_size=[4, dim_size])
        self.conv_5 = torch.nn.Conv2d(in_channels=1, out_channels=100, kernel_size=[5, dim_size])

        self.percp = torch.nn.Linear(3*100, 1)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, text):
        """Compute logits for a batch of index-encoded reviews.

        Args:
            text: 2-D array-like of word indices, one review per row. Rows
                must be at least 5 long so the widest convolution fits.

        Returns:
            Tensor with one row per review and a single column (raw logit;
            no sigmoid is applied here).
        """
        # Convert the batch directly; embedding a (batch, seq) index tensor
        # already yields (batch, seq, dim), so no transpose/permute is needed.
        tokens = torch.as_tensor(np.asarray(text), dtype=torch.long)
        # Add the single input channel Conv2d expects.
        embedded = self.emb_layer(tokens).unsqueeze(1)

        pooled = []
        for conv in (self.conv_3, self.conv_4, self.conv_5):
            # The kernel spans the full embedding width, so the last axis
            # collapses to size 1 and can be squeezed away.
            feature_map = torch.nn.functional.relu(conv(embedded).squeeze(3))
            # Global max over the remaining sequence axis.
            pooled.append(
                torch.nn.functional.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2)
            )

        cat = torch.cat(pooled, dim=1)
        return self.percp(self.dropout(cat))
        
        
        

In [16]:
# Hold out 20% of the encoded reviews for validation. The fixed random_state
# makes the split identical on every run of the notebook.
reviews_train, reviews_val, labels_train, labels_val = train_test_split(
    reviews, sentiments, test_size=.2, random_state=42
)

In [20]:
# The model returns raw logits, so the loss used is the one that takes logits
# (BCEWithLogitsLoss). Adam is created with its default hyper-parameters.
model = ClassifierCNN()
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

In [21]:
def train(model, optimizer, criterion, texts, labels, batch_size=50):
    """Run one pass over ``texts`` in consecutive mini-batches.

    Samples are visited in the order given; no shuffling is performed.

    Args:
        model: Module called as ``model(batch)`` and returning one column of
            logits per sample.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.
        texts: Sliceable collection of encoded samples.
        labels: Sliceable collection of numeric targets, parallel to ``texts``.
        batch_size: Number of samples per optimisation step.

    Returns:
        The batch losses averaged with each batch weighted by its size, as a
        float; ``0.0`` when ``texts`` is empty.
    """
    model.train()

    texts_amount = len(texts)
    weighted_loss = 0.0

    for start in range(0, texts_amount, batch_size):
        stop = start + batch_size  # slicing clips the final, shorter batch
        x = texts[start:stop]
        y = torch.tensor(labels[start:stop], dtype=torch.float32)
        optimizer.zero_grad()
        # Drop the trailing singleton axis so predictions line up with y.
        predictions = model(x).squeeze(1)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        weighted_loss += loss.item() * len(y)

    return weighted_loss / texts_amount if texts_amount else 0.0

In [22]:
def eval_m(model, texts, sentiments, batch_size=50):
    """Predict binary labels for ``texts`` and score them against ``sentiments``.

    Args:
        model: Module called as ``model(batch)`` and returning logits.
        texts: Sliceable collection of encoded samples.
        sentiments: Expected 0/1 labels, parallel to ``texts``.
        batch_size: Number of samples per forward pass.

    Returns:
        ``(predictions_all, accuracy)``: an integer array of 0/1 predictions
        (sigmoid output thresholded at 0.5) and the fraction that equal
        ``sentiments``. The accuracy is ``nan`` when ``texts`` is empty.

    Raises:
        ValueError: If ``texts`` and ``sentiments`` differ in length.
    """
    model.eval()
    texts_amount = len(texts)
    if len(sentiments) != texts_amount:
        raise ValueError("texts and sentiments must have the same length")

    chunks = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for start in range(0, texts_amount, batch_size):
            logits = model(texts[start:start + batch_size])
            chunks.append(torch.sigmoid(logits).detach().cpu().numpy().reshape(-1))

    probabilities = np.concatenate(chunks) if chunks else np.array([])
    predictions_all = (probabilities >= 0.5).astype(int)
    if texts_amount == 0:
        return predictions_all, float("nan")
    accuracy = np.mean(predictions_all == np.asarray(sentiments))
    return predictions_all, accuracy
    

In [23]:
def eval_m_witout_testing(model, texts, batch_size=50):
    """Predict binary labels for ``texts`` when no reference labels exist.

    Args:
        model: Module called as ``model(batch)`` and returning logits.
        texts: Sliceable collection of encoded samples.
        batch_size: Number of samples per forward pass.

    Returns:
        Integer array of 0/1 predictions: 1 where the sigmoid of the logit
        is at least 0.5, otherwise 0.
    """
    model.eval()
    texts_amount = len(texts)

    chunks = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for start in range(0, texts_amount, batch_size):
            logits = model(texts[start:start + batch_size])
            chunks.append(torch.sigmoid(logits).detach().cpu().numpy().reshape(-1))

    probabilities = np.concatenate(chunks) if chunks else np.array([])
    return (probabilities >= 0.5).astype(int)
    

In [24]:
# Three epochs; each train() call makes one pass over the data.
# NOTE(review): this trains on the full `reviews`/`sentiments`, not on
# `reviews_train`, so scoring on `reviews_val` afterwards would measure
# samples the model has already seen.
for epoch in range(1, 4):
    print(epoch)
    train(model, optimizer, criterion, reviews, sentiments)

1
2
3


In [None]:
1 - 0.840000

(array([1, 1, 0, ..., 0, 0, 1]), 0.98796)

In [381]:
# Scratch check: np.append returns a new array (see the output of the next cell, where `a` is unchanged).
np.append(a,b)

array([2, 3, 3, 4])

In [382]:
# Scratch check: display `a` after the np.append call above.
a

array([2, 3])

In [25]:
# Encode the reviews to predict the same way as the training reviews.
# - The frame loaded from testData.tsv is named `data_predict`.
# - Words without an entry in `word_idx` are skipped rather than raising
#   KeyError, so unseen vocabulary cannot abort the run.
# - Rows are padded to `pad_value`, the same length used for training.
reviews_to_predict = [
    [word_idx[word] for word in get_words(review) if word in word_idx]
    for review in data_predict.review
]
reviews_to_predict = pad_sentences(reviews_to_predict, pad_value)

In [26]:
# Score the unlabelled reviews, then convert to a list of plain Python 0/1 ints.
predictions = eval_m_witout_testing(model, reviews_to_predict)
pred_all = [int(score >= 0.5) for score in predictions]

In [27]:
# Build the submission from a copy of the frame loaded from testData.tsv
# (`data_predict`), so the loaded data is not mutated and the cell can be
# re-run safely.
submission = data_predict.assign(sentiment=pred_all)
submission[['id','sentiment']].to_csv("cnn6.csv",index=False, sep=',')

In [419]:
# Sanity check: number of encoded reviews that were scored.
len(reviews_to_predict)

25000