In [427]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable
import tqdm
import os
import time
import re
import pandas as pd
import string
import gensim
import time
import random
import snowballstemmer
import collections
from collections import Counter
from nltk.corpus import stopwords
from itertools import chain
from sklearn.metrics import accuracy_score

In [446]:
data = pd.read_excel("Corpus_190905_simplify-final.xlsx")
data1 = pd.read_excel("Corpus_190905_simplify-final-1.xlsx")

In [447]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    # Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = snowballstemmer.stemmer('english')
    stemmed_words = [stemmer.stemWord(word) for word in text]
    text = " ".join(stemmed_words)
    return text

In [448]:
train_tokenized = []
for review in data["Description"].astype(str):
    train_tokenized.append(clean_text(review))

In [449]:
test_tokenized = []
for reviews in data1["Description"].astype(str):
    test_tokenized.append(clean_text(reviews))

In [450]:
test_tokenized1=[]
train_tokenized1=[]
for i in range (len(train_tokenized)):
    train_tokenized1.append(train_tokenized[i].split())
for i in range (len(test_tokenized)):
    test_tokenized1.append(test_tokenized[i].split())

In [478]:
vocab = set(chain(*train_tokenized1))
vocab_size = len(vocab)

In [463]:
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('gensim_WordVec.txt',
                                                          binary=False)

In [479]:
word_to_idx  = {word: i+1 for i, word in enumerate(vocab)}
word_to_idx['<unk>'] = 0
idx_to_word = {i+1: word for i, word in enumerate(vocab)}
idx_to_word[0] = '<unk>'

In [480]:
def encode_samples(tokenized_samples, vocab):
    features = []
    for sample in tokenized_samples:
        feature = []
        for token in sample:
            if token in word_to_idx:
                feature.append(word_to_idx[token])
            else:
                feature.append(0)
        features.append(feature)
    return features

def pad_samples(features, maxlen=60, PAD=0):
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_feature = feature[:maxlen]
        else:
            padded_feature = feature
            while(len(padded_feature) < maxlen):
                padded_feature.append(PAD)
        padded_features.append(padded_feature)
    return padded_features

In [481]:
train_features = torch.tensor(pad_samples(encode_samples(train_tokenized1, vocab)))
train_labels =torch.tensor([score for score in data["Pin Type -Symbol"]])
test_features = torch.tensor(pad_samples(encode_samples(test_tokenized1, vocab)))
test_labels =torch.tensor([score for score in data1["Pin Type -Symbol"]])

In [482]:
train_features

tensor([[2439, 2455, 3079,  ...,    0,    0,    0],
        [1589,  467, 2554,  ...,    0,    0,    0],
        [2455, 1708,  583,  ...,    0,    0,    0],
        ...,
        [1797, 1708, 2294,  ...,    0,    0,    0],
        [ 784, 1708, 2294,  ...,    0,    0,    0],
        [2039,  646,  567,  ...,    0,    0,    0]])

In [483]:
test_features

tensor([[3368, 3079,  567,  ...,    0,    0,    0],
        [2595,  982, 2589,  ...,    0,    0,    0],
        [2518, 1708, 2294,  ..., 1566, 3574, 2439],
        ...,
        [2890, 3079, 3553,  ...,    0,    0,    0],
        [1017,  646, 2554,  ...,    0,    0,    0],
        [1301, 3079, 1662,  ...,    0,    0,    0]])

In [492]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, use_gpu, **kwargs):
        super(SentimentNet, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.use_gpu = use_gpu
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.num_hiddens,
                               num_layers=num_layers, bidirectional=self.bidirectional,
                               dropout=0.4)
        if self.bidirectional:
            self.decoder = nn.Linear(num_hiddens * 4, labels)
        else:
            self.decoder = nn.Linear(num_hiddens * 2, labels)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        states, hidden = self.encoder(embeddings.permute([1, 0, 2]))
        encoding = torch.cat([states[0], states[-1]], dim=1)
        outputs = self.decoder(encoding)
        return outputs

In [493]:
weight = torch.zeros(vocab_size+1, embed_size)

for i in range(len(wvmodel.index2word)):
    try:
        index = word_to_idx[wvmodel.index2word[i]]
    except:
        continue
    weight[index, :] = torch.from_numpy(wvmodel.get_vector(
        idx_to_word[word_to_idx[wvmodel.index2word[i]]]))

In [494]:
num_epochs = 10
embed_size = 100
num_hiddens = 100
num_layers = 2
bidirectional = True
batch_size = 64
labels = 7
lr = 0.8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_gpu = True
net = SentimentNet(vocab_size=(vocab_size+1), embed_size=embed_size,
                   num_hiddens=num_hiddens, num_layers=num_layers,
                   bidirectional=bidirectional, weight=weight,
                   labels=labels, use_gpu=use_gpu)
net.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

In [495]:
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
test_set = torch.utils.data.TensorDataset(test_features, test_labels)

train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                         shuffle=True)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                        shuffle=False)

In [496]:
for epoch in range(num_epochs):
    start = time.time()
    train_loss, test_losses = 0, 0
    train_acc, test_acc = 0, 0
    n, m = 0, 0
    c = []
    b = []
    for feature, label in train_iter:
        n += 1
        net.zero_grad()
        feature = Variable(feature.cuda())
        label = Variable(label.cuda())
        score = net(feature)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()
        train_acc += accuracy_score(torch.argmax(score.cpu().data,
                                                 dim=1), label.cpu())
        train_loss += loss
    with torch.no_grad():
        for test_feature, test_label in test_iter:
            m += 1
            test_feature = test_feature.cuda()
            test_label = test_label.cuda()
            test_score = net(test_feature)
            test_loss = loss_function(test_score, test_label)
            test_acc += accuracy_score(torch.argmax(test_score.cpu().data,
                                                    dim=1), test_label.cpu())
            c.append(test_label.cpu())
            b.append(torch.argmax(test_score.cpu().data,
                                                    dim=1))
            test_losses += test_loss
        
    end = time.time()
    runtime = end - start
    print('epoch: %d, train loss: %.4f, train acc: %.2f, test loss: %.4f, test acc: %.2f, time: %.2f' %
          (epoch, train_loss.data / n, train_acc / n, test_losses.data / m, test_acc / m, runtime))

epoch: 0, train loss: 1.4363, train acc: 0.46, test loss: 0.9746, test acc: 0.65, time: 2.44
epoch: 1, train loss: 0.9520, train acc: 0.68, test loss: 0.8006, test acc: 0.74, time: 2.40
epoch: 2, train loss: 0.7395, train acc: 0.76, test loss: 0.8273, test acc: 0.71, time: 2.39
epoch: 3, train loss: 0.6309, train acc: 0.79, test loss: 0.7611, test acc: 0.75, time: 2.40
epoch: 4, train loss: 0.5558, train acc: 0.82, test loss: 0.9065, test acc: 0.71, time: 2.39
epoch: 5, train loss: 0.5081, train acc: 0.84, test loss: 0.8586, test acc: 0.74, time: 2.39
epoch: 6, train loss: 0.4660, train acc: 0.85, test loss: 0.7918, test acc: 0.77, time: 2.40
epoch: 7, train loss: 0.4271, train acc: 0.86, test loss: 0.8277, test acc: 0.75, time: 2.38
epoch: 8, train loss: 0.3928, train acc: 0.88, test loss: 0.9555, test acc: 0.74, time: 2.40
epoch: 9, train loss: 0.3702, train acc: 0.88, test loss: 0.8987, test acc: 0.76, time: 2.40


In [489]:
a = []
ab=''
a.append(c)
new_crazy = filter(str.isdigit, str(a))
asd=','
ab = asd.join((list(new_crazy)))
First = ab.split(',')

In [490]:
d = []
e =''
d.append(b)
new_crazy1 = filter(str.isdigit, str(d))
asdf=','
bf = asdf.join((list(new_crazy1)))
bf
MOdelTrain = bf.split(',')

In [491]:
from sklearn.metrics import confusion_matrix
confusion_matrix(First, MOdelTrain,labels=["0", "1","2","3","4", "5","6"])

array([[ 305,   30,   65,    5,   43,   19,    0],
       [  21, 1099,   62,    4,  142,   26,    1],
       [  25,  104,  967,    0,   91,   47,    1],
       [  11,    6,   40,  172,   23,    2,    1],
       [  48,  113,   78,   12, 1275,   10,    3],
       [  10,   29,   29,    0,   26,   24,    3],
       [   0,    5,    0,    0,    4,    2,   18]], dtype=int64)

In [364]:
c = 1617+2849+2668+1545+1058+537+642
d = 18203
acc = c/d
acc

0.599681371202549