In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize
from tqdm import tqdm # Instantly make loops show a smart progress meter
import re
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
#%% Read the Rating data
ratings_df = pd.read_csv('./DATA/Ratings.csv').drop('Unnamed: 0', axis=1)
ratings_df.review = ratings_df.review.astype(str)

# divide reviews as negative and positive based on scores
ratings_df['senti'] = ratings_df['score'].apply(lambda x: 1 if int(x)>2.5 else 0)
# tokenize the data and vectorize the reviews
max_features = 1000 # The maximu number of words to keep
# create a tokenizer
tokenizer = Tokenizer(num_words = max_features, split=' ')
# fit the tokenizer on the review text
tokenizer.fit_on_texts(ratings_df['review'].values)
X = tokenizer.texts_to_sequences(ratings_df['review'].values)
X = pad_sequences(X)

In [54]:
# y = pd.get_dummies(ratings_df['senti'].astype(int)).values
y = ratings_df['senti'].astype(int).values

In [100]:
class net(nn.Module):
    def __init__(self, embed_dim, lstm_dim, input_length):
        super(net, self).__init__()
        self.word_embeddings = nn.Embedding(max_features, embed_dim)
        self.lstm = nn.LSTM(embed_dim * input_length, lstm_dim, dropout=0.2)
        self.dense = nn.Linear(lstm_dim, 2)
    
    def forward(self, X):
        embeds = self.word_embeddings(X)
        lstm_out, _ = self.lstm(embeds.view(len(X), 1, -1))
        out = F.softmax(self.dense(lstm_out.view(len(X), -1)), dim=1)
        return out

In [170]:
nsample = 10000 # X.shape[0]
index = np.random.choice(X.shape[0], nsample, replace=False)
X_train,X_test, y_train,y_test = train_test_split(X[index,:], y[index], random_state=0)
X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

In [174]:
embed_dim = 16
lstm_dim = 16

model = net(embed_dim, lstm_dim, X_train.shape[1])
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

batch_size = 64

In [165]:
for epoch in range(5):
    for idx in range(batch_size, len(X_train_torch)+1, batch_size):
        X_train_batch = X_train_torch[idx-batch_size:idx, :]
        y_train_batch = y_train_torch[idx-batch_size:idx]
        
        model.zero_grad()
        
        score = model(X_train_batch)
        
        loss = loss_function(score, y_train_batch)
        loss.backward()
        optimizer.step()

In [171]:
print(np.sum(y_test == 1) / float(len(y_test)) * 100)
print(np.sum(y_train == 1) / float(len(y_train)) * 100)

81.72
81.36


In [175]:
# test accuracy
y_predict = np.where(model(X_test_torch)[:,1] > 0.5, 1, 0)
np.sum(y_predict == y_test) / float(len(y_test)) *100

20.96

In [176]:
# train accuracy
n = 2500
index = np.random.choice(X_train.shape[0], n, replace=False)
y_train_predict = np.where(model(X_train_torch[index,:])[:,1] > 0.5, 1, 0)
np.sum(y_train_predict == y_train[index]) / float(len(y_train[index]))*100

21.52

In [156]:
# save the trained model
torch.save(model.state_dict(), './model/senti_torch_embed{}_lstm{}.pth'.format(embed_dim, lstm_dim))