# LSTM for Anime Reviews Classification

In [None]:
import pandas as pd

In [2]:
df = pd.read_json('anime_reviews.json')

In [3]:
df = df.transpose()

In [4]:
from sklearn.utils import shuffle
df = shuffle(df)

## Data pre-processing

### Tokenization

In [5]:
import spacy
import string
import re

tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

### Deletion of infrequent words

In [7]:
from collections import Counter

c = Counter()
for index, row in df.iterrows():
    c.update(tokenize(row['text']))

print("num_words before:", len(c.keys()))
for word in list(c):
    if c[word] < 2:
        del c[word]
print("num_words after:", len(c.keys()))

num_words before: 85849
num_words after: 50123


### Creation of vocabulary for embeddings

In [8]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in c:
    vocab2index[word] = len(words)
    words.append(word)

### Creation of embeddings

In [9]:
def encode_sentence(text, vocab2index, N=6000):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [10]:
import numpy as np
df['encoded_text'] = df['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index)))

  


## LSTM

In [11]:
#library imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

### Dataset creation & Transfer to Cuda

In [12]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)).to("cuda:0"), self.y[idx], self.X[idx][1]

In [13]:
x = list(df['encoded_text'])
y = list(df['score'].astype(int))

for index, value in enumerate(x):
    if value[1] == 0:
        del x[index]
        del y[index]

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, shuffle=True)
x_valid, x_test, y_valid, y_test = train_test_split(x, y, test_size=0.5, shuffle=True)

In [14]:
for index, value in enumerate(x_train):
    if value[1] == 0:
        del x_train[index]
        del y_train[index]

for index, value in enumerate(x_valid):
    if value[1] == 0:
        del x_valid[index]
        del y_valid[index]

for index, value in enumerate(x_test):
    if value[1] == 0:
        del x_test[index]
        del y_test[index]

In [15]:
y_train = torch.tensor(y_train).to("cuda:0")
y_valid = torch.tensor(y_valid).to("cuda:0")
y_test = torch.tensor(y_test).to("cuda:0")

In [16]:
train_ds = ReviewsDataset(x_train, y_train)
valid_ds = ReviewsDataset(x_valid, y_valid)
test_ds = ReviewsDataset(x_test, y_test)

### Traning Loop

In [17]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 1 == 0:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        pred = pred.to('cpu')
        y = y.to('cpu')
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [18]:
batch_size = 128
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

### LSTM Model

In [19]:
class LSTM_variable_input(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 11)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        out_pack, (ht, ct) = self.lstm(x)
        out = self.linear(ht[-1])
        return out.to("cuda:0")

In [20]:
model = LSTM_variable_input(vocab_size, 700, 70)
model = model.to("cuda:0")
print(model)

LSTM_variable_input(
  (dropout): Dropout(p=0.3, inplace=False)
  (embeddings): Embedding(50125, 700, padding_idx=0)
  (lstm): LSTM(700, 70, batch_first=True)
  (linear): Linear(in_features=70, out_features=11, bias=True)
)


### Traning

In [21]:
train_model(model, epochs=128, lr=0.01)

train loss 2.141, val loss 1.998, val accuracy 0.226, and val rmse 2.362
train loss 1.988, val loss 1.815, val accuracy 0.355, and val rmse 2.230
train loss 1.857, val loss 1.693, val accuracy 0.409, and val rmse 2.036
train loss 1.718, val loss 1.594, val accuracy 0.443, and val rmse 1.979
train loss 1.610, val loss 1.527, val accuracy 0.474, and val rmse 1.982
train loss 1.517, val loss 1.480, val accuracy 0.497, and val rmse 1.923
train loss 1.441, val loss 1.448, val accuracy 0.512, and val rmse 1.903
train loss 1.376, val loss 1.413, val accuracy 0.530, and val rmse 1.878
train loss 1.318, val loss 1.385, val accuracy 0.544, and val rmse 1.833
train loss 1.265, val loss 1.363, val accuracy 0.563, and val rmse 1.789
train loss 1.232, val loss 1.344, val accuracy 0.576, and val rmse 1.803
train loss 1.200, val loss 1.333, val accuracy 0.583, and val rmse 1.787
train loss 1.152, val loss 1.321, val accuracy 0.593, and val rmse 1.773
train loss 1.120, val loss 1.307, val accuracy 0.59

### Evaluation

In [23]:
correct = 0
correct_by_one = 0
correct_by_two = 0
total = 0
test_dl = DataLoader(test_ds, batch_size=batch_size)
with torch.no_grad():
    for data in test_dl:
        texts, labels, l = data
        texts, labels, l = texts.cuda(), labels.cuda(), l.cpu()
        texts = texts.long()
        outputs = model(texts, l)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        correct_by_one += (predicted == (labels - 1)).sum().item()
        correct_by_one += (predicted == (labels + 1)).sum().item()
        correct_by_two += (predicted == (labels - 2)).sum().item()
        correct_by_two += (predicted == (labels + 2)).sum().item()

## Results

In [24]:
correct / total

0.7352424014066817

In [25]:
a = (correct + correct_by_one) / total
a

0.8399899522732982

In [26]:
a = (correct + correct_by_one + correct_by_two) / total
a

0.9075609143431299

## Sources

https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df