In [20]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import string
import re

from collections import Counter
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
class Vocabulary(object):
    def __init__(self, addPad=False, addUnk=False):
        
        self._token_to_idx = {}
        self._idx_to_token = {}
        
        self._pad_token = "<PAD>"
        self._unk_token = "<UNK>"

        if addPad == True:
            self.add_token(self._pad_token)

        if addUnk == True:
            self.add_token(self._unk_token)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        if token not in self._token_to_idx:
            return self._token_to_idx[self._unk_token]

        return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __len__(self):
        return len(self._token_to_idx)
    
class ReviewVectorizer(object):
    def __init__(self, review_vocab, rating_vocab):
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review, max_len):
        vectorized_review = np.zeros(max_len, dtype=np.int32)

        index = 0
        for token in review.split(" "):
            if token not in string.punctuation:
                vectorized_review[index] = self.review_vocab.lookup_token(token)
                index+=1
                if index >= max_len:
                    break

        return vectorized_review

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        review_vocab = Vocabulary(addPad = True, addUnk = True)
        rating_vocab = Vocabulary(addPad = False, addUnk = False)

        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
               
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

class ReviewDataset(Dataset):
    def __init__(self, review_df, vectorizer):
        self.review_df = review_df 
        self._vectorizer = vectorizer

        self._max_seq_length = 500#max(map(len, self.review_df.review))

        self.train_df = self.review_df[self.review_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 
                             'val': (self.val_df, self.validation_size), 
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        review = self._vectorizer.vectorize(row.review, self._max_seq_length)
        rating = self._vectorizer.rating_vocab._token_to_idx[row.rating]

        return {'x_data':review,
                'y_target':rating}

In [5]:
class ReviewClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, rnn_hidden_dim, num_classes, batch_first=True):
        super(ReviewClassifier, self).__init__()

        self.emb = nn.Embedding(vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        
        self.rnn = nn.GRU(embedding_dim, rnn_hidden_dim, batch_first=batch_first)
        
        self.fc = nn.Linear(rnn_hidden_dim, 1)
        
    def forward(self, x_in, apply_sofmax = False):
        x_embedded = self.emb(x_in)
        
        y_out, _ = self.rnn(x_embedded)
        
        batch_size, seq_len, feature_size = y_out.shape
        
        y_out = y_out.contiguous().view(batch_size*seq_len, feature_size)
        y_out = self.fc(F.dropout(y_out, p=0.5))
        
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_len, new_feat_size)
        y_out = y_out.contiguous().view(batch_size, -1)
        y_out = y_out[:, -1]
        
        return y_out

In [8]:
dataset = ReviewDataset.load_dataset_and_make_vectorizer("/content/drive/MyDrive/Colab Notebooks/reviews_with_splits_lite.csv")

batch_size = 128
vocab_size = dataset._vectorizer.review_vocab.__len__()
embedding_dim = 100
rnn_hidden_dim = 64
num_classes = dataset._vectorizer.rating_vocab.__len__()
learning_rate = 0.01
num_epochs = 100
print_every=10

model = ReviewClassifier(vocab_size=vocab_size,
                        embedding_dim =embedding_dim,
                        rnn_hidden_dim=rnn_hidden_dim,
                        num_classes=num_classes).to(device)

loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def get_batch_loader(dataset, batch_size):
    data_indices = np.arange(dataset._target_size)    
    np.random.shuffle(data_indices)
    
    data = []
    for index in data_indices:
        data.append(dataset[index])
        
    data_loader = DataLoader(data, batch_size=batch_size)
        
    return data_loader

train_loader = get_batch_loader(dataset=dataset, batch_size=batch_size)

dataset.set_split("val")
val_loader = get_batch_loader(dataset=dataset, batch_size=batch_size)

In [1]:
for epoch in tqdm(range(num_epochs), position = 0, leave=True):
    
    running_loss = 0.0
    model.train()
    
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        
        y_pred = model(batch['x_data'].to(device))
        
        loss = loss_func(y_pred, batch['y_target'].float().to(device))

        running_loss += (loss.item() - running_loss) / (batch_index + 1)

        loss.backward()

        optimizer.step()
        
    val_running_loss = 0.0
    model.eval()
        
    for batch_index, batch in enumerate(val_loader):
        y_pred = model(batch['x_data'].to(device))
        
        loss = loss_func(y_pred, batch["y_target"].float().to(device))
        loss_t = loss.item()

        val_running_loss += (loss_t - val_running_loss) / (batch_index + 1)
        
    if epoch == 0 or (epoch+1) % print_every == 0:
        print(f"Epoch: {epoch + 1} / {num_epochs}. Train Loss: {running_loss}.\
        Validation Loss: {val_running_loss}")

NameError: name 'tqdm' is not defined

In [12]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [16]:
dataset.set_split('test')
batch_generator = get_batch_loader(dataset, batch_size=batch_size)

running_loss = 0.
running_acc = 0.
model.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = model(batch_dict['x_data'].to(device))

    loss = loss_func(y_pred, batch_dict['y_target'].float().to(device))
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

print("Test loss: {:.3f}".format(running_loss))
print("Test Accuracy: {:.2f}".format(running_acc))

Test loss: 0.388
Test Accuracy: 84.73


In [27]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    review = preprocess_text(review)
    
    vectorized_review = torch.tensor(vectorizer.vectorize(review, dataset._max_seq_length))
    result = classifier(vectorized_review.view(1, -1))
    
    probability_value = torch.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0

    return vectorizer.rating_vocab.lookup_index(index)


test_review = "The service terrible and the food was ok"

model = model.cpu()
prediction = predict_rating(test_review, model, dataset.get_vectorizer(), decision_threshold=0.5)
print("{} -> {}".format(test_review, prediction))

The service terrible and the food was ok -> negative
