# This notebook was prepared by Taras Semenchenko

In [1]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
df_train = pd.read_csv('../input/yelp-reviews-dataset/train.csv', header=None)
df_test = pd.read_csv('../input/yelp-reviews-dataset/test.csv', header=None)
df_train.columns = ['rating', 'review']
df_test.columns = ['rating', 'review']
df_train.head()

Unnamed: 0,rating,review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...


In [3]:
df_train['review'][4]

"I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really nee

In [4]:
import re

def clear_data(sent):
    clear_words = []
    for word in sent.lower().split():
        cleaned = re.findall('\w+', word)
        if len(cleaned) > 0:
            clear_words += [cleaned[0]]
    return clear_words[:100]

training_data = df_train.copy()
training_data['review'] = training_data['review'].apply(clear_data)


word2idx = {}
frequency = {}
for label, sent in training_data.values.tolist():
    for word in sent:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
            frequency[word] = 1
        else:
            frequency[word] += 1

label2idx = {1: 0, 
             2: 1, 
             3: 2,
             4: 3, 
             5: 4
}

In [5]:
sorted_frequency = sorted(list(frequency.items()), key=lambda x: x[1], reverse=True)
sorted_frequency[:5]

[('the', 2497740),
 ('and', 1611212),
 ('i', 1495814),
 ('a', 1318494),
 ('to', 1187485)]

In [6]:
dictionary = sorted_frequency[:50000]
used_words = {word for word, count in dictionary}

In [7]:
training_data.head()

Unnamed: 0,rating,review
0,5,"[dr, goldberg, offers, everything, i, look, fo..."
1,2,"[unfortunately, the, frustration, of, being, d..."
2,4,"[been, going, to, dr, goldberg, for, over, 10,..."
3,4,"[got, a, letter, in, the, mail, last, week, th..."
4,1,"[i, don, know, what, dr, goldberg, was, like, ..."


In [8]:
import numpy as np

# a helper function for converting a sequence of words to a Tensor of numerical values
# will be used later in training
def prepare_sequence(seq, to_idx):
    '''This function takes in a sequence of words and returns a 
    corresponding Tensor of numerical values (indices for each word).'''
    idxs = [to_idx[w] if w in used_words else 99999 for w in seq]
    idxs = np.array(idxs)
    return torch.from_numpy(idxs)

In [9]:
# check out what prepare_sequence does for one of our training sentences:
example_input = prepare_sequence(clear_data("The dog answers the phone"), word2idx)
print(example_input)

tensor([  70, 2310,   90,   70,   91])


In [10]:
df_train['cleaned'] = df_train['review'].apply(lambda x: prepare_sequence(clear_data(x), word2idx)[:100])
df_test['cleaned'] = df_test['review'].apply(lambda x: prepare_sequence(clear_data(x), word2idx)[:100])
df_train['cleaned']

0         [tensor(0), tensor(1), tensor(2), tensor(3), t...
1         [tensor(69), tensor(70), tensor(71), tensor(62...
2         [tensor(123), tensor(124), tensor(15), tensor(...
3         [tensor(164), tensor(8), tensor(165), tensor(7...
4         [tensor(4), tensor(110), tensor(190), tensor(5...
                                ...                        
649995    [tensor(4), tensor(75), tensor(8), tensor(2666...
649996    [tensor(91), tensor(4048), tensor(20), tensor(...
649997    [tensor(419), tensor(183), tensor(136), tensor...
649998    [tensor(4), tensor(128), tensor(254), tensor(1...
649999    [tensor(4), tensor(34), tensor(123), tensor(18...
Name: cleaned, Length: 650000, dtype: object

In [11]:
padded_sequence_train = torch.nn.utils.rnn.pad_sequence(df_train['cleaned'], batch_first=True)
padded_sequence_test = torch.nn.utils.rnn.pad_sequence(df_test['cleaned'], batch_first=True)
padded_sequence_train

tensor([[   0,    1,    2,  ...,    0,    0,    0],
        [  69,   70,   71,  ...,   70,  122,   85],
        [ 123,  124,   15,  ...,    0,    0,    0],
        ...,
        [ 419,  183,  136,  ...,    0,    0,    0],
        [   4,  128,  254,  ...,    0,    0,    0],
        [   4,   34,  123,  ...,   34,   15, 1064]])

In [12]:
import torch.nn.functional as F

class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        ''' Initialize the layers of this model.'''
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim

        # embedding layer that turns words into a vector of a specified size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # the LSTM takes embedded word vectors (of a specified size) as inputs 
        # and outputs hidden states of size hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        self.c1 = nn.Conv1d(100, 100, 5, padding=2)
        # self.p1 = nn.MaxPool1d(2)

        # the linear layer that maps the hidden state output dimension 
        # to the number of tags we want as output, tagset_size (in this case this is 3 tags)
        
        # self.dence1 = nn.Linear(hidden_dim * 100 * 2, tagset_size)
        self.dence1 = nn.Linear(hidden_dim * 100 * 2, 32)
        self.dence2 = nn.Linear(32, tagset_size)

        # self.hidden2tag = nn.Linear(hidden_dim * 100 * 2, tagset_size)
        
        # initialize the hidden state (see code below)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):
        ''' At the start of training, we need to initialize a hidden state;
           there will be none because the hidden state is formed based on perviously seen data.
           So, this function defines a hidden state with all zeroes and of a specified size.'''
        # The axes dimensions are (n_layers, batch_size, hidden_dim)
        return (torch.zeros(2, 100, self.hidden_dim).to(device),
                torch.zeros(2, 100, self.hidden_dim).to(device))

    def forward(self, x):
        ''' Define the feedforward behavior of the model.'''
        # create embedded word vectors for each word in a sentence
        x = self.word_embeddings(x)
        # get the output and hidden state by passing the lstm over our word embeddings
        # the lstm takes in our embeddings and hiddent state
        # lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        x, self.hidden = self.lstm(x, self.hidden)
        # lstm_out, self.hidden = self.lstm2(lstm_out, self.hidden)
        
        #print(lstm_out.shape)

        # print(lstm_out.shape)
        
        
        x = F.relu(self.c1(x))
        # print(conv_out.shape)
        # conv_out = self.p1(conv_out)

        # print(conv_out.shape)

        # get the scores for the most likely tag for a word
        x = self.dence1(x.view(len(x), -1))
        x = self.dence2(x)
        logits = F.log_softmax(x, dim=1)
        
        return logits

In [14]:
import torch.utils.data as data_utils

batch_size = 32

labels_train = torch.from_numpy(df_train['rating'].to_numpy() - 1)
labels_test = torch.from_numpy(df_test['rating'].to_numpy() - 1)

train = data_utils.TensorDataset(padded_sequence_train, labels_train)
test = data_utils.TensorDataset(padded_sequence_test, labels_test)

train_loader = data_utils.DataLoader(train, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = data_utils.DataLoader(test, batch_size=batch_size, shuffle=True, num_workers=4)

In [15]:
# the embedding dimension defines the size of our word vectors
# for our simple vocabulary and training set, we will keep these small
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
NUM_CLASSES = 5
BATCH_SIZE = 32

# instantiate our model
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx), NUM_CLASSES).to(device)

# define our loss and optimizer
# loss_function = nn.NLLLoss()
# loss_function = torch.nn.CrossEntropyLoss().to(device)
loss_function = torch.nn.NLLLoss().to(device)
# optimizer = optim.SGD(model.parameters(), lr=1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

lrscheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [16]:
# normally these epochs take a lot longer 
# but with our toy data (only 3 sentences), we can do many epochs in a short time

from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

n_epochs = 5

for epoch in range(n_epochs):

    model.train()

    train_losses = []
    train_accs = []
    
    iters = 0
    
    for index, (sentence, tag) in tqdm(enumerate(train_loader), total=len(training_data) // BATCH_SIZE):
                        
        model.zero_grad()

            
        sentence, tag = sentence.to(device), tag.to(device)
        
        model.hidden = model.init_hidden()
                
        tag_scores = model(sentence)

        loss = loss_function(tag_scores, tag)
        train_losses += [loss.item()]
        loss.backward()

        train_accs += [(tag_scores.argmax(1) == tag).sum().item() / batch_size]

        optimizer.step()

    print("Epoch: %d, acc: %1.5f, loss: %1.5f" % (epoch, np.mean(train_accs), np.mean(train_losses)))
    

HBox(children=(FloatProgress(value=0.0, max=20312.0), HTML(value='')))


Epoch: 0, acc: 0.53928, loss: 1.05301


HBox(children=(FloatProgress(value=0.0, max=20312.0), HTML(value='')))


Epoch: 1, acc: 0.58211, loss: 0.96140


HBox(children=(FloatProgress(value=0.0, max=20312.0), HTML(value='')))


Epoch: 2, acc: 0.59982, loss: 0.92222


HBox(children=(FloatProgress(value=0.0, max=20312.0), HTML(value='')))


Epoch: 3, acc: 0.61514, loss: 0.88866


HBox(children=(FloatProgress(value=0.0, max=20312.0), HTML(value='')))


Epoch: 4, acc: 0.63213, loss: 0.85434


In [17]:
padded_sequence_test = padded_sequence_test.to(device)
output = model(padded_sequence_test)
labels_test = torch.from_numpy(df_test['rating'].to_numpy() - 1).to(device)
print('Test accuracy:', (output.argmax(1) == labels_test).sum().item() / len(labels_test))

Test accuracy: 0.5587
