In [70]:
import json
import math
import os
import re
import time

from datasets import load_dataset_builder, load_dataset

from bs4 import BeautifulSoup

import spacy

import numpy as np
import pandas as pd

import gensim.models
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

import torch
import torch.nn as nn

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [71]:
# ds_builder = load_dataset_builder('emotion')
# dataset = load_dataset("emotion")

# train_data = pd.DataFrame(dataset['train'])
# validation_data = pd.DataFrame(dataset['validation'])
# test_data = pd.DataFrame(dataset['test'])
# data = pd.concat([train_data, validation_data, test_data])

In [72]:
# Load the 'yelp_review_full' dataset (served from the local Hugging Face cache
# when already downloaded) and merge its two splits into a single DataFrame.
# NOTE(review): ds_builder is not used by any other cell visible in this notebook.
ds_builder = load_dataset_builder('yelp_review_full')
dataset = load_dataset("yelp_review_full")

# Each split becomes its own frame; concat keeps each frame's original row
# index, so index values repeat across the train and test portions.
train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])
data = pd.concat([train_data, test_data])

Found cached dataset yelp_review_full (C:/Users/Jose/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

In [73]:
def assignLabel(label):
    """Collapse a Yelp star label (0-4) into a coarse sentiment class.

    Mapping: 0, 1 -> 'negative'; 2 -> 'neutral'; 3, 4 -> 'positive'.

    A value that is already one of the three sentiment strings is returned
    unchanged, so applying this function twice to the same column (e.g. by
    re-running the notebook cell) is a no-op instead of silently turning
    every row into 'positive'.

    Raises:
        ValueError: if `label` is neither a 0-4 star label nor an
            already-mapped sentiment string.
    """
    sentiments = ('negative', 'neutral', 'positive')

    # Idempotence guard: already-mapped labels pass straight through.
    if isinstance(label, str) and label in sentiments:
        return label

    if label == 0 or label == 1:
        return 'negative'
    if label == 2:
        return 'neutral'
    if label == 3 or label == 4:
        return 'positive'

    # Anything else used to fall into the catch-all 'positive' branch,
    # which hid corrupted or unexpected labels.
    raise ValueError('Unexpected label: {!r}'.format(label))

In [74]:
# Replace the label column with the sentiment string returned by assignLabel
# for each row. This overwrites data['label'] in place.
data['label'] = data['label'].apply(assignLabel)
data

Unnamed: 0,label,text
0,positive,dr. goldberg offers everything i look for in a...
1,negative,"Unfortunately, the frustration of being Dr. Go..."
2,positive,Been going to Dr. Goldberg for over 10 years. ...
3,positive,Got a letter in the mail last week that said D...
4,negative,I don't know what Dr. Goldberg was like before...
...,...,...
49995,negative,Just wanted to write a review to chip in with ...
49996,positive,Great ambience. Great drinks. Great food. I lo...
49997,positive,I have been to the other Monks locations so I ...
49998,negative,Don't go here. I know you might want to try i...


In [75]:
# Load spaCy's 'en_core_web_sm' pipeline and grab its default stop-word
# collection; lemma() reads both `nlp` and `stopwords` as globals.
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words

In [76]:
# extracting text from tags
def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# removing square brackets
def remove_between_square_brackets(text):
    """Delete every `[...]` group (brackets included) from `text`.

    A group runs from '[' to the first following ']'. An opening bracket
    with no closing bracket is left untouched.
    """
    # Raw string: in a normal literal '\[' is an invalid escape sequence,
    # which Python reports as a warning today and will reject in the future.
    return re.sub(r'\[[^]]*\]', '', text)

# lemmatization and removal of stop words
def lemma(text):
    """Lower-case `text`, lemmatize it with spaCy and drop unwanted tokens.

    A lemma is kept when it is not a stop word and is either alphanumeric
    or one of '.', '!', '?'. If fewer than two lemmas survive, the result
    is wrapped in the sentinel tokens 'start' and 'end'.

    Returns the kept lemmas joined by single spaces.
    """
    kept_punctuation = ['.', '!', '?']

    kept = []
    for token in nlp(str(text).lower()):
        word = token.lemma_
        if word in stopwords:
            continue
        if word.isalnum() or word in kept_punctuation:
            kept.append(word)

    # Pad very short results with sentinel tokens.
    if len(kept) < 2:
        kept = ['start', *kept, 'end']

    return ' '.join(kept)

# cleaning text
def clean(text):
    """Run the full cleaning pipeline on one document.

    Steps, in order: coerce to str, strip HTML, drop `[...]` groups, then
    lemmatize and filter tokens via lemma().
    """
    without_markup = strip_html(str(text))
    without_brackets = remove_between_square_brackets(without_markup)
    return lemma(without_brackets)

In [77]:
# Persist the raw frame, then clean every review and persist the cleaned copy.
# to_csv does not create missing folders, so make sure './data' exists first.
os.makedirs('./data', exist_ok=True)

data.to_csv('./data/yelp.csv', index=False)

# clean() runs the spaCy pipeline once per row, so this step is slow on the
# full dataset; the cleaned CSV written below lets later cells reload the
# result instead of recomputing it.
data_clean = data.copy()
data_clean['text'] = data['text'].apply(clean)
data_clean.to_csv('./data/yelp_clean.csv', index=False)



KeyboardInterrupt: 

In [None]:
# data.to_csv('./data/data.csv', index=False)
# data_clean = data.copy()
# data_clean['text'] = data['text'].apply(clean)
# data_clean.to_csv('./data/data_clean.csv', index=False)

In [None]:
# Reload the cleaned reviews from the CSV written by the cleaning cell
# (presumably so the slow cleaning step does not have to be repeated).
data_clean = pd.read_csv('./data/yelp_clean.csv')
data_clean
# Alternative cleaned datasets (disabled):
# data_clean = pd.read_csv('./data/IMDB_Dataset_Clean.csv')
# data_clean
# data_clean = pd.read_csv('./data/data_clean.csv')
# data_clean

In [None]:
# Tokenize each cleaned review on whitespace; Word2Vec is trained on this
# list of token lists. A plain comprehension replaces the earlier
# append-inside-a-comprehension form, which built a throwaway list of None
# and overwrote IPython's `_` variable.
sentences = [review.split() for review in data_clean['text']]

In [None]:
t = time.time()

# Create the output folder before training so a missing directory cannot
# make save() fail after the (long) training run has already finished.
os.makedirs('./saved_models', exist_ok=True)

word2vec_model = gensim.models.Word2Vec(
    sentences=sentences,
    window=10,
    min_count=1,  # keep every token: the vectorization cell looks each word up directly
    negative=5,
    vector_size=100,
    workers=12)

word2vec_model.save('./saved_models/word2vec_model')

# The timer covers model construction/training and the save above.
print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))

In [None]:
# Word -> embedding lookup: index_to_key and vectors are parallel, so pairing
# them position by position gives each word its own vector row.
word2vecDict = dict(zip(word2vec_model.wv.index_to_key, word2vec_model.wv.vectors))

In [None]:
# Peek at the first ten words stored in the lookup table.
list(word2vecDict.keys())[:10]

In [None]:
# Turn every cleaned review into an array with one embedding row per token.
data_vectorized = [
    np.array([word2vecDict[token] for token in review.split()])
    for review in data_clean['text']
]

In [None]:
# Sanity check: show the first 3 words of the first text, followed by the
# first 3 rows of its vectorized form.
print(data_clean['text'][0].split()[:3])
print('\n')
print(data_vectorized[0][:3])

In [None]:
# encoding sentiment
# Fit a LabelEncoder on the sentiment strings and convert them to integer
# class ids; `le` is kept so le.classes_ can be consulted later.
le = preprocessing.LabelEncoder()
data_label = le.fit_transform(data_clean['label'])

In [None]:
# train, validation, test: 80/10/10
# First hold out 20% of the data, then cut that hold-out in half to obtain
# equally sized validation and test sets. (The previous test_size=1/4 gave
# a 75/12.5/12.5 split, which contradicted the stated 80/10/10 intent.)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    data_vectorized, data_label, test_size=0.2, random_state=1)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class LSTM(nn.Module):
    """Sequence classifier: LSTM -> dropout -> linear -> log-softmax.

    The model consumes one sequence at a time (batch size 1) shaped
    (seq_len, 1, input_size) and returns log-probabilities of shape
    (1, output_size) computed from the last time step.

    Args:
        input_size: size of each input vector.
        hidden_size: size of the LSTM hidden state.
        output_size: number of classes.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between stacked LSTM layers (forced to 0 when
            there is only one layer).
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout, bidirectional):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Stored as the number of directions (1 or 2), not as a bool, so it
        # can be used directly as a size multiplier below.
        self.bidirectional = 2 if bidirectional else 1

        # Inter-layer dropout is meaningless with a single layer.
        if num_layers < 2:
            dropout = 0

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional)
        # NOTE(review): output dropout is fixed at 0.2 regardless of the `dropout` argument.
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size*self.bidirectional, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, c):
        """Classify one sequence.

        Returns:
            (log_probs, hidden, c) where log_probs has shape
            (batch, output_size) and hidden/c are the final LSTM states.
        """
        output, (hidden, c) = self.lstm(input, (hidden, c))
        output = self.dropout(output)
        # Select the last time step explicitly: (seq_len, batch, features)
        # -> (batch, features). The earlier `squeeze()[-1].unsqueeze(0)`
        # also removed the sequence axis when seq_len == 1 and therefore
        # produced a wrongly shaped tensor for single-token inputs.
        output = output[-1]
        output = self.fc(output)
        output = self.softmax(output)
        return output, hidden, c

    def _zero_state(self):
        """Build a zero state for batch size 1 on the model's own device."""
        # Follow the parameters' device so the state always matches the
        # model, wherever it has been moved.
        state_device = next(self.parameters()).device
        return torch.zeros(self.bidirectional*self.num_layers, 1, self.hidden_size, device=state_device)

    def initHidden(self):
        """Return the initial hidden state (all zeros)."""
        return self._zero_state()

    def initC(self):
        """Return the initial cell state (all zeros)."""
        return self._zero_state()

In [None]:
def getLabel(output):
    """Return, as a Python int, the index of the largest entry in the first row of `output`."""
    best = output.topk(1)
    return best.indices[0].item()

In [None]:
def getElapsedTime(start_time):
    """Format the wall-clock time elapsed since `start_time` as 'Xm Ys'.

    Args:
        start_time: a timestamp previously obtained from time.time().

    Returns:
        A string such as '2m 5s'.
    """
    # Round to whole seconds BEFORE splitting into minutes and seconds.
    # Formatting the fractional remainder with '{:.0f}' could round
    # 59.6 s up and print '0m 60s'.
    elapsed = int(round(time.time() - start_time))
    m, s = divmod(elapsed, 60)
    return '{}m {:.0f}s'.format(m, s)

In [None]:
def train(X_train, y_train):
    """Run one optimization step on a single sample.

    Uses the notebook-level `lstm`, `criterion` and `optimizer`.

    Args:
        X_train: input tensor passed straight to the model.
        y_train: target tensor passed straight to the criterion.

    Returns:
        (model output, loss as a Python float).
    """
    # Every sample starts from fresh zero states.
    h0 = lstm.initHidden()
    c0 = lstm.initC()

    lstm.zero_grad()
    prediction, _, _ = lstm(X_train, h0, c0)

    sample_loss = criterion(prediction, y_train)
    sample_loss.backward()

    # Clip the gradient norm at 5 before the parameter update.
    nn.utils.clip_grad_norm_(lstm.parameters(), 5)
    optimizer.step()

    return prediction, sample_loss.item()

In [None]:
def test(X_test, y_test):
    """Evaluate the notebook-level `lstm` on a set of samples.

    Each sample is run on its own from fresh zero states, with gradient
    tracking disabled. The model is switched to eval mode for the duration
    of the call (so dropout is off) and its previous mode is restored
    afterwards.

    Args:
        X_test: sequence of per-sample inputs accepted by torch.tensor.
        y_test: sequence of integer class labels, parallel to X_test.

    Returns:
        (accuracy, mean loss) over all samples.

    Raises:
        ValueError: if X_test is empty.
    """
    n = len(X_test)
    if n == 0:
        raise ValueError('test() needs at least one sample to evaluate')

    correct = 0
    total_loss = 0

    # Remember the current mode so evaluation has no lasting side effect.
    was_training = lstm.training
    lstm.eval()
    try:
        with torch.no_grad():
            for i in range(n):
                hidden, c = lstm.initHidden(), lstm.initC()
                sample = torch.tensor(X_test[i], device=device).unsqueeze(1)
                target = torch.tensor([y_test[i]], dtype=torch.long, device=device)

                output, hidden, c = lstm(sample, hidden, c)
                total_loss += criterion(output, target).item()

                if getLabel(output) == y_test[i]:
                    correct += 1
    finally:
        lstm.train(was_training)

    accuracy = correct / n
    total_loss = total_loss / n

    return accuracy, total_loss

In [None]:
# strict parameters (dictated by the data)
input_size = len(X_train[0][0])   # length of one word vector
output_size = len(le.classes_)    # number of label classes

# adjustable parameters
hidden_size = 128
num_layers = 2
dropout = .2
bidirectional = False
lr = 0.001

# model, loss and optimizer
lstm = LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=bidirectional,
)
criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(lstm.parameters(), lr=lr)

In [None]:
epochs = 100
n = len(X_train)

print_every = 5000   # log a progress line every this many samples
plot_every = 1000    # size of the loss-averaging window used for the per-epoch plot

start_time = time.time()

total_epoch_loss = []
total_validation_loss = []

total_validation_accuracy = []

# A checkpoint is written after every epoch; make sure its folder exists
# before any training time is spent.
os.makedirs('./saved_models', exist_ok=True)

# Moving the model does not depend on the epoch, so do it once.
lstm.to(device)

for epoch in range(epochs):

    X_train, y_train = shuffle(X_train, y_train)

    epoch_loss = []        # mean loss of each completed window (plotted below)
    current_loss = 0       # summed loss of the window in progress
    window_count = 0       # number of samples in the window in progress
    epoch_total_loss = 0   # summed loss over the whole epoch

    lstm.train()

    for i in range(n):
        sample = torch.tensor(X_train[i], device=device).unsqueeze(1)
        target = torch.tensor([y_train[i]], dtype=torch.long, device=device)

        output, loss = train(sample, target)
        current_loss += loss
        epoch_total_loss += loss
        window_count += 1

        if i>0 and i % print_every == 0:
            y_pred = getLabel(output)
            if y_pred == y_train[i]:
                y_pred = "Correct! It's {}".format(y_pred)
            else:
                y_pred = 'Wrong! Guessed {} but should be: {}'.format(y_pred, y_train[i])

            print('Epoch %d Sample %d | %d%% | (%s) | Loss: %.4f | %s' % (epoch, i, i / n * 100, getElapsedTime(start_time), loss, y_pred))

        # Close a window after exactly plot_every samples. Counting samples
        # (instead of testing the index) avoids the off-by-one where the
        # first window held plot_every + 1 losses but was divided by plot_every.
        if window_count == plot_every:
            epoch_loss.append(current_loss / window_count)
            current_loss = 0
            window_count = 0

    # Keep the trailing partial window instead of silently dropping it.
    if window_count > 0:
        epoch_loss.append(current_loss / window_count)

    print('\n')

    # training loss: true per-sample mean over the epoch. This also works
    # when the epoch is shorter than one window, where averaging the
    # window means used to divide by zero.
    training_loss = epoch_total_loss / n if n else float('nan')

    # validation loss
    lstm.eval()
    validation_accuracy, validation_loss = test(X_validation, y_validation)

    # save metrics if continuing
    total_epoch_loss.append(training_loss)
    total_validation_accuracy.append(validation_accuracy)
    total_validation_loss.append(validation_loss)

    # save model
    torch.save(lstm.state_dict(), './saved_models/lstm_epoch_{}.pth'.format(epoch))

    # plot current epoch loss
    plt.title('Epoch {} Loss'.format(epoch))
    plt.plot(epoch_loss)
    plt.ylabel('Loss')
    plt.show()

In [None]:
# Training vs. validation loss, one point per epoch.
loss_fig, loss_ax = plt.subplots()
loss_ax.set_title('Total Loss')
loss_ax.plot(total_epoch_loss, label='Training Loss')
loss_ax.plot(total_validation_loss, label= 'Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

# Validation accuracy, one point per epoch.
acc_fig, acc_ax = plt.subplots()
acc_ax.set_title('Total Validation Accuracy')
acc_ax.plot(total_validation_accuracy)
plt.show()

In [None]:
# final test
# Evaluate on the held-out test split and print the accuracy;
# test_loss is kept in the namespace but not displayed here.
test_accuracy, test_loss = test(X_test, y_test)
print(test_accuracy)