# Train an LSTM to classify Yelp reviews

In [4]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import gensim.downloader as api
from tqdm import tqdm

In [3]:
from src.data_processing.process_labels import *
from src.data_processing.process_reviews import *

## Data processing

### Import data

In [5]:
# Load the raw review export into a DataFrame.
reviews_csv_path = 'data/raw_reviews/reviews_v1.csv'
df = pd.read_csv(reviews_csv_path)

In [6]:
# Split the frame into the review text and the two per-aspect label columns.
reviews = df["text"]
food_labels = df["food"]
service_labels = df["service"]

### Get review labels
Get joint distribution for only food, only service, both food and service, and neither.

In [7]:
from src.data_processing.process_labels import *

In [8]:
# Build the label tensor from the raw food/service columns.
# NOTE(review): what trim_and_fetch_labels() drops is not visible from this
# notebook; the reviews are later cut to len(y) to match it.
labeler = label_generator(
    food_labels=food_labels.values,
    service_labels=service_labels.values,
)
y = labeler.trim_and_fetch_labels()

In [9]:
# Sanity check: one row per labelled review, one column per class.
y.shape

torch.Size([1000, 4])

### Get word2vec embeddings

In [10]:
# Keep only as many reviews as there are label rows so inputs and labels line up.
# NOTE(review): assumes the labels correspond to the first len(y) reviews, in
# order -- confirm against trim_and_fetch_labels.
n_labelled = len(y)
reviews = reviews[:n_labelled].copy()

In [11]:
# Tokenize reviews: run basic_tokenizer over every review to separate the words.
from src.data_processing.word_tokenizer import basic_tokenizer
review_list = list(map(basic_tokenizer, reviews))

In [12]:
# Embedding reviews: each token gets a 300-dim vector from word2vec.
from src.data_processing.word_tokenizer import batch_embedding
# Load the locally saved word2vec vectors.
# NOTE(review): gensim's load() unpickles the file -- only point it at a
# model file from a trusted source.
word2vec_path = 'word2vec/word2vec-google-news-300.model'
model = KeyedVectors.load(word2vec_path)

In [13]:
# Word embeddings for all reviews.
# The size printed in the next cell is (reviews, tokens, embedding dim);
# presumably every review is padded to a common token length -- TODO confirm
# in batch_embedding.
x_all = batch_embedding(review_list, model)

Fetching review embeddings: 100%|██████████| 1000/1000 [00:06<00:00, 143.15it/s]


In [14]:
# Shape check, written like the other shape checks in this notebook.
x_all.shape

torch.Size([1000, 652, 300])

In [17]:
# Class frequencies for [food only, service only, both, neither].
# minlength pins the result to one entry per label column; without it,
# bincount stops at the highest class id present, so a missing trailing class
# would silently shorten the tensor instead of showing up as 0.
class_ids = torch.argmax(y, dim=1)
torch.bincount(class_ids, minlength=y.shape[1]) / len(y)

tensor([0.3770, 0.0660, 0.5300, 0.0270])

### Train/validate/test split

In [12]:
from src.data_processing.train_val_test import train_val_test

In [13]:
# 60/20/20 train/validation/test split, using the same fractions for inputs and labels.
# NOTE(review): x and y are split in two separate calls, so rows stay paired
# only if train_val_test is deterministic (no unseeded shuffle) -- confirm in
# src.data_processing.train_val_test.
split_fracs = dict(train_frac=0.6, val_frac=0.2, test_frac=0.2)
x_train, x_val, x_test = train_val_test(x_all, **split_fracs)
y_train, y_val, y_test = train_val_test(y, **split_fracs)

In [14]:
# Sanity check: inputs and labels should have the same number of training rows.
x_train.shape, y_train.shape

(torch.Size([600, 652, 300]), torch.Size([600, 4]))

### Create Pytorch Dataset and Dataloader

In [15]:
from torch.utils.data import DataLoader, TensorDataset

In [16]:
# Wrap each split's (inputs, labels) pair in a TensorDataset.
dataset_train = TensorDataset(x_train, y_train)
dataset_val = TensorDataset(x_val, y_val)
dataset_test = TensorDataset(x_test, y_test)

In [17]:
# Create torch dataloaders.
# Only the training loader shuffles. Validation and test are read in a fixed
# order: shuffling adds nothing to evaluation and makes per-sample outputs
# harder to line up between runs.
# NOTE(review): every sample has the same padded shape, so a larger batch_size
# may be possible if LSTMmodel accepts batched input -- TODO confirm. It is
# kept at 1 here so training behaviour is unchanged.
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=1, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

## Hyperparameter Tuning

In [18]:
from src.models.model_zoo import *
from src.models.model_train import *
# NOTE(review): the star imports above are presumably where LSTMmodel and
# training_loop come from; switch to explicit imports once that is confirmed.

# Seed torch's global RNG so weight initialisation, dropout and DataLoader
# shuffling repeat across a fresh "Restart & Run All" (previously unseeded).
SEED = 42
torch.manual_seed(SEED)

# instantiate model
input_size = 300    # word2vec embedding width (last dim of x_all)
hidden_size = 300
num_layers = 2
dropout_prob = 0.5
output_size = 4     # food only / service only / both / neither
lstm_model = LSTMmodel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout_prob=dropout_prob,
)
# NOTE(review): y is (N, 4); CrossEntropyLoss reads a target of that shape as
# per-class probabilities and needs it to be floating point -- confirm y's
# dtype and how training_loop passes it.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

In [19]:
# Run the training loop (patience=5 is presumably early stopping; at most 50 epochs).
# NOTE(review): as saved, this cell fails with
#   TypeError: val_one_epoch() takes 3 positional arguments but 4 were given
# Somewhere under training_loop, val_one_epoch is called with one positional
# argument more than it accepts. The fix presumably belongs in
# src.models.model_train, not in this call.
train_loss_list, val_loss_list = training_loop(
    model=lstm_model,
    criterion=criterion,
    optimizer=optimizer,
    patience=5,
    dataloader_train=dataloader_train,
    dataloader_val=dataloader_val,
    epochs=50,
)

Training epochs:   0%|          | 0/50 [04:36<?, ?it/s]


TypeError: val_one_epoch() takes 3 positional arguments but 4 were given