# Train an LSTM to classify Yelp reviews

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import gensim.downloader as api
from tqdm import tqdm

## Data processing

### Import data

In [2]:
# Load the raw reviews CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/raw_reviews/reviews_v1.csv')

In [3]:
# Pull the review text and the two annotation columns out of the frame
reviews = df['text']
food_labels = df['food']
service_labels = df['service']

### Get review labels
Compute the joint label distribution over four categories: food only, service only, both food and service, and neither.

In [4]:
# NOTE(review): wildcard import hides where names come from. In the cells shown here
# only `label_generator` appears to come from this module (presumably) — consider
# importing it explicitly once the rest of the notebook has been checked.
from src.data_processing.process_labels import *

In [5]:
# Build the label tensor from the raw food/service annotation arrays.
# NOTE(review): trim_and_fetch_labels presumably drops rows without usable labels,
# which is why y can be shorter than the raw frame — confirm in process_labels.
label_builder = label_generator(
    food_labels=food_labels.values,
    service_labels=service_labels.values,
)
y = label_builder.trim_and_fetch_labels()

In [6]:
# Sanity check: inspect the dimensions of the label tensor
y.shape

torch.Size([749, 4])

### Get word2vec embeddings

In [7]:
# Keep only as many reviews as there are labels in y.
# NOTE(review): assumes the labels kept in y correspond to the first len(y) reviews — confirm.
reviews = reviews.head(len(y)).copy()

In [8]:
# Tokenize reviews
## split each review into its individual words
from src.data_processing.word_tokenizer import basic_tokenizer
review_list = list(map(basic_tokenizer, reviews))

In [9]:
# Embed reviews
## each token is mapped to a 300-dim word2vec vector
from src.data_processing.word_tokenizer import batch_embedding
# Load the word2vec vectors from the locally saved gensim file
# NOTE(review): `model` here is the word-vector lookup, not the LSTM —
# take care not to shadow it when the classifier is defined later.
model = KeyedVectors.load('word2vec/word2vec-google-news-300.model')

In [10]:
# Word embeddings for all reviews.
# The recorded output of the next cell shows a (749, 652, 300) tensor; presumably
# (reviews, padded tokens per review, embedding dim) — TODO confirm the padding
# behaviour inside batch_embedding.
x_all = batch_embedding(review_list, model)

Fetching review embeddings:   0%|          | 0/749 [00:00<?, ?it/s]

Fetching review embeddings: 100%|██████████| 749/749 [00:00<00:00, 947.82it/s] 


In [11]:
# Sanity check: inspect the dimensions of the embedded-review tensor
x_all.size()

torch.Size([749, 652, 300])

### Train/validate/test split

In [12]:
from src.data_processing.train_val_test import train_val_test

In [13]:
# Split features and labels with one shared set of fractions so the two calls cannot drift apart.
# NOTE(review): x and y are split by separate calls; this keeps samples and labels aligned
# only if train_val_test is deterministic (no independent shuffling) — confirm in train_val_test.
split_fracs = dict(train_frac=0.6, val_frac=0.2, test_frac=0.2)
x_train, x_val, x_test = train_val_test(x_all, **split_fracs)
y_train, y_val, y_test = train_val_test(y, **split_fracs)

In [14]:
# Sanity check: training features and labels should agree on the first dimension
x_train.shape, y_train.shape

(torch.Size([449, 652, 300]), torch.Size([449, 4]))

### Create PyTorch Dataset and DataLoader

In [15]:
from torch.utils.data import DataLoader, TensorDataset

In [16]:
# Wrap each (features, labels) split in a TensorDataset so samples are indexed pairwise
dataset_train = TensorDataset(x_train, y_train)
dataset_val = TensorDataset(x_val, y_val)
dataset_test = TensorDataset(x_test, y_test)

In [17]:
# Create torch dataloaders.
# Only the training loader is shuffled: reshuffling each epoch helps optimisation,
# whereas validation/test order does not affect aggregate metrics and a fixed order
# keeps evaluation runs reproducible and comparable.
BATCH_SIZE = 1  # one review per batch, as before; defined once so all loaders stay in sync
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)