In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import gensim.downloader as api
from tqdm import tqdm

### Tokenize Reviews
Tokenize each review and convert its tokens to pre-trained word2vec embeddings.

In [2]:
# Load the raw review dataset from CSV into a DataFrame
raw_reviews_csv = 'data/raw_reviews/reviews_v1.csv'
df = pd.read_csv(raw_reviews_csv)

In [3]:
# Pull the review text and the two label columns (food, service) out of the frame
reviews, food_labels, service_labels = df.text, df.food, df.service

In [4]:
# Tokenize reviews: apply the project's basic tokenizer to every review text,
# producing one entry per review in `review_list`
from src.data_processing.word_tokenizer import basic_tokenizer
review_list = list(map(basic_tokenizer, reviews))

In [5]:
# Load the pre-trained word2vec vectors from the saved KeyedVectors file
word2vec_path = 'word2vec/word2vec-google-news-300.model'
model = KeyedVectors.load(word2vec_path)

In [6]:
from src.data_processing.word_tokenizer import batch_embedding

In [7]:
# Embed every tokenized review in one call and display only the shape of the result.
# The result is deliberately not bound to a name, so the large tensor can be freed.
# NOTE(review): the output shape looks like (# reviews, longest review, embedding dim) — confirm against batch_embedding.
batch_embedding(review_list, model).shape

Fetching review embeddings: 100%|██████████| 9998/9998 [00:09<00:00, 1084.65it/s]


torch.Size([9998, 992, 300])

In [None]:
# Tokenize each review and convert to word2vec.
# Builds `reviews_all`: one float32 tensor of shape (n_tokens, embedding_dim) per
# tokenized review in `review_list` (the original looped over an undefined `X`).
embedding_dim = model.vector_size  # read from the model rather than hard-coding 300

# Shared zero vector for unknown words. float32 is torch's default floating dtype;
# np.zeros() would otherwise default to float64 and may not match the word vectors.
# NOTE(review): assumes the word2vec vectors are float32 — confirm.
unknown_vector = np.zeros(embedding_dim, dtype=np.float32)

reviews_all = []
for review in tqdm(review_list, desc='Processing reviews: '):
    review_embeddings = []
    for word in review:
        try:
            review_embeddings.append(model[word])
        except KeyError:  # for unknown word, use vector of zeros
            review_embeddings.append(unknown_vector)

    if review_embeddings:
        # Stack into one ndarray first: torch.tensor() on a list of ndarrays is slow.
        stacked = np.stack(review_embeddings).astype(np.float32, copy=False)
        review_tensor = torch.from_numpy(stacked)
    else:
        # Keep the embedding axis for empty reviews so pad_sequence() still works.
        review_tensor = torch.zeros((0, embedding_dim), dtype=torch.float32)

    reviews_all.append(review_tensor)

In [None]:
# Pad every review tensor to the length of the longest review with pad_sequence(),
# so the whole batch becomes one rectangular tensor; padded positions hold -1.
# Resulting shape: (# reviews, len longest review, embedding dimension (300))
padded_reviews = pad_sequence(reviews_all, batch_first=True, padding_value=-1)