<a href="https://colab.research.google.com/github/JJ0131/NLPwithPyTorchBook/blob/master/NLPwithPyTorch_Yelp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Very Basics

In [0]:
import torch
import torch.nn as nn

In [0]:
x = torch.ones((2,2),requires_grad=True)
x
y = x+2
y
z = y*y*3
z

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>)

In [0]:
z.backward(torch.tensor([[1.,2.],[2.,2.]]))

In [0]:
x.grad

tensor([[18., 36.],
        [36., 36.]])

tensor([2., 3.])

In [0]:
x = torch.randn(3, requires_grad=True)

y = x * 2

In [0]:
y.data.norm()

tensor(5.1892)

In [0]:
y=0
for x in [-0.4022, -1.8039, -0.8658]:
  x = x*x
  y = y+x

print(y)

4.16542969


# Define your Model: Perceptron as an example

In [0]:
class Perceptron(nn.Module):
  def __init__(self,input_dim):
    super(Perceptron,self).__init__()
    self.fc1 = nn.Linear(input_dim,1)

  def forward(self, x_in):
    return torch.sigmoid(self.fc1(x_in)).squeeze()

In [0]:
p1 = Perceptron(3)

In [0]:
x = torch.randn((5,3))
x

tensor([[ 0.3397,  1.1633,  1.8041],
        [ 0.2138, -0.2485, -0.8075],
        [-0.2177,  0.9454,  0.1068],
        [ 0.7367, -1.4367, -1.3619],
        [ 0.4786,  0.8320, -1.6696]])

In [0]:
p1.forward(x)

tensor([0.4243, 0.3112, 0.3596, 0.2834, 0.1678], grad_fn=<SqueezeBackward0>)

# Loss Functions

##### Mean Squared Error

In [0]:
mse_loss = nn.MSELoss()
outputs = torch.randn((3,5), requires_grad=True)
targets = torch.randn((3,5))
loss = mse_loss(outputs,targets)
print(loss)

tensor(4.2377, grad_fn=<MseLossBackward>)


##### Categorical Cross-Entropy Loss

In [0]:
ce_loss = nn.CrossEntropyLoss()
outputs = torch.randn((3,5), requires_grad=True)
# targets = torch.tensor([1,0,3])
targets = torch.randint(0,3,(1,3)).squeeze()
loss = ce_loss(outputs,targets)
print(loss)

tensor(1.3355, grad_fn=<NllLossBackward>)


##### Binary Cross-Entropy Loss

In [0]:
bce_loss = nn.BCELoss()
outputs = torch.rand((4,1), requires_grad=True)
# targets = torch.tensor([1,0,3])
targets = torch.randint(0,1,(4,1),dtype=torch.float32)
loss = bce_loss(outputs,targets)
print(loss)

tensor(1.2456, grad_fn=<BinaryCrossEntropyBackward>)


# Choose an Optimizer

In [0]:
import torch.optim as optim
input_dim = 5
lr = 0.001

perceptron = Perceptron(input_dim=5)
bce_loss = nn.BCELoss()
outputs = torch.rand((4,1), requires_grad=True)
# targets = torch.tensor([1,0,3])
targets = torch.randint(0,1,(4,1),dtype=torch.float32)
loss = bce_loss(outputs,targets)
optimizer = optim.Adam(perceptron.parameters(),lr=lr)

# The Full Process

In [0]:
# each epoch is a complete pass over the training data
for epoch_i in range(n_epochs):
    # the inner loop is over the batches in the dataset
    for batch_i in range(n_batches):

        # Step 0: Get the data
        x_data, y_target = get_toy_data(batch_size)

        # Step 1: Clear the gradients 
        perceptron.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = perceptron(x_data, apply_sigmoid=True)

        # Step 3: Compute the loss value that we wish to optimize
        loss = bce_loss(y_pred, y_target)

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()

# An Example: Classifying Sentiment of Restaurant Reviews

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import collections
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn

from argparse import Namespace

In [0]:
args = Namespace(
    raw_train_dataset_csv=r"drive/My Drive/yelp/raw_train.csv",
    raw_test_dataset_csv=r"drive/My Drive/yelp/raw_test.csv",
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv=r"drive/My Drive/yelp/reviews_with_splits_lite.csv",
    seed=1337
)

In [0]:
train_reviews = pd.read_csv(args.raw_train_dataset_csv,names = ['rating','review'])
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [0]:
train_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560000 entries, 0 to 559999
Data columns (total 2 columns):
rating    560000 non-null int64
review    560000 non-null object
dtypes: int64(1), object(1)
memory usage: 8.5+ MB


In [0]:
train_reviews.rating.value_counts()

2    280000
1    280000
Name: rating, dtype: int64

# My version of subsetting Start

In [0]:
negative_index = train_reviews.query('rating==1').index
negative_index
np.random.seed(args.seed)
negative_index = train_reviews.query('rating==1').index.to_list()
positive_index = train_reviews.query('rating==2').index.to_list()

negative_subset_size = int(len(negative_index)*args.proportion_subset_of_train)
negative_train_size = int(negative_subset_size * args.train_proportion)
negative_val_size = int(negative_subset_size*args.val_proportion)
negative_test_size = int(negative_subset_size*args.test_proportion)

positive_subset_size = int(len(positive_index)*args.proportion_subset_of_train)
positive_train_size = int(positive_subset_size * args.train_proportion)
positive_val_size = int(positive_subset_size*args.val_proportion)
positive_test_size = int(positive_subset_size*args.test_proportion)


negative_subset_index = list(np.random.choice(negative_index,negative_subset_size))
positive_subset_index = list(np.random.choice(positive_index,positive_subset_size))


negative_train_index = list(negative_subset_index[:negative_train_size])
negative_val_index = list(negative_subset_index[negative_train_size:negative_train_size+negative_val_size])
negative_test_index = list(negative_subset_index[negative_train_size+negative_val_size:])

positive_train_index =  list(positive_subset_index[:positive_train_size])
positive_val_index =    list(positive_subset_index[positive_train_size:positive_train_size+positive_val_size])
positive_test_index =   list(positive_subset_index[positive_train_size+positive_val_size:])

train_subset = train_reviews.iloc[negative_train_index+positive_train_index].copy()
val_subset = train_reviews.iloc[negative_val_index+positive_val_index].copy()
test_subset = train_reviews.iloc[negative_test_index+positive_test_index].copy()

In [0]:
print(len(train_subset),args.train_proportion)
print(len(val_subset),args.val_proportion)
print(len(test_subset),args.test_proportion)

39200 0.7
8400 0.15
8400 0.15


# Author's version of subsetting (which sucks and is obscure)

##### Take 1/10 as the Subset of the Big Dataset

In [0]:
# Put the dataframe into a dictionary, using rating as the key
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
  by_rating[row.rating].append(row.to_dict())

In [0]:
for x in by_rating:
  print(by_rating[x][:2])

[{'rating': 1, 'review': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."}, {'rating': 1, 'review': "I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in 

##### dict.items() example

In [0]:
_ = {'1':[1,2,3,4],'2':[2,4,6,8],'3':[3,6,9,12]}
for key,values in _.items():
  print(key)
  print(values)

##### Take the subset based on propotion paramter

In [0]:
review_subset = []
for key, values in by_rating.items():
  n_total = len(values)
  n_subset = int(n_total * args.proportion_subset_of_train)
  review_subset.extend(values[:n_subset])

review_subset = pd.DataFrame(review_subset)

In [0]:
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [0]:
review_subset.rating.value_counts()

2    28000
1    28000
Name: rating, dtype: int64

##### Build a new dataframe with the subset

In [0]:
# Splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

In [0]:
for key, values in by_rating.items():
  print(key)
  print(values[0])

1
{'rating': 1, 'review': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."}
2
{'rating': 2, 'review': "Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with

In [0]:
final_list = []
np.random.seed(args.seed)

for key, reviews in by_rating.items():
  np.random.shuffle(reviews)

  n_total = len(reviews)
  n_train = int(args.train_proportion*n_total)
  n_val   = int(args.val_proportion*n_total)
  n_test  = int(args.test_proportion*n_total)

  # Give data point a split attribute
  for item in reviews[:n_train]:
      item['split'] = 'train'
  
  for item in reviews[n_train:n_train+n_val]:
      item['split'] = 'val'
      
  for item in reviews[n_train+n_val:n_train+n_val+n_test]:
      item['split'] = 'test'

  # Add to final list
  final_list.extend(reviews)


In [0]:
final_list[:3]

NameError: ignored

In [0]:
# Write split data to file
final_reviews = pd.DataFrame(final_list)
final_reviews.split.value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

In [0]:
# Preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [0]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)

In [0]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [0]:
final_reviews.to_csv(args.output_munged_csv, index=False)

# Classification

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [0]:
args = Namespace(
    # Data and Path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv="drive/My Drive/yelp/reviews_with_splits_lite.csv",
    # review_csv='data/yelp/reviews_with_splits_full.csv',
    save_dir='model_storage/ch3/yelp/',
    vectorizer_file='vectorizer.json',
    # No Model hyper parameters
    # Training hyper parameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

In [0]:
temp = pd.read_csv(args.review_csv)

In [0]:
temp['rating'] = temp['rating'].map(lambda x: 0 if x=='negative' else 1)

In [13]:
temp.head(5)

Unnamed: 0,rating,review,split
0,0,terrible place to work for i just heard a stor...,train
1,0,"hours , minutes total time for an extremely s...",train
2,0,my less than stellar review is for service . w...,train
3,0,i m granting one star because there s no way t...,train
4,0,the food here is mediocre at best . i went aft...,train


In [0]:
train_set = temp.query("split=='train'").copy()
val_set = temp.query("split=='val'").copy()
test_set = temp.query("split=='test'").copy()

In [15]:
train_set.head()

Unnamed: 0,rating,review,split
0,0,terrible place to work for i just heard a stor...,train
1,0,"hours , minutes total time for an extremely s...",train
2,0,my less than stellar review is for service . w...,train
3,0,i m granting one star because there s no way t...,train
4,0,the food here is mediocre at best . i went aft...,train


In [16]:
test_set.head()

Unnamed: 0,rating,review,split
23800,0,visited last saturday evening and nothing was ...,test
23801,0,nothing at this location . i would rather go e...,test
23802,0,this may very well be what passes for good ita...,test
23803,0,i was about to get new tires but this store wo...,test
23804,0,the food is usually good and most of the staff...,test


### Make a Dictionary of Tokens

In [17]:
# make a dictionary
from collections import defaultdict
vocabulary_dict = defaultdict(lambda: 0)
for indice,row in train_set.iterrows():
  for token in row['review'].split(" "):
    if token not in string.punctuation:
      vocabulary_dict[token]+=1

for key in list(vocabulary_dict.keys())[:5]:
  print(key,' ',vocabulary_dict[key])

terrible   1448
place   21439
to   129606
work   3881
for   58446


In [0]:
# make a lookup for words and index
# unknown words's index is 0
token_to_idx = defaultdict(lambda : 0)
idx_to_token = defaultdict(lambda : 0)

# since unknown word is 0, then known words start from 1
counter=1
for key,value in vocabulary_dict.items():
  if value > args.frequency_cutoff:
    token_to_idx[key]=counter
    idx_to_token[counter] = key
    counter+=1

In [19]:
for key in list(token_to_idx.keys())[:10]:
  print(key,' ',token_to_idx[key])

terrible   1
place   2
to   3
work   4
for   5
i   6
just   7
heard   8
a   9
story   10


In [20]:
for key in list(idx_to_token.keys())[:10]:
  print(key,' ',idx_to_token[key])

1   terrible
2   place
3   to
4   work
5   for
6   i
7   just
8   heard
9   a
10   story


### Convert sentences into vectors using dictionaries

In [21]:
print('Total number of words in the reviews: ', len(vocabulary_dict))

Total number of words in the reviews:  52984


In [22]:
print('Total number of tokens used: ', len(token_to_idx))

Total number of tokens used:  7325


In [23]:
token_to_idx['word do not exists']

0

### Convert Sentences into Vectors

In [0]:
len_vector = len(token_to_idx)+1
train_vectors = []
for indice, row in train_set.iterrows():
  sent = row.review
  vector = [0]*len_vector
  for word in sent.split(" "):
    if word not in string.punctuation:
      vector[token_to_idx[word]]=1
  # now vector for the sentence is built
  train_vectors.append(vector)

val_vectors = []
for indice, row in val_set.iterrows():
  sent = row.review
  vector = [0]*len_vector
  for word in sent.split(" "):
    if word not in string.punctuation:
      vector[token_to_idx[word]]=1
  # now vector for the sentence is built
  val_vectors.append(vector)

test_vectors = []
for indice, row in test_set.iterrows():
  sent = row.review
  vector = [0]*len_vector
  for word in sent.split(" "):
    if word not in string.punctuation:
      vector[token_to_idx[word]]=1
  # now vector for the sentence is built
  test_vectors.append(vector)

In [25]:
print(len(train_vectors),len(val_vectors),len(test_vectors))

39200 8400 8400


In [0]:
train_vectors = np.array(train_vectors)
val_vectors = np.array(val_vectors)
test_vectors = np.array(test_vectors)

y_train =train_set.rating.values
y_val =val_set.rating.values
y_test =test_set.rating.values

In [28]:
print(train_vectors.shape, y_train.shape)
print(val_vectors.shape, y_val.shape)
print(test_vectors.shape, y_test.shape)

(39200, 7327) (39200,)
(8400, 7327) (8400,)
(8400, 7327) (8400,)


In [0]:
np.savez('drive/My Drive/yelp/dataset.npz',
         train_vectors = np.array(train_vectors),
         val_vectors = np.array(val_vectors),
         test_vectors = np.array(test_vectors),
         y_train =train_set.rating.values,
         y_val =val_set.rating.values,
         y_test =test_set.rating.values)


### Build a Dataset and a Dataloader

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [0]:
def set_seed_everywhere(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

# R.I.P Kobe
set_seed_everywhere(248)

In [0]:
class ReviewDataset(Dataset):
  """Dataset class for the review vectors"""
  def __init__(self,file_path='drive/My Drive/yelp/dataset.npz',partition='train'):
    """
    Args:
      file_Path (string): path to the numpy files
      partition (string): can be 'train','val','test
    """
    loaded_data = np.load(file_path)
    self.X = loaded_data[partition+'_vectors']
    self.y = loaded_data['y_'+partition]

  def __len__(self):
    return len(self.X)

  def __getitem__(self,idx):
    sample = {'review':self.X[idx].astype(float),
              'rating':self.y[idx].astype(float)}

    return sample

In [7]:
# Dataset
train_dataset = ReviewDataset(partition='train')
print(len(train_dataset))
for i in range(4):
  sample = train_dataset[i]
  print(i,sample['review'].shape,sample['rating'])

39200
0 (7327,) 0.0
1 (7327,) 0.0
2 (7327,) 0.0
3 (7327,) 0.0


In [9]:
# Dataloader
train_loader=DataLoader(train_dataset,batch_size=10,shuffle=True)
for i_batch, samples_batched in enumerate(train_loader):
  if i_batch==2000:
    print(samples_batched['rating'])
    print(samples_batched['review'].shape)

tensor([0., 1., 1., 1., 0., 1., 0., 1., 0., 1.])
torch.Size([10, 7327])
