# Imports

In [1]:
from argparse import Namespace
import collections
import os
from pathlib import Path
import re
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
# Load (or reload) the autoreload extension; mode 2 re-imports all modules
# before each cell execution so edits to local .py files are picked up.
%reload_ext autoreload
%autoreload 2

# Load Data

In [3]:
# Directory holding the Yelp review CSVs, relative to this notebook.
PATH = Path('../data/yelp/')
# List the directory contents as a quick sanity check.
!ls {PATH}

raw_test.csv                 reviews_with_splits_full.csv
raw_train.csv                reviews_with_splits_lite.csv


In [4]:
!wc -l ../data/yelp/*.csv

   38000 ../data/yelp/raw_test.csv
  560000 ../data/yelp/raw_train.csv
  598001 ../data/yelp/reviews_with_splits_full.csv
   56001 ../data/yelp/reviews_with_splits_lite.csv
 1252002 total


In [5]:
# Notebook configuration: input/output file locations and split parameters.
args = Namespace(
    # raw input CSVs
    raw_train_csv = PATH / 'raw_train.csv',
    raw_test_csv = PATH / 'raw_test.csv',
    # fraction of raw_train rows kept as 'train'; the remainder becomes 'val'
    train_prop = 0.7,
    # NOTE(review): val_prop is not read by any cell shown in this notebook
    val_prop = 0.3,
    # RNG seed intended for reproducible shuffling
    seed = 123,
    # destination of the combined train/val/test CSV
    output_csv = PATH / 'reviews_with_splits_full.csv'
)

In [6]:
# Peek at the first two lines of the raw training CSV to learn its layout.
!head -2 {args.raw_train_csv}

"1","Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."
"2","Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. 

It looks like there are no headers. Also, the first column is the rating and the second one is the review.

In [7]:
# Read both raw CSVs with identical options: the files carry no header row,
# so the two columns are named explicitly.
train_df, test_df = (
    pd.read_csv(csv_path, header=None, names=['rating', 'review'])
    for csv_path in (args.raw_train_csv, args.raw_test_csv)
)
train_df.shape, test_df.shape

((560000, 2), (38000, 2))

# Preprocess Data

In [8]:
# Count missing values in the rating column of each raw frame
tuple(frame.rating.isna().sum() for frame in (train_df, test_df))

(0, 0)

In [9]:
# Count missing values in the review column of each raw frame
tuple(frame.review.isna().sum() for frame in (train_df, test_df))

(0, 0)

In [10]:
# Is any training review an empty string?
train_df.review.map(len).eq(0).any()

False

In [11]:
# Is any test review an empty string?
test_df.review.map(len).eq(0).any()

False

In [12]:
# First three rows of the raw training frame.
train_df.head(3)

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...


In [13]:
# First three rows of the raw test frame.
test_df.head(3)

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."


In [14]:
# Distinct rating values present in each raw frame
tuple(set(frame.rating) for frame in (train_df, test_df))

({1, 2}, {1, 2})

In [15]:
# Show the first training row as a dict so the review text is not truncated.
train_df.iloc[0].to_dict()

{'rating': 1,
 'review': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."}

In [16]:
# Class distribution of the training ratings.
train_df.rating.value_counts()

2    280000
1    280000
Name: rating, dtype: int64

Balanced dataset.

In [17]:
# Add split column: every raw_train row starts as 'train' (a later cell
# relabels part of them as 'val'); every raw_test row is 'test'.
train_df['split'] = 'train'
test_df['split'] = 'test'

In [18]:
# Shuffle the training set and split it into train and validation subsets.
# A RandomState seeded from args.seed makes the assignment reproducible across
# kernel restarts and re-runs of this cell.
rng = np.random.RandomState(args.seed)
idxs = rng.permutation(train_df.index.to_list())

# The first train_prop share of the shuffled labels stays 'train';
# the remainder becomes 'val'.
n_train = int(len(train_df) * args.train_prop)
train_idxs = idxs[:n_train]
val_idxs = idxs[n_train:]
# Label both subsets explicitly so the column never keeps stale values from
# an earlier execution of this cell.
train_df.loc[train_idxs, 'split'] = 'train'
train_df.loc[val_idxs, 'split'] = 'val'

train_df.head()

Unnamed: 0,rating,review,split
0,1,"Unfortunately, the frustration of being Dr. Go...",val
1,2,Been going to Dr. Goldberg for over 10 years. ...,train
2,1,I don't know what Dr. Goldberg was like before...,train
3,1,I'm writing this review to give you a heads up...,val
4,2,All the food is great here. But the best thing...,train


In [19]:
# Number of rows assigned to each split within train_df.
train_df.split.value_counts()

train    392000
val      168000
Name: split, dtype: int64

In [20]:
# Split x rating table as fractions of all rows; margins adds the 'All' totals.
pd.crosstab(train_df.split, train_df.rating, margins=True, normalize=True)

rating,1,2,All
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
train,0.349882,0.350118,0.7
val,0.150118,0.149882,0.3
All,0.5,0.5,1.0


In [21]:
# Combine the train/val rows and the test rows into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# labels of train_df and test_df overlap, which makes label-based lookups
# (.loc) on final_df ambiguous.
final_df = pd.concat([train_df, test_df], ignore_index=True)
final_df.head()

Unnamed: 0,rating,review,split
0,1,"Unfortunately, the frustration of being Dr. Go...",val
1,2,Been going to Dr. Goldberg for over 10 years. ...,train
2,1,I don't know what Dr. Goldberg was like before...,train
3,1,I'm writing this review to give you a heads up...,val
4,2,All the food is great here. But the best thing...,train


In [22]:
# Number of rows per split in the combined frame.
final_df.split.value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [23]:
# Split x rating table for the combined frame, as fractions of all rows.
pd.crosstab(final_df.split, final_df.rating, margins=True, normalize=True)

rating,1,2,All
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,0.031773,0.031773,0.063545
train,0.327649,0.32787,0.655518
val,0.140579,0.140358,0.280936
All,0.5,0.5,1.0


In [24]:
# Preprocess the dataset
def preprocess_text(text):
    """Normalize a raw review string.

    The text is lowercased, each of the marks ``.``, ``,``, ``!`` and ``?``
    is surrounded by spaces, and every run of characters other than ASCII
    letters or those four marks is collapsed into a single space.

    Args:
        text (str): raw review text.

    Returns:
        str: the normalized text. Leading and trailing spaces are kept.
    """
    lowered = text.lower()
    # isolate '.', ',', '!' and '?' from the neighbouring words
    spaced = re.sub(r'([.,!?])', r' \1 ', lowered)
    # digits, apostrophes, newlines and whitespace runs each become one space
    return re.sub(r'[^a-zA-Z!?.,]+', r' ', spaced)

In [25]:
# Overwrite the review column with its normalized form (see preprocess_text).
final_df['review'] = final_df.review.apply(preprocess_text)

In [26]:
# Convert numeric ratings to sentiment labels. Values that are already labels
# are passed through unchanged, so re-running this cell does not turn them
# into NaN (Series.map with a plain dict yields NaN for keys it lacks).
rating_labels = {1: 'negative', 2: 'positive'}
final_df['rating'] = final_df.rating.map(lambda rating: rating_labels.get(rating, rating))
# Fail loudly if any rating is neither a known code nor a known label.
assert final_df.rating.isin(list(rating_labels.values())).all(), 'unexpected rating value'
final_df.head()

Unnamed: 0,rating,review,split
0,negative,"unfortunately , the frustration of being dr . ...",val
1,positive,been going to dr . goldberg for over years . i...,train
2,negative,i don t know what dr . goldberg was like befor...,train
3,negative,i m writing this review to give you a heads up...,val
4,positive,all the food is great here . but the best thin...,train


In [27]:
# Write the combined dataset to args.output_csv without the index column.
final_df.to_csv(args.output_csv, index=False)

In [28]:
# List the data directory again after writing the output file.
%ls {PATH}

raw_test.csv                  reviews_with_splits_full.csv
raw_train.csv                 reviews_with_splits_lite.csv


In [29]:
!wc -l ../data/yelp/*.csv

   38000 ../data/yelp/raw_test.csv
  560000 ../data/yelp/raw_train.csv
  598001 ../data/yelp/reviews_with_splits_full.csv
   56001 ../data/yelp/reviews_with_splits_lite.csv
 1252002 total


# Test

In [30]:
# Read back the file written by the to_csv cell, using the configured output
# path rather than a second hard-coded copy of the filename.
df = pd.read_csv(args.output_csv)

In [31]:
# Join all reviews into a single space-separated string.
text = df.review.str.cat(sep=' ')

In [32]:
# Inspect the first 1000 characters of the concatenated text.
text[:1000]

'unfortunately , the frustration of being dr . goldberg s patient is a repeat of the experience i ve had with so many other doctors in nyc good doctor , terrible staff . it seems that his staff simply never answers the phone . it usually takes hours of repeated calling to get an answer . who has time for that or wants to deal with it ? i have run into this problem with many other doctors and i just don t get it . you have office workers , you have patients with medical needs , why isn t anyone answering the phone ? it s incomprehensible and not work the aggravation . it s with regret that i feel that i have to give dr . goldberg stars .  been going to dr . goldberg for over years . i think i was one of his st patients when he started at mhmg . he s been great over the years and is really all about the big picture . it is because of him , not my now former gyn dr . markoff , that i found out i have fibroids . he explores all options with you and is very patient and understanding . he do