# Preprocessing Yelp Dataset (Full version)

In [1]:
import re

import collections
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from argparse import Namespace

tqdm.pandas(desc="Preprocessing Reviews")

  from pandas import Panel


## Define configurations

In [2]:
# Every path and constant the notebook relies on, collected in one place.
args = Namespace(
    # Input: raw Yelp review CSVs (read below without a header row).
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    # Output: single CSV holding the preprocessed reviews and their split label.
    output_munged_csv="data/yelp/reviews_with_splits_full.csv",
    # Seed value intended for the randomized steps.
    seed=1337,
)

## Read Data

In [7]:
def load_reviews(csv_path):
    """Read one raw Yelp review CSV and drop rows without review text.

    The raw files carry no header row, so the two columns are named
    explicitly as ``rating`` and ``review``.

    Args:
        csv_path (str): path to the raw CSV file.

    Returns:
        pandas.DataFrame: the rows whose ``review`` is not null, returned as an
        independent copy so that columns can be added to it later without
        writing into a filtered selection of the frame that was read.
    """
    reviews = pd.read_csv(csv_path, header=None, names=["rating", "review"])
    return reviews.dropna(subset=["review"]).copy()


# Train and test go through the same loader so they are filtered identically.
train_reviews = load_reviews(args.raw_train_dataset_csv)
test_reviews = load_reviews(args.raw_test_dataset_csv)

In [8]:
# Peek at the first rows of the loaded training reviews (rating + review text).
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


## Split data into Train/Dev/Test

In [15]:
from sklearn.model_selection import train_test_split

# Rating counts for the raw train and test sets; returning a tuple displays both
# from a single cell.
train_reviews.rating.value_counts(), test_reviews.rating.value_counts()

(2    280000
 1    280000
 Name: rating, dtype: int64,
 2    19000
 1    19000
 Name: rating, dtype: int64)

In [22]:
# Hold out 30% of the labelled training data as a validation set.
# `stratify` keeps the rating proportions the same in both partitions, and
# `random_state` pins the shuffle to the configured seed so that re-running
# the notebook from a fresh kernel reproduces exactly the same split.
train, val = train_test_split(
    train_reviews,
    train_size=0.7,
    stratify=train_reviews.rating.values,
    random_state=args.seed,
)
# Work on independent copies of the two partitions; a "split" column is
# assigned to each of them further down.
train = train.copy()
val = val.copy()

In [23]:
train.rating.value_counts()  # Training split: both ratings should stay equally represented (stratified split)

2    196000
1    196000
Name: rating, dtype: int64

In [24]:
val.rating.value_counts()  # Validation split: should show the same rating balance as the training split

2    84000
1    84000
Name: rating, dtype: int64

In [25]:
test_reviews.rating.value_counts()  # Test set: check that its rating balance matches the other splits

2    19000
1    19000
Name: rating, dtype: int64

In [26]:
# Renumber the rows of each partition from zero, discarding the old index
# (the frames are modified in place).
for frame in (train, val, test_reviews):
    frame.reset_index(drop=True, inplace=True)

In [27]:
# Tag every row with the name of the partition it belongs to.
for frame, split_name in ((train, "train"), (val, "val"), (test_reviews, "test")):
    frame["split"] = split_name

## Final reviews

In [28]:
# Stack the three labelled partitions (train, then val, then test) into a
# single frame whose rows are renumbered consecutively from zero.
final_reviews = pd.concat(
    [train, val, test_reviews],
    axis=0,
    copy=True,
    ignore_index=True,
)

In [29]:
# Number of rows carried by each split label in the combined frame.
final_reviews.split.value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [31]:
# Look at the first few raw review texts before they are preprocessed.
final_reviews.review.head()

0    I have been here once before and was impressed...
1    Came here for dinner as it walking distance fr...
2    I found this company through the BBB site.  Lo...
3    They have the most amazing warm, freshly baked...
4    This was our first taste of Ethiopian food, an...
Name: review, dtype: object

In [33]:
# Sanity check: select any rows whose review text is missing. The result is
# expected to be empty because null reviews were filtered out when loading.
final_reviews[(final_reviews.review.isnull())]

Unnamed: 0,rating,review,split


## Preprocess Reviews

In [34]:
# Preprocess the reviews
def preprocess_text(text):
    """Normalize a raw review into lowercase, space-separated tokens.

    The text is lowercased, each of the marks ``. , ! ?`` is padded with a
    space on both sides, and every run of characters other than ASCII letters
    and those four marks is collapsed into a single space.

    Args:
        text (str): the raw review text.

    Returns:
        str: the normalized text. Leading and trailing spaces are not stripped.
    """
    lowered = text.lower()
    # Pad the kept punctuation so each mark stands apart from its neighbours.
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Digits, symbols and whitespace runs all shrink to one space.
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
    
# Normalize every review, overwriting the column; progress_apply shows the
# progress bar registered through tqdm.pandas at the top of the notebook.
final_reviews["review"] = final_reviews["review"].progress_apply(preprocess_text)

HBox(children=(FloatProgress(value=0.0, description='Preprocessing Reviews', max=598000.0, style=ProgressStyle…




In [35]:
# Translate the numeric ratings into string sentiment labels.
# The lookup also maps each label to itself, so re-running this cell leaves
# already-converted ratings untouched instead of turning all of them into None.
rating_to_label = {1: 'negative', 2: 'positive'}
label_lookup = dict(rating_to_label)
label_lookup.update((label, label) for label in rating_to_label.values())

rating_labels = final_reviews.rating.progress_apply(label_lookup.get)

# Fail loudly on an unexpected rating rather than writing missing labels into
# the output CSV.
unmapped = rating_labels.isnull()
if unmapped.any():
    unexpected = final_reviews.rating[unmapped].unique().tolist()
    raise ValueError("Unexpected rating values: {}".format(unexpected))

final_reviews['rating'] = rating_labels

HBox(children=(FloatProgress(value=0.0, description='Preprocessing Reviews', max=598000.0, style=ProgressStyle…




In [36]:
# Inspect the first rows after text normalization and rating relabelling.
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,i have been here once before and was impressed...,train
1,negative,came here for dinner as it walking distance fr...,train
2,positive,i found this company through the bbb site . lo...,train
3,positive,"they have the most amazing warm , freshly bake...",train
4,positive,"this was our first taste of ethiopian food , a...",train


In [37]:
# Persist the combined, preprocessed frame to the configured output path;
# index=False keeps the row index out of the CSV.
final_reviews.to_csv(args.output_munged_csv, index=False)