# Preprocessing Yelp Dataset (light version)

In [1]:
import re

import collections
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from argparse import Namespace

# Enable the `progress_apply` calls used further down; `desc` is the label
# displayed next to each progress bar.
tqdm.pandas(desc="Preprocessing Reviews")

  from pandas import Panel


## Define configurations

In [2]:
# Every tunable value for this notebook lives in this one namespace.
args = Namespace(
    # Raw input CSVs.
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    # Share of the training data kept for the light version (10%).
    proportion_subset_of_train=0.1,
    # Where the processed reviews (with their split labels) are written.
    output_munged_csv="data/yelp/reviews_with_splits_lite.csv",
    # Seed value for the random steps.
    seed=1337,
)

## Read Data

In [3]:
# The file is read with header=None, so the two columns are named here.
review_columns = ["rating", "review"]
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv, header=None, names=review_columns
)

In [4]:
# Show the first five rows of the loaded frame.
train_reviews.head(n=5)

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [5]:
# (rows, columns) of the full training set, before any subsampling.
train_reviews.shape # This is the full dataset

(560000, 2)

## Making the subset equal across the review classes

In [6]:
# Number of reviews per rating class in the full training set.
train_reviews["rating"].value_counts()

2    280000
1    280000
Name: rating, dtype: int64

In [7]:
# Build a class-balanced subset: sample the same fraction from each rating
# class. The fraction comes from the configuration rather than a literal,
# and every random draw is seeded with args.seed so that a fresh
# Restart & Run All yields the same subset.
subset_fraction = args.proportion_subset_of_train
reviews_by_rating = train_reviews.groupby("rating")

# Rating 1 sample
rating1_df = reviews_by_rating.get_group(1).sample(
    frac=subset_fraction, random_state=args.seed)
# Rating 2 sample
rating2_df = reviews_by_rating.get_group(2).sample(
    frac=subset_fraction, random_state=args.seed)

# Stack both classes, shuffle the rows (frac=1 keeps every row) and
# renumber the index from 0.
review_subset = (
    pd.concat([rating1_df, rating2_df], axis=0)
    .sample(frac=1, random_state=args.seed)
    .reset_index(drop=True)
)

In [8]:
# Number of reviews per rating class in the sampled subset.
review_subset["rating"].value_counts()

2    28000
1    28000
Name: rating, dtype: int64

In [9]:
# Unique classes present in the subset
set(review_subset["rating"])

{1, 2}

## Split data into Train/Dev/Test

In [10]:
from sklearn.model_selection import train_test_split

# First cut: 70% of the subset becomes the training split. The remaining
# 30% is then halved into validation and test. Both cuts are stratified on
# the rating so each partition keeps the class proportions, and both are
# seeded with args.seed so the partitions are reproducible across runs.
train, temp = train_test_split(
    review_subset,
    train_size=0.7,
    stratify=review_subset.rating.values,
    random_state=args.seed,
)
val, test = train_test_split(
    temp,
    train_size=0.5,
    stratify=temp.rating.values,
    random_state=args.seed,
)

# Work on independent copies before new columns are added to each split.
train = train.copy()
val = val.copy()
test = test.copy()

In [11]:
# Class counts in the training split (should have the same distribution).
train["rating"].value_counts()

2    19600
1    19600
Name: rating, dtype: int64

In [12]:
# Class counts in the validation split (should have the same distribution).
val["rating"].value_counts()

2    4200
1    4200
Name: rating, dtype: int64

In [13]:
# Class counts in the test split (should have the same distribution).
test["rating"].value_counts()

2    4200
1    4200
Name: rating, dtype: int64

In [14]:
# Renumber each split from 0, discarding the previous index.
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

In [15]:
# Tag every row with the name of the split it belongs to.
train = train.assign(split="train")
val = val.assign(split="val")
test = test.assign(split="test")

## Final reviews

In [16]:
# Stack the three labelled splits into one frame; ignore_index renumbers
# the rows from 0 in the same step.
final_reviews = pd.concat(
    [train, val, test], axis=0, ignore_index=True, copy=True
)

In [17]:
# Number of rows carrying each split label.
final_reviews["split"].value_counts()

train    39200
test      8400
val       8400
Name: split, dtype: int64

## Preprocess Reviews

In [18]:
# Text normalisation applied to every review
def preprocess_text(text):
    """Lower-case `text` and normalise its punctuation and spacing.

    Each of ``. , ! ?`` is surrounded by spaces, then every run of
    characters that is neither an ASCII letter nor one of those four
    punctuation marks (digits, whitespace, other symbols) is collapsed
    into a single space.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string. Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    # Put spaces around the kept punctuation marks.
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Collapse every other run of characters into one space.
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)
    
# Normalise every review text, showing a progress bar while it runs.
final_reviews["review"] = final_reviews["review"].progress_apply(preprocess_text)

HBox(children=(FloatProgress(value=0.0, description='Preprocessing Reviews', max=56000.0, style=ProgressStyle(…




In [19]:
# Replace the numeric rating with a readable label.
# The lookup falls back to the value itself, so re-running this cell leaves
# already-converted labels untouched instead of turning them all into None.
rating_labels = {1: 'negative', 2: 'positive'}
final_reviews['rating'] = final_reviews['rating'].progress_apply(
    lambda rating: rating_labels.get(rating, rating))

HBox(children=(FloatProgress(value=0.0, description='Preprocessing Reviews', max=56000.0, style=ProgressStyle(…




In [20]:
# Show the first five rows of the processed frame.
final_reviews.head(n=5)

Unnamed: 0,rating,review,split
0,negative,"on a recent visit to las vegas , my friends an...",train
1,positive,"excellent food ! we had the pompedoro , chicke...",train
2,positive,a great little glimpse back into old vegas . t...,train
3,positive,i was in phoenix for a couple of days for a co...,train
4,positive,what a treasure ! i have been doing yoga for y...,train


In [21]:
# Write the processed reviews to the configured path, without the row index.
final_reviews.to_csv(path_or_buf=args.output_munged_csv, index=False)