In [1]:
print("""
@Description: Example: Classifying Sentiment of Restaurant Reviews
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-04-24 15:00:30
""")


@Description: Example: Classifying Sentiment of Restaurant Reviews
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-04-24 15:00:30



In [30]:
import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace

args = Namespace(
    raw_train_dataset_csv='data/yelp/raw_train.csv',
    raw_test_dataset_csv='data/yelp/raw_test.csv',
    proportion_subset_of_train=.1,
    train_proportion=.7,
    val_proportion=.5,
    test_proportion=.15,
    output_munged_csv='data/yelp/reviews_with_split_lite.csv'
)

In [3]:
train_reviews = pd.read_csv(args.raw_train_dataset_csv,
                            header=None,
                            names=['rating', 'review'])

In [4]:
by_rating = collections.defaultdict(list)

In [5]:
for _, row in train_reviews.iterrows():
    by_rating[row['rating']].append(row.to_dict())

In [6]:
review_subset = []
for _, item_list in sorted(by_rating.items()):
    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])
review_subset = pd.DataFrame(review_subset)

In [7]:
review_subset.sample(5)

Unnamed: 0,rating,review
55123,2,Coo spot to check out comics.. I was definitel...
2500,1,I've been to better Sheratons and what hotel s...
22310,1,Beware Sonora Quest billing department! I do n...
24215,1,"I ordered the lox and onion on wheat bagel, an..."
45045,2,Good dept store. Great selections of products....


In [8]:
for rating in set(review_subset['rating']):
    indices = review_subset[review_subset['rating'] == rating].index.to_numpy()
    np.random.shuffle(indices)
    n_total_by_rating = len(review_subset[review_subset['rating'] == rating])
    n_train_by_rating = int(args.train_proportion * n_total)
    n_val_by_rating = int(args.val_proportion * n_total)
    n_test_by_rating = int(args.test_proportion * n_total)
    
    review_subset.loc[indices[:n_train_by_rating], 'split'] = 'train'
    review_subset.loc[indices[n_train_by_rating:n_train_by_rating + n_val_by_rating], 'split'] = 'val'
    review_subset.loc[indices[n_train_by_rating + n_val_by_rating:], 'split'] = 'test'

In [24]:
final_reviews = review_subset.copy()

In [25]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
final_reviews['review'] = final_reviews['review'].apply(preprocess_text)

In [26]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,1,"unfortunately , the frustration of being dr . ...",train
1,1,i don t know what dr . goldberg was like befor...,train
2,1,i m writing this review to give you a heads up...,train
3,1,wing sauce is like water . pretty much a lot o...,train
4,1,owning a driving range inside the city limits ...,train


In [27]:
final_reviews['rating'] = final_reviews['rating'].apply({1:'negative', 2:'positive'}.get)

In [28]:
final_reviews

Unnamed: 0,rating,review,split
0,negative,"unfortunately , the frustration of being dr . ...",train
1,negative,i don t know what dr . goldberg was like befor...,train
2,negative,i m writing this review to give you a heads up...,train
3,negative,wing sauce is like water . pretty much a lot o...,train
4,negative,owning a driving range inside the city limits ...,train
...,...,...,...
55995,positive,i am not really an arts and crafts kind of guy...,train
55996,positive,i absolutely love michael s ! i used to scrapb...,train
55997,positive,the fact that i can generally get whatever i w...,train
55998,positive,i ve been frequenting michaels lately because ...,train


In [31]:
final_reviews.to_csv(args.output_munged_csv, index=False)