In [24]:
# File banner: echo the notebook's description/author metadata as cell output.
print(
    "\n"
    "@Description: Example: Classifying Sentiment of Restaurant Reviews\n"
    "@Author(s): Stephen CUI\n"
    "@LastEditor(s): Stephen CUI\n"
    "@CreatedTime: 2023-04-24 15:00:30\n"
)


@Description: Example: Classifying Sentiment of Restaurant Reviews
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-04-24 15:00:30



In [42]:
import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace

# Run configuration shared by the cells below; tune values here rather than
# editing them inline in later cells.
args = Namespace(
    # Raw input CSV locations.
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    # Fraction of each rating's training rows kept for the reduced subset.
    proportion_subset_of_train=0.1,
    # Fractions used when labelling each row as train / val / test.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Destination CSV for the processed reviews.
    output_munged_csv="data/yelp/reviews_with_split_lite.csv",
)

In [26]:
# Load the raw training reviews. The file is read without a header row
# (header=None), so the two column names are supplied explicitly.
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv,
    names=["rating", "review"],
    header=None,
)

In [27]:
# Buckets of review records keyed by rating; an unseen key starts as an empty list.
by_rating = collections.defaultdict(list)

In [28]:
# Group every training row (as a plain dict) under its rating.
# Clearing first keeps this cell re-runnable: without it, a second run would
# append another copy of every row to the buckets that already exist.
by_rating.clear()
# `to_dict('records')` yields one {'rating': ..., 'review': ...} dict per row
# directly, instead of building an intermediate Series per row via `iterrows()`.
for record in train_reviews.to_dict('records'):
    by_rating[record['rating']].append(record)

In [29]:
# Build the reduced dataset: for each rating (visited in sorted order), keep the
# leading `proportion_subset_of_train` fraction of its records, in stored order.
subset_records = []
for rating_key in sorted(by_rating):
    rating_records = by_rating[rating_key]
    keep_count = int(args.proportion_subset_of_train * len(rating_records))
    subset_records += rating_records[:keep_count]
review_subset = pd.DataFrame(subset_records)

In [30]:
# Spot-check five randomly chosen rows of the subset.
review_subset.sample(5)

Unnamed: 0,rating,review
41179,2,How in the world is this place a preferred pro...
43726,2,This place is amazing! I love going here. \n\n...
23869,1,I usually have top notch service with AZFCU. ...
46875,2,Clean & has a Starbucks inside. Coming from MN...
53515,2,An institution when it comes to breakfast. Bee...


In [43]:
# Label every row of `review_subset` as 'train', 'val' or 'test'. Rows are
# shuffled and divided separately for each rating, so the configured
# proportions are applied within every rating.

# The proportions do not depend on the rating, so validate them once up front.
# Compare with a tolerance: binary floats rarely sum exactly (0.7 + 0.2 + 0.1
# evaluates to 0.9999999999999999), so `== 1.0` can reject a valid configuration.
# The assertion message reads "train/val/test proportions do not sum to 1".
assert np.isclose(
    args.train_proportion + args.test_proportion + args.val_proportion, 1.0
), "训练、验证、测试集比例之和不为1"

# Ratings are visited in sorted order so that, if NumPy's global RNG is seeded,
# the same rows land in the same split on every run. No seed is set here.
for rating in sorted(set(review_subset['rating'])):
    rating_mask = (review_subset['rating'] == rating).to_numpy()
    # Shuffle a private copy of the labels so the frame's index buffer is never
    # touched by the in-place shuffle.
    indices = review_subset.index[rating_mask].to_numpy(copy=True)
    np.random.shuffle(indices)

    n_total_by_rating = len(indices)
    n_train_by_rating = int(args.train_proportion * n_total_by_rating)
    n_val_by_rating = int(args.val_proportion * n_total_by_rating)
    val_end = n_train_by_rating + n_val_by_rating

    review_subset.loc[indices[:n_train_by_rating], 'split'] = 'train'
    review_subset.loc[indices[n_train_by_rating:val_end], 'split'] = 'val'
    # 'test' takes everything left over, which also absorbs the rows dropped by
    # the int() truncation above.
    review_subset.loc[indices[val_end:], 'split'] = 'test'

In [44]:
# Show the distinct values present in the 'split' column.
set(review_subset['split'])

{'test', 'train', 'val'}

In [45]:
# Work on a copy so the text and label rewrites below leave `review_subset` unchanged.
final_reviews = review_subset.copy()

In [46]:
def preprocess_text(text):
    """Normalize a review string for tokenization.

    The text is lowercased, each of the punctuation marks ``. , ! ?`` is
    surrounded by spaces, and every run of characters other than ASCII letters
    and those four marks is collapsed into a single space.

    Args:
        text: The raw review string.

    Returns:
        The normalized string. Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    # Pad punctuation so each mark becomes its own whitespace-separated token.
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Digits, apostrophes, newlines, repeated spaces, etc. all become one space.
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
# Normalize every review in place of its raw text.
final_reviews['review'] = final_reviews['review'].map(preprocess_text)

In [47]:
# Preview the first rows after text normalization.
final_reviews.head()

Unnamed: 0,rating,review,split
0,1,"unfortunately , the frustration of being dr . ...",train
1,1,i don t know what dr . goldberg was like befor...,train
2,1,i m writing this review to give you a heads up...,train
3,1,wing sauce is like water . pretty much a lot o...,val
4,1,owning a driving range inside the city limits ...,test


In [48]:
# Replace the numeric rating codes with sentiment labels (1 -> negative, 2 -> positive).
# The codes are read from `review_subset` (the frame `final_reviews` was copied
# from) rather than from `final_reviews` itself: mapping the column onto itself
# is not re-runnable, because a second run would look up the already-converted
# labels in the dict and overwrite every rating with None.
rating_labels = {1: 'negative', 2: 'positive'}
final_reviews['rating'] = review_subset['rating'].apply(rating_labels.get)

In [49]:
# Display the processed frame; the rendered table elides the middle rows.
final_reviews

Unnamed: 0,rating,review,split
0,negative,"unfortunately , the frustration of being dr . ...",train
1,negative,i don t know what dr . goldberg was like befor...,train
2,negative,i m writing this review to give you a heads up...,train
3,negative,wing sauce is like water . pretty much a lot o...,val
4,negative,owning a driving range inside the city limits ...,test
...,...,...,...
55995,positive,i am not really an arts and crafts kind of guy...,train
55996,positive,i absolutely love michael s ! i used to scrapb...,train
55997,positive,the fact that i can generally get whatever i w...,test
55998,positive,i ve been frequenting michaels lately because ...,train


In [50]:
# Write the processed reviews to the configured output path; index=False
# leaves the row index out of the file.
final_reviews.to_csv(args.output_munged_csv, index=False)