In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [68]:
# Notebook-wide settings: input/output CSV locations, the share of the raw
# training data to keep, the train/val/test proportions and the random seed.
args = Namespace(**{
    "raw_train_dataset_csv": "../data/yelp/raw_train.csv",
    "raw_test_dataset_csv": "../data/yelp/raw_test.csv",
    "proportion_subset_of_train": 0.1,
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    "output_munged_csv": "../data/yelp/reviews_with_splits_lite.csv",
    "seed": 1337,
})

In [73]:
# Read the raw training CSV, skipping its first row and naming the two
# columns explicitly instead of taking names from the file.
train_rewiews = pd.read_csv(
    filepath_or_buffer=args.raw_train_dataset_csv,
    names=['review', 'rating'],
    header=None,
    skiprows=1,
)
# Display (n_rows, n_columns) of the loaded frame.
train_rewiews.shape

(560000, 2)

In [74]:
# Put the label column first, then preview the first five rows.
train_rewiews = train_rewiews.loc[:, ['rating', 'review']]
train_rewiews.head(n=5)

Unnamed: 0,rating,review
0,0,"Unfortunately, the frustration of being Dr. Go..."
1,1,Been going to Dr. Goldberg for over 10 years. ...
2,0,I don't know what Dr. Goldberg was like before...
3,0,I'm writing this review to give you a heads up...
4,1,All the food is great here. But the best thing...


In [75]:
# Number of reviews per rating class in the full training data.
train_rewiews['rating'].value_counts()

0    280000
1    280000
Name: rating, dtype: int64

In [90]:
# Build a smaller, class-balanced subset of the training reviews: for every
# rating class keep the leading `args.proportion_subset_of_train` share of its
# reviews, in the order they appear in `train_rewiews` (nothing is shuffled here).
by_rating = collections.defaultdict(list)
# Bucket the reviews by rating. `to_dict('records')` converts the whole frame
# in one call; iterating with `iterrows()` would build a Series for every row,
# which is needlessly slow on a frame of this size.
for record in train_rewiews.to_dict('records'):
    by_rating[record['rating']].append(record)

# Walk the classes in sorted rating order and keep the first 10 percent
# (by default) of each class's records.
review_subset = []
for rating, item_list in sorted(by_rating.items()):
    n_total = len(item_list)
    # int() truncates, so a class never contributes more than its share.
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])
# Turn the collected records into a DataFrame.
review_subset = pd.DataFrame(review_subset)

In [91]:
# Preview the first five rows of the subset.
review_subset.head(n=5)

Unnamed: 0,rating,review
0,0,"Unfortunately, the frustration of being Dr. Go..."
1,0,I don't know what Dr. Goldberg was like before...
2,0,I'm writing this review to give you a heads up...
3,0,Wing sauce is like water. Pretty much a lot of...
4,0,Owning a driving range inside the city limits ...


In [92]:
# Class counts of the full training frame, shown again for comparison with the subset.
train_rewiews['rating'].value_counts()

0    280000
1    280000
Name: rating, dtype: int64

In [93]:
# Class counts of the reduced subset.
review_subset['rating'].value_counts()

0    28000
1    28000
Name: rating, dtype: int64

In [94]:
# Distinct rating labels present in the subset.
set(review_subset['rating'])

{0, 1}

In [95]:
# Assign every review in the subset to a train / test / val split, one rating
# class at a time so each split keeps the same class balance.
by_rating = collections.defaultdict(list)
# Bucket the records by rating. `to_dict('records')` converts the frame in a
# single call instead of building a Series per row as `iterrows()` does; the
# records and their order are the same, so the seeded shuffle is unaffected.
for record in review_subset.to_dict('records'):
    by_rating[record['rating']].append(record)

final_list = []
# Seed NumPy's global RNG once so the shuffles below are reproducible.
np.random.seed(args.seed)
for rating, item_list in sorted(by_rating.items()):
    # Shuffle the class's records in place before cutting it into splits.
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    # n_val is not used for slicing: the validation split takes whatever is
    # left after train and test, so every record receives a label.
    n_val = int(args.val_proportion * n_total)

    # Tag each record with the name of the split it belongs to.
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train + n_test]:
        item['split'] = 'test'
    for item in item_list[n_train + n_test:]:
        item['split'] = 'val'

    final_list.extend(item_list)

In [96]:
# Rebind `final_list` from the list of labelled records to a DataFrame.
final_list = pd.DataFrame(data=final_list)

In [98]:
# Share of rows that landed in each split.
final_list['split'].value_counts() / len(final_list)

train    0.70
test     0.15
val      0.15
Name: split, dtype: float64