### define hyper parmeters

In [1]:
import argparse
from argparse import Namespace
arg = Namespace(
    train_csv = './data/train.csv',
    test_csv = './data/test.csv',
    data_ratio = 0.1,
    train_ratio = 0.7,
    test_ratio = 0.15,
    val_ratio = 0.15,
    out_csv = './data/review.csv',
    seed = 1337
)

### convert json file to csv file

In [2]:
import csv
import json
import pandas as pd

def json2csv(csv_file_path, json_file_path, n = 10):
    #打开json文件,取出第一行列名
    with open(json_file_path,'r',encoding='utf-8') as fin:
        for line in fin:
            line_contents = json.loads(line)
            headers=line_contents.keys()
            break
        print(headers)
    i = 0
    #将json读成字典,其键值写入csv的列名,再将json文件中的values逐行写入csv文件
    with open(csv_file_path, 'w', newline='',encoding='utf-8') as fout:
        title = (['stars', 'text'])
        writer=csv.DictWriter(fout, title)
        writer.writeheader()
        with open(json_file_path, 'r', encoding='utf-8') as fin:
            for line in fin:
                i += 1
                line_contents = json.loads(line)
                writer.writerow({'stars':int(line_contents['stars']), 'text':line_contents['text']})
                if i == n:
                    break
            fin.close()
        fout.close()

### show result

In [3]:
json_file_path='./data/yelp_academic_dataset_review.json'
csv_file_path='./data/yelp_academic_dataset_review.csv'
json2csv(csv_file_path, json_file_path, 500)
train_reviews = pd.read_csv(csv_file_path)
print(train_reviews.head())

dict_keys(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'])
   stars                                               text
0      2  As someone who has worked with many museums, I...
1      1  I am actually horrified this place is still in...
2      5  I love Deagan's. I do. I really do. The atmosp...
3      1  Dismal, lukewarm, defrosted-tasting "TexMex" g...
4      4  Oh happy day, finally have a Canes near my cas...


In [4]:
train_reviews.stars.value_counts()

5    217
4    115
1     73
2     51
3     44
Name: stars, dtype: int64

In [5]:
set(train_reviews.stars)

{1, 2, 3, 4, 5}

### split dataset

In [6]:
import collections
import numpy as np

by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.stars].append(row.to_dict())

# #split dataset
final_list = []
np.random.seed(arg.seed)
for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)
    n_total = len(item_list)
    n_train = int(arg.train_ratio * n_total)
    n_val = int(arg.val_ratio * n_total)
    n_test = int(arg.test_ratio * n_total)
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'
    final_list.extend(item_list)
    print(len(final_list))
final_review = pd.DataFrame(final_list)

73
124
168
283
500


In [8]:
final_review.split.value_counts()

train    347
val       72
test      72
Name: split, dtype: int64

In [13]:
final_review.stars.value_counts()

5    217
4    115
1     73
2     51
3     44
Name: stars, dtype: int64

In [11]:
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'([.,!?])', r'\1', text)
    text = re.sub(r'[^a-zA-Z.,!?]+', r' ', text)
    return text
# text = 'As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House of Faberge from the Virginia Museum of Fine Arts (VMFA), I knew I had to go!'
# text = preprocess_text(text)
# print(text)

In [14]:
final_review.head()

Unnamed: 0,split,stars,text
0,train,1,I ordered a pizza at 4:49. Got an email that s...
1,train,1,I had explicitly explained to your manager tha...
2,train,1,I placed a custom order on 30 August. I was to...
3,train,1,I called Garcia Bail Bonds and when the call w...
4,train,1,The copy and print center here is just so horr...


In [15]:
final_review.text = final_review.text.apply(preprocess_text)

In [16]:
final_review['stars'] = final_review.stars.apply({1:'negative', 2:'negative', 3:'positive', 4:'positive', 5:'positive'}.get)

In [18]:
final_review.to_csv(arg.out_csv, index=False)