Takes a .json review file as input and writes two .json files, one with train and one with test data, such that all the users and items in test are present in train too. Test rows whose user or item is missing from train are removed at the end; for items this is rarely needed at low k, as there are far fewer items than users.

For each user, the first k rows (default k=1) are moved from train to test. Takes a couple of minutes to run for 700k reviews.

In [None]:
# Imports 
import string
import pandas as pd
import numpy as np
import time
from scipy.stats.stats import pearsonr 
from sklearn.model_selection import train_test_split
import json

# Settings for the split.
k=1 # number of rows taken from each user and put in the test set
m=3 # after cutting, items with fewer than m rows left are removed

filename = '../reviews_Apps_for_Android_5.json' # input reviews, read as line-delimited JSON
start_index = 0 # first row to keep once the data is sorted by reviewerID
end_index = 10000 # end of the kept slice (exclusive); set to None for reading the entire file
jsonFile = True # True: write test.json/train.json; False: write test.votes/train.votes

# Load the data with pandas.read_json (one JSON record per line) and keep
# only the selected columns.
def load_required_data(path, required_columns):
    """Read a line-delimited JSON file and restrict it to some columns.

    path: location of the file handed to pandas.read_json.
    required_columns: list of column names to keep, in the wanted order.

    Returns a (dataframe, values) pair: the column-restricted DataFrame and
    its .values array.
    """
    selected = pd.read_json(path, lines=True)[required_columns]
    return selected, selected.values

train_df, values = load_required_data(filename, ["asin", "reviewerID", "overall","reviewText", "unixReviewTime"])

print "Data loaded"

#sort by reviewerID
train_df.sort_values(by=['reviewerID'],inplace=True)

#cut the data 
train_df = train_df[start_index:end_index]

print "Data cut"

#check that the last user has enough reviews
last_user_id = train_df.iloc[-1]["reviewerID"]
i=-1
index_list=[]

while i>-1*(k+4) and train_df.iloc[i]['reviewerID'] == last_user_id:
    index_list.append(train_df.iloc[i].name)
    i-=1

if len(index_list)<k+3:
    train_df = train_df.drop(index_list).reset_index(drop=True)


#sort by items
train_df.sort_values(by=['asin'],inplace=True)

#remove items with <m entries
index_list=[] # holds the indices of the current item
last_item_id = 'undef' #first asin
drop_list=[] # holds indices of all rows to be removed

for index, row in train_df.iterrows():
    if row['asin'] != last_item_id:
        if len(index_list)<m:
            drop_list.extend(index_list)
        index_list=[]
        last_item_id = row['asin']
    index_list.append(index)
    
train_df = train_df.drop(drop_list).reset_index(drop=True)

print "Items with too few entries removed"

# State shared with markRowForTestSet and the per-user loop below.
row_indices = []  # index labels of the rows picked for the test set
row_dicts = []  # the same rows as dicts, used to build test_df
last_user = 'undef'  # placeholder until the first reviewerID has been seen

# Bring each user's reviews next to each other again (the frame was last
# ordered by asin).
train_df.sort_values(by=['reviewerID'], inplace=True)

def markRowForTestSet(index, row):
    """Remember one train_df row that has to move to the test set.

    index: index label of the row, appended to row_indices for later deletion.
    row: the row itself; its to_dict() form is appended to row_dicts so that
         test_df can be built from it.
    """
    row_indices.append(index)
    row_dicts.append(row.to_dict())

#for each userid in train_df
for index, row in train_df.iterrows():
    if row['reviewerID'] != last_user: # first row for this user
        markRowForTestSet(index, row)
        n=k-1
    elif n>0:
        markRowForTestSet(index, row)
        n-=1        
    last_user = row['reviewerID']
    
test_df = pd.DataFrame.from_dict(row_dicts)

print "\ntrain_df had",len(train_df.reviewerID.unique()),"unique reviewers and",len(train_df.asin.unique()),"unique items"

train_df = train_df.drop(row_indices).reset_index(drop=True)

print "\ntrain_df now has",len(train_df.reviewerID.unique()),"unique reviewers and",len(train_df.asin.unique()),"unique items"

#remove extra user_ids and item_ids from the test set
extra_user_ids = np.setdiff1d(test_df["reviewerID"].unique(),train_df["reviewerID"].unique())
test_df = test_df[~test_df['reviewerID'].isin(extra_user_ids)]

extra_item_ids = np.setdiff1d(test_df["asin"].unique(),train_df["asin"].unique())
test_df = test_df[~test_df['asin'].isin(extra_item_ids)]

print "train_df now has ",len(train_df),"reviews left"
print "\ntest_df has",len(test_df.reviewerID.unique()),"unique reviewers and",len(test_df.asin.unique()),"unique items"
print "test_df now has ",len(test_df),"reviews"

def writeDFtoVotesFile(df,test):
    """Write df as a plain-text "votes" file, one review per line.

    Each line holds, separated by single spaces: reviewerID, asin, overall,
    unixReviewTime, the number of whitespace-separated words in reviewText,
    and the reviewText itself.

    df: DataFrame providing the columns named above.
    test: if true the output goes to 'test.votes', otherwise to 'train.votes'.
    """
    output_path = 'test.votes' if test else 'train.votes'
    # The with-block closes the file even when formatting a row raises.
    with open(output_path, 'w') as output_file:
        for index,row in df.iterrows():
            fields = [
                row["reviewerID"],
                row["asin"],
                str(row["overall"]),
                str(row["unixReviewTime"]),
                str(len(row["reviewText"].split())),
                row["reviewText"],
            ]
            output_file.write(' '.join(fields) + '\n')

# Write both splits in the format selected by jsonFile, test set first.
if not jsonFile:
    # plain-text .votes files
    writeDFtoVotesFile(test_df, True)
    writeDFtoVotesFile(train_df, False)
else:
    # line-delimited JSON, one record per row
    for frame, target in ((test_df, "test.json"), (train_df, "train.json")):
        frame.to_json(target, orient="records", lines=True)
print("Done!")

In [None]:
# Sanity check: how many users are in test but not train.
# Shows the shape of the array of such reviewerIDs; test_df was already
# filtered against train_df's reviewerIDs, so (0,) is expected.
np.setdiff1d(test_df["reviewerID"].unique(),train_df["reviewerID"].unique()).shape

In [None]:
# Sanity check: how many items are in test but not train.
# Shows the shape of the array of such asins; test_df was already filtered
# against train_df's asins, so (0,) is expected.
np.setdiff1d(test_df["asin"].unique(),train_df["asin"].unique()).shape