In [2]:
import json
import random
from tqdm import tqdm
import argparse

from surprise import BaselineOnly, SVD, SlopeOne, NMF, CoClustering
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from metrics import precision_recall_at_k, get_conversion_rate, get_ndcg, f_measure
from utils import output_ranking
import argparse

In [7]:
DATA_FILE_PATH = './Automotive.json'
TRAIN_FILE_PATH = './train.csv'
TEST_FILE_PATH = './test.csv'
APPROACH = 'SVD'
OUTPUT_RANKING_FILE = 'ranking'

In [4]:
def read_json(data_file_path):
    data = []
    print(f"read {data_file_path}...")
    with open(data_file_path, "r") as data_file:
        lines = data_file.readlines()
        for line in tqdm(lines, desc=f"read {data_file_path}", total=len(lines)):
            data.append(json.loads(line))
    return data


def split_data(data):
    user_ids_examples = {}
    train_data = []
    test_data = []
    for example in tqdm(data, desc="generate user ids examples...", total=len(data)):
        if example["reviewerID"] not in user_ids_examples:
            user_ids_examples[example["reviewerID"]] = [example]
        else:
            user_ids_examples[example["reviewerID"]].append(example)
    for user_ids in tqdm(user_ids_examples, desc="generate data split...", total=len(user_ids_examples)):
        examples = user_ids_examples[user_ids]
        random.shuffle(examples)
        train_size = int(len(examples)*0.8)
        train_data.extend(examples[0:train_size])
        test_data.extend(examples[train_size:])
    return train_data, test_data


def convert_json_to_user_item_rating_csv(data, csv_file):
    with open(csv_file, "w") as out_csv:
        for example in tqdm(data, total=len(data), desc=f"generate {csv_file}"):
            user_id = example["reviewerID"]
            item_id = example["asin"]
            rating = example["overall"]
            time = example["unixReviewTime"]
            out_csv.write(user_id+"\t"+item_id+"\t"+str(rating)+"\t"+str(time)+"\n")

In [5]:
dataset = read_json(DATA_FILE_PATH)
train_data, test_data = split_data(dataset)
convert_json_to_user_item_rating_csv(train_data, TRAIN_FILE_PATH)
convert_json_to_user_item_rating_csv(test_data, TEST_FILE_PATH)

read ./Automotive.json...


read ./Automotive.json: 100%|██████████| 7990166/7990166 [00:52<00:00, 152591.66it/s]
generate user ids examples...: 100%|██████████| 7990166/7990166 [00:17<00:00, 448638.79it/s]
generate data split...: 100%|██████████| 3873247/3873247 [00:07<00:00, 512881.40it/s]
generate ./train.csv: 100%|██████████| 3739971/3739971 [00:06<00:00, 596385.33it/s]
generate ./test.csv: 100%|██████████| 4250195/4250195 [00:06<00:00, 691884.30it/s]


In [8]:
bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
options = {"SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
            "SlopeOne": SlopeOne(),
            "NMF": NMF(),
            "CoClustering": CoClustering()}
reader = Reader(line_format='user item rating timestamp', sep='\t')
algo = options[APPROACH]



In [9]:
train_data = Dataset.load_from_file(TRAIN_FILE_PATH, reader=reader)
test_data = Dataset.load_from_file(TEST_FILE_PATH, reader=reader)
train_set = train_data.build_full_trainset()
test_set = test_data.build_full_trainset().build_testset()

In [10]:
print("training....")
algo.fit(train_set)
print("testing...")
predictions = algo.test(test_set)
accuracy.mae(predictions, verbose=True)
accuracy.rmse(predictions, verbose=True)

training....
Processing epoch 0
Processing epoch 1
Processing epoch 2
testing...
MAE:  0.9900
RMSE: 1.3408


1.3408185385479632

In [12]:
train_data

<surprise.dataset.DatasetAutoFolds at 0x7fe3202b3f10>

In [11]:
### Extra Credit
output_ranking(predictions, OUTPUT_RANKING_FILE + "_" + APPROACH + ".out")
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=2.5)
print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
print("F-measure:", f_measure(precisions, recalls))
print("conversion_rate:", get_conversion_rate(predictions, k=10))
print("ndcg:", get_ndcg(predictions, k_highest_scores=10))

generating recommend_output...
Precision: 0.8355845851551472
Recall: 0.8397980780227251
F-measure: 0.8376860332405245
conversion_rate: 1.0
ndcg: 0.9884684684684684
