In [25]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

import os
import json
import time
from pyspark import SparkContext, SparkConf
import math

In [26]:
data_dir = "/Volumes/oli2/inf533_datasets"

In [27]:
config = SparkConf().setMaster("local[*]") \
                    .setAppName("Task2Opt") \
                    .set("spark.executor.memory", "4g") \
                    .set("spark.driver.memory", "4g")
sc = SparkContext(conf=config).getOrCreate()

In [28]:
model_dict = sc.textFile(os.path.join(data_dir, "output_item_based.json")) \
              .map(json.loads) \
              .map(lambda x: ((x["b1"], x["b2"]), x["sim"])) \
              .collectAsMap()

In [29]:
test_rdd = sc.textFile(os.path.join(data_dir, "test_review.json")) \
             .map(json.loads) \
             .map(lambda x: (x["user_id"], x["business_id"]))

In [30]:
avg_dict = sc.textFile(os.path.join(data_dir, "business_avg.json")) \
            .map(json.loads) \
            .map(dict) \
            .flatMap(lambda x: x.items()) \
            .collectAsMap()

In [31]:
train_rdd = sc.textFile(os.path.join(data_dir, "train_review.json")).map(json.loads).cache()

In [32]:
unique_id = train_rdd.map(lambda x: x["business_id"]).distinct().collect()

In [33]:
user_business = train_rdd.map(lambda x: (x["user_id"], (x["business_id"], x["stars"]))) \
                         .groupByKey() \
                         .mapValues(lambda x: list(set(x)))

In [34]:
N = 3

In [35]:
def prediction(data, avg_dict, id_list, sim_dict):
    target = data[0]
    if target not in id_list:
        target = "UNK"
    pairs = data[1]    
    
    values = []
    for pair in pairs:
        sim = 0
        if sim_dict.get(tuple((pair[0], target))) != None:
            sim = sim_dict.get(tuple((pair[0], target)))
        if sim_dict.get(tuple((target, pair[0]))) != None:
            sim = sim_dict.get(tuple((target, pair[0])))
        values.append((sim, pair[1]))
    
    values_sorted = sorted(values, key=lambda kv: kv[0], reverse=True)
    neighborhood = values_sorted[:N]
    
    num = 0
    for v in neighborhood:
        num += v[0] * v[1]
    
    denum = 0
    for v in neighborhood:
        denum += abs(v[0])
    if denum == 0 or num == 0:
        return (target, avg_dict.get(target, avg_dict["UNK"]))
    return (target, num/denum)

In [36]:
results = test_rdd.leftOuterJoin(user_business) \
                      .mapValues(lambda x: prediction(x, avg_dict, unique_id, model_dict)) \
                      .collect()

In [37]:
results

[('0gEDwhSUIWIBz23MbLYMig', ('pkXJEEaWcljXgAUd7cGzIA', 3.658511838564404)),
 ('9OPwLaTD-BMscCWuYnwazg', ('4hG2j_ibsNblDgqei05U_g', 3.6722542905149314)),
 ('9OPwLaTD-BMscCWuYnwazg', ('NRn6yqCq3yAh7NtKBt79xQ', 2.649157300140447)),
 ('C6uxJzwfCS-EsGACRocTBg', ('DfgZlNgKwBvCpA_0alumXw', 4.0)),
 ('C6uxJzwfCS-EsGACRocTBg', ('kCatAFdvOhpdeV5UZK-7Fw', 4.332657012674158)),
 ('C6uxJzwfCS-EsGACRocTBg', ('3Gt3xskppi9jZuTrwrhLNg', 4.3316765987293175)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('eaNenRk_liZBERFFLCXqqQ', 4.000178148831946)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('nEAoUNf1HrXAWidrwocozg', 3.3327967105417495)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('IVnGPHdTyu_GbLo9mXj98w', 4.000899411174376)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('4k3RlMAMd46DZ_JyZU0lMg', 2.998640165202972)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('aYfHXWNjLNUVoQa5CMA2jg', 4.000493335561976)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('qE7fsNN6JR4QirojnMJVCQ', 4.3350809737748826)),
 ('pDvDLpD7CiMlIebsDPCRSQ', ('UPIYuRaZvknINOd1w8kqRQ', 4.668286337581861)),
 ('pDvDLpD7CiMlIebsDPC

In [24]:
sc.stop()