In [104]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

import os
import json
import time
from pyspark import SparkContext, SparkConf
import math

In [105]:
data_dir = "/Volumes/oli2/inf533_datasets"

In [106]:
config = SparkConf().setMaster("local[*]") \
                    .setAppName("Task2Opt") \
                    .set("spark.executor.memory", "4g") \
                    .set("spark.driver.memory", "4g")
sc = SparkContext(conf=config).getOrCreate()

In [107]:
lines = sc.textFile(os.path.join(data_dir, "train_review.json")).map(json.loads).cache()

In [108]:
business_tokens = lines.map(lambda x: x["business_id"]).distinct().zipWithIndex().collectAsMap()
tokens_business = {v: k for k, v in business_tokens.items()}

In [109]:
user_tokens = lines.map(lambda x: x["user_id"]).distinct().zipWithIndex().collectAsMap()

In [110]:
rdd = lines.map(lambda x: (business_tokens[x["business_id"]], (x["user_id"], x["stars"]))) \
                      .groupByKey().filter(lambda x: len(x[1]) >= 3).mapValues(dict).cache()

In [111]:
tokens_rdd = rdd.map(lambda x: x[0])

In [112]:
def get_intersect(s1, s2):
    return len(set(s1.keys()).intersection(set(s2.keys())))

In [113]:
rdd_dict = rdd.collectAsMap()

In [114]:
def pearson_correlation(ri, rj):
    ru = list(set(ri.keys()).intersection(set(rj.keys())))
    avg_ri = sum([ri[rui] for rui in ru]) / len(ri)
    avg_rj = sum([rj[ruj] for ruj in ru]) / len(rj)
    i, j, numerator = 0.0, 0.0, 0.0
    for k in ru:
        i += (ri[k] - avg_ri)**2
        j += (rj[k] - avg_rj)**2
        numerator += (ri[k] - avg_ri) * (rj[k] - avg_rj)
    denominator = math.sqrt(i) * math.sqrt(j)
    if denominator == 0.0:
        return 0.0
    return numerator/denominator

In [115]:
candidates = tokens_rdd.cartesian(tokens_rdd) \
                       .filter(lambda x: x[0] < x[1]) \
                       .filter(lambda x: get_intersect(rdd_dict[x[0]], rdd_dict[x[1]]) >= 3) \
                       .map(lambda x: ((x[0], x[1]), pearson_correlation(rdd_dict[x[0]], rdd_dict[x[1]]))) \
                       .filter(lambda x: x[1] > 0.0)

In [116]:
candidates.collect()

[((2240, 8764), 0.9937072635046319),
 ((2240, 9492), 0.9620395892237321),
 ((2240, 2996), 0.9651203078140345),
 ((2240, 6622), 0.9825501875832747),
 ((2240, 5908), 0.9593928824275837),
 ((2240, 8778), 0.9926318524325869),
 ((2240, 5152), 0.9758752769712252),
 ((2240, 8050), 0.99385303911032),
 ((2240, 2254), 0.9840583453238358),
 ((2240, 7350), 0.9540515795116312),
 ((2240, 9506), 0.9308413082301333),
 ((2240, 3010), 0.9098427577834064),
 ((2240, 8064), 0.9442831033129351),
 ((2240, 7364), 0.9410673656709706),
 ((8764, 9506), 0.9583593286139108),
 ((2240, 3024), 0.9460540019957429),
 ((2240, 3738), 0.9599571882436198),
 ((2240, 6650), 0.9926150120886534),
 ((2240, 5166), 0.9013655313176535),
 ((2240, 4466), 0.9963036281945424),
 ((2240, 3038), 0.946219727950763),
 ((2240, 5950), 0.9548007447636403),
 ((2240, 8806), 0.9006050380572413),
 ((2240, 2282), 0.9668954976718692),
 ((2240, 5180), 0.940114888319627),
 ((2240, 7392), 0.9572512066383),
 ((2240, 9534), 0.9859449507212095),
 ((2240,

In [103]:
sc.stop()