In [163]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")

from pyspark import SparkContext, SparkConf
import json
import time
import os
import random
from collections import defaultdict
from functools import reduce
from itertools import combinations

# Seed Python's global RNG so the random.randint draws made when the hash
# functions are built below produce the same coefficients on every run.
random.seed(25)

In [164]:
# Directory the input files (e.g. train_review.json) are read from.
# The INF533_DATA_DIR environment variable overrides the location so the
# notebook can run on another machine without editing this cell; when it is
# unset the original path is used unchanged.
data_dir = os.environ.get("INF533_DATA_DIR", "/Volumes/oli2/inf533_datasets/")

In [165]:
# num_buckets: how many min-hash functions are generated (length of hash_funcs).
# bands: presumably the number of LSH bands the signature is split into —
#        TODO confirm against the banding code.
num_buckets, bands = 1000, 500

In [166]:
def min_hash_func(idx):
    """Build one randomly parameterised min-hash function.

    Two coefficients are drawn from the global ``random`` module (so the
    result depends on the current seed), and the returned callable maps an
    integer ``x`` to ``((a * x + b * idx) % p) % m``.

    Args:
        idx: Position of this function in the family; it scales the offset
            term, so ``idx == 0`` yields a function with no offset.

    Returns:
        A one-argument callable producing a value in ``[0, m)``.
    """
    modulus = 2**35 - 365
    bucket_count = 4294975
    # Draw order (slope first, then offset) must stay fixed so a seeded run
    # reproduces the same family of functions.
    slope = random.randint(1, modulus - 1)
    offset = random.randint(217, modulus - 1)
    shift = offset * idx

    def hash_value(x):
        return ((slope * x + shift) % modulus) % bucket_count

    return hash_value

In [167]:
def lsh_hash(idx):
    """Build one randomly parameterised hash function for the LSH stage.

    Same construction as the min-hash family but with a larger modulus and a
    different output range: the returned callable computes
    ``((a * x + b * idx) % p) % m`` with ``a`` and ``b`` drawn from the
    global ``random`` module.

    Args:
        idx: Index of the function being created; multiplies the offset term.

    Returns:
        A one-argument callable producing a value in ``[0, m)``.
    """
    big_modulus = 2**75 - 545
    table_size = 7293513
    # Keep the two draws in this order; swapping them changes the generated
    # coefficients for a given seed.
    multiplier = random.randint(1, big_modulus - 1)
    increment = random.randint(0, big_modulus - 1)
    scaled_increment = increment * idx

    def band_hash(x):
        return ((multiplier * x + scaled_increment) % big_modulus) % table_size

    return band_hash

In [168]:
# Spark configuration: local mode using all cores, 4 GB for both the executor
# and driver memory settings.
config = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("Task1")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
)
# getOrCreate is a classmethod. Calling it on the class reuses a context that
# is already running (for example when this cell is re-run), whereas
# constructing SparkContext(conf=...) first raises ValueError in that case.
# Note that an already-running context keeps its own configuration.
sc = SparkContext.getOrCreate(conf=config)

In [169]:
# Read train_review.json line by line and parse each line as JSON.
# The parsed RDD is cached because several later cells scan it again.
lines = (
    sc.textFile(os.path.join(data_dir, "train_review.json"))
      .map(json.loads)
      .cache()
)

In [170]:
# Give every distinct business_id an integer index and collect the
# id -> index mapping to the driver as a plain dict.
business_tokens = (
    lines.map(lambda review: review["business_id"])
         .distinct()
         .zipWithIndex()
         .collectAsMap()
)
# Reverse lookup: index -> business_id.
tokens_business = {
    index: business_id for business_id, index in business_tokens.items()
}

In [171]:
# Give every distinct user_id an integer index and collect the
# id -> index mapping to the driver as a plain dict.
user_tokens = (
    lines.map(lambda review: review["user_id"])
         .distinct()
         .zipWithIndex()
         .collectAsMap()
)
# Reverse lookup: index -> user_id.
tokens_user = {
    index: user_id for user_id, index in user_tokens.items()
}

In [172]:
# business index -> {user index: stars}, kept as a cached RDD.
# Only businesses with at least three review rows survive the filter.
# NOTE(review): the threshold is applied to the grouped rows before dict()
# collapses repeated user indices, so a business may end up with fewer than
# three distinct users.
business_users = (
    lines.map(
        lambda review: (
            business_tokens[review["business_id"]],
            (user_tokens[review["user_id"]], review["stars"]),
        )
    )
    .groupByKey()
    .filter(lambda grouped: len(grouped[1]) >= 3)
    .mapValues(dict)
    .cache()
)

In [173]:
# user index -> {business index: stars}, collected to the driver as a dict.
# Only users with at least three review rows survive the filter.
# NOTE(review): the threshold is applied to the grouped rows before dict()
# collapses repeated business indices, so a user may end up with fewer than
# three distinct businesses.
users_business = (
    lines.map(
        lambda review: (
            user_tokens[review["user_id"]],
            (business_tokens[review["business_id"]], review["stars"]),
        )
    )
    .groupByKey()
    .filter(lambda grouped: len(grouped[1]) >= 3)
    .mapValues(dict)
    .collectAsMap()
)

In [174]:
# Number of users that passed the ">= 3 review rows" filter when
# users_business was built.
print(len(users_business))

26182


### MinHash

In [175]:
# Build the min-hash family: one function per index 0 .. num_buckets - 1,
# created in index order so the seeded random draws stay reproducible.
hash_funcs = list(map(min_hash_func, range(num_buckets)))

In [176]:
def get_hash(row, hash_funcs):
    """Evaluate every hash function on ``row``.

    Args:
        row: The value handed to each hash function.
        hash_funcs: Iterable of one-argument callables.

    Returns:
        A list with one result per function, in the order of ``hash_funcs``.
    """
    hashed = []
    for fn in hash_funcs:
        hashed.append(fn(row))
    return hashed

In [177]:
# For each business, hash its integer index with every min-hash function:
# (business index, [hash value per function]).
hash_rdd = business_users.map(
    lambda entry: (entry[0], get_hash(entry[0], hash_funcs))
)

In [178]:
# Attach each business's hash list to its {user: stars} dict, giving
# (business index, (hash list, user dict)), then spread the records over
# 7 partitions keyed by the business index.
joined_hash_rdd = (
    hash_rdd.join(business_users)
            .partitionBy(7, lambda key: hash(key) % 7)
)

In [179]:
# Build the signature matrix: expand every (hash list, user dict) value into
# key/value records via get_user_hash, then merge records sharing a key with
# min_hash.
# NOTE(review): get_user_hash and min_hash are not defined in the visible
# cells; presumably they emit (user, hash list) records and take the
# element-wise minimum — confirm. Both are called through lambdas so they are
# looked up only when Spark serialises the job.
signature_mat = (
    joined_hash_rdd
    .flatMap(lambda record: get_user_hash(record[1]))
    .reduceByKey(lambda left, right: min_hash(left, right))
)

In [180]:
# One LSH hash function per band, sized by the `bands` setting from the
# configuration cell.
# NOTE(review): confirm that this count matches the number of bands produced
# by generate_bands, which is not visible here.
lsh_hash_funcs = [lsh_hash(i) for i in range(bands)]

In [181]:
def get_intersect(s1, s2):
    """Count the keys that two mappings have in common.

    Args:
        s1: First mapping (anything exposing ``keys()``).
        s2: Second mapping (anything exposing ``keys()``).

    Returns:
        The number of keys present in both ``s1`` and ``s2``.
    """
    shared = set(s1.keys())
    shared.intersection_update(s2.keys())
    return len(shared)

In [185]:
# Locality-sensitive hashing over the signature matrix, ending with the list
# of candidate pairs on the driver.
# NOTE(review): generate_bands, group_bands and lsh are not defined in the
# visible cells; the stage descriptions below are inferred from how their
# results are consumed here — confirm against their definitions.
candidates = (
    signature_mat
    # Split each signature into bands, keeping the original key.
    .map(lambda x: (x[0], generate_bands(x[1])))
    # Re-key the banded signature and flatten to one record per produced item.
    .map(group_bands)
    .flatMap(lambda x: x)
    .groupByKey()
    # Hash each group with the LSH family; element [1] of the result is an
    # iterable of groups that ended up together.
    .map(lambda x: lsh(x, lsh_hash_funcs))
    .flatMap(lambda x: x[1])
    # A group of one cannot yield a pair.
    .filter(lambda x: len(x) > 1)
    # flatMap accepts any iterable, so the combinations iterator is used directly.
    .flatMap(lambda members: combinations(members, 2))
    .distinct()
    # Keep pairs where both members are present in users_business ...
    .filter(
        lambda pair: users_business.get(pair[0]) is not None
        and users_business.get(pair[1]) is not None
    )
    # ... and share at least three keys in their users_business dicts.
    .filter(
        lambda pair: get_intersect(
            users_business[pair[0]], users_business[pair[1]]
        ) >= 3
    )
    .collect()
)

In [186]:
# Number of candidate pairs collected from the LSH pipeline.
print(len(candidates))

791785


In [187]:
# Shut down the SparkContext; RDDs created from it cannot be used afterwards.
sc.stop()