In [1]:
import pyspark
# Reuse the running SparkContext when this cell is executed again: calling the
# SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The master is still 'local[*]' when a new context has to be created.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]'))

In [17]:
%load_ext autoreload
%autoreload 2

from os.path import join

import numpy as np

from pyspark.sql import SQLContext

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Root directory under which every pickled RDD location below is resolved.
# NOTE(review): this is an absolute, user-specific path; consider making it
# configurable so the notebook can run on another machine.
path_root = "/home/tlin/notebooks/data"
# Locations of pickled RDDs below path_root. Judging by the names they hold
# the train/test split, the baseline and extended similarities, and the
# private / non-private item mappings -- TODO confirm against the producer
# notebooks.
path_pickle_train = join(
    path_root, "cache/two_domain/split_data/train")
path_pickle_test = join(
    path_root, "cache/two_domain/split_data/test")
path_pickle_baseline_sim = join(
    path_root, "cache/two_domain/item_based_sim/base_sim")
path_pickle_extended_sim = join(
    path_root, "cache/two_domain/extend_sim/extendsim")
path_pickle_private_mapped_sim = join(
    path_root, "cache/two_domain/private_mapping/privatemap")
path_pickle_nonprivate_mapped_sim = join(
    path_root, "cache/two_domain/private_mapping/nonprivatemap")

In [36]:
# Load the pickled RDDs. Only the training RDD is cached; its records are
# flattened as (uid, [(iid, rating, rating time), ...]) further down.
trainingRDD = sc.pickleFile(path_pickle_train).cache()
# Item mappings; the private one is turned into a dict keyed by its second
# field in a later cell.
private_mapped_sim = sc.pickleFile(path_pickle_private_mapped_sim)
nonprivate_mapped_sim = sc.pickleFile(path_pickle_nonprivate_mapped_sim)

In [59]:
def map_to_dict(rdd):
    """Collect a RDD on the driver and index its records as a dict.

    Every record must be indexable: record[1] becomes the key and
    record[0] the value. When a key occurs more than once the record
    collected last wins.

    Args:
        rdd: RDD (or any object with a collect() method) of such records.
    return:
        {source item: target item}
    """
    mapping = {}
    for record in rdd.collect():
        mapping[record[1]] = record[0]
    return mapping

# Driver-side lookup table built from the private mapping; map_to_dict
# collects the whole RDD, so it has to fit in driver memory.
private_mapped_sim_dict = map_to_dict(private_mapped_sim)

In [60]:
# Flatten every (uid, [(iid, rating, rating time), ...]) record into one
# (uid, iid, rating, rating time) row per rating, then peek at one row.
dataRDD = trainingRDD.flatMap(
    lambda user_row: [
        (user_row[0], rating[0], rating[1], rating[2])
        for rating in user_row[1]
    ]
).cache()
dataRDD.take(1)

[('A3J85VVGLCXD57',
  'S:B000EMLDTG',
  5.0,
  datetime.datetime(2013, 1, 17, 19, 0))]

In [67]:
def mapping_item(line, mapping_dict):
    """Replace the item id of one rating record by its mapped counterpart.

    Args:
        line: in the form of (uid, iid, rating, rating time).
        mapping_dict: {source item: target item}.
    Returns:
        (uid, mapping_dict[iid], rating, rating time) when iid is a key of
        mapping_dict, otherwise None.
    """
    iid = line[1]
    # Records whose item has no mapping are signalled with None so that the
    # caller can filter them out.
    if iid not in mapping_dict:
        return None
    return (line[0], mapping_dict[iid], line[2], line[3])
    
# Map every rating onto its target item and drop the ratings whose item has
# no entry in the private mapping (mapping_item returns None for those).
alterEgo_profile = dataRDD.map(
    lambda record: mapping_item(record, private_mapped_sim_dict)
).filter(
    lambda mapped: mapped is not None
)

In [76]:
def build_sthbased_profile(rdd, profile):
    """build an item-based or a user-based profile.

    Args:
        rdd: (uid, iid, rating, time)*
        profile: selects the grouping key. Anything containing "user"
            groups by uid, otherwise anything containing "item" groups by
            iid ("user" takes precedence when both occur).
    Returns:
        user-based: (uid, [(iid, rating, time), ...])*
        item-based: (iid, [(uid, rating, time), ...])*
    Raises:
        ValueError: if profile contains neither "user" nor "item".
    """
    if "user" in profile:
        key_pos, other_pos = 0, 1
    elif "item" in profile:
        key_pos, other_pos = 1, 0
    else:
        # Fail here instead of returning None, which would only surface
        # later as an AttributeError on the caller's side.
        raise ValueError(
            "profile must contain 'user' or 'item', got %r" % (profile,))

    # Wrap every rating in a one-element list so that reduceByKey can
    # concatenate the lists belonging to the same key.
    return rdd.map(
        lambda l: (l[key_pos], [(l[other_pos], l[2], l[3])])).reduceByKey(
        lambda a, b: a + b)
    
# Group the mapped ratings once per user and once per item; both RDDs are
# cached because later cells read them again.
user_based_alterEgo = build_sthbased_profile(alterEgo_profile, "user").cache()
item_based_alterEgo = build_sthbased_profile(alterEgo_profile, "item").cache()
# Collect each profile to the driver as a dict and broadcast it.
user_based_dict_bd = sc.broadcast(user_based_alterEgo.collectAsMap())
item_based_dict_bd = sc.broadcast(item_based_alterEgo.collectAsMap())

In [77]:
    def get_info(dataRDD):
        """Summarise the ratings attached to every key of a profile RDD.

        Args:
            dataRDD: either a user-based or an item-based profile, i.e.
                userRDD: (uid, (iid, rating, rating time)*) or
                itemRDD: (iid, (uid, rating, rating time)*)
        Returns:
            RDD of (key, (average, norm2, count)) where key is the uid or
            iid of the input record. average and norm2 are computed over
            the rating values (second field of every entry) and count is
            the number of entries.
        """
        def summarize(record):
            """turn one (key, ratings) record into (key, statistics)."""
            key, entries = record
            scores = [entry[1] for entry in entries]
            mean = 1.0 * np.average(scores)
            l2_norm = np.sqrt(np.sum([score ** 2 for score in scores]))
            return key, (mean, l2_norm, len(entries))

        return dataRDD.map(summarize)
    
# Per-user and per-item (average, norm2, count) statistics, collected to the
# driver as dicts and broadcast.
user_info = sc.broadcast(get_info(user_based_alterEgo).collectAsMap())
item_info = sc.broadcast(get_info(item_based_alterEgo).collectAsMap())

In [81]:
    from itertools import combinations

    def produce_pairwise(dataRDD):
        """Collect, for every pair of items, the ratings given by common users.

        Args:
            dataRDD: user-based profile, (uid, (iid, rating, rating time)*).
                Records with fewer than two ratings are ignored.
        Returns:
            RDD of ((iid1, iid2), [(rating of iid1, rating of iid2, uid), ...])
            with iid1 <= iid2, so that one item pair is always stored under
            a single key regardless of the order in which a user's ratings
            are listed.
        """
        def helper(iters):
            """find item pairs."""
            for uid, ratings in iters:
                for first, second in combinations(ratings, 2):
                    # The order inside a user's rating list is arbitrary;
                    # put the pair in canonical order so that (A, B) and
                    # (B, A) are not reduced under two different keys.
                    if second[0] < first[0]:
                        first, second = second, first
                    # NOTE(review): a user with two entries for the same
                    # item still yields an (iid, iid) self pair -- confirm
                    # whether those should be dropped.
                    yield (first[0], second[0]), [(first[1], second[1], uid)]
        return dataRDD.filter(
            lambda line: len(line[1]) >= 2).mapPartitions(
            helper).reduceByKey(
            lambda a, b: a + b)
    
# Co-rating lists per item pair, built from the user-based profile and cached.
pair_wise = produce_pairwise(user_based_alterEgo).cache()

In [87]:
# Take one element of pair_wise and unpack it into the item pair and the
# list of (rating 1, rating 2, uid) entries stored for that pair.
line = pair_wise.take(1)[0]
(id1, id2), rating_pairs = line

In [88]:
# For every co-rating keep both ratings together with their product, then
# sum the products to get the inner product of the two rating vectors.
ratings = [
    (pair[0], pair[1], pair[0] * pair[1])
    for pair in rating_pairs
]
inner_product = sum(triple[2] for triple in ratings)