In [1]:
import pyspark
# Reuse an already-running context so this cell can be re-executed without
# restarting the kernel: constructing a second SparkContext in the same
# process raises "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]'))

In [2]:
%load_ext autoreload
%autoreload 2

from os.path import join

import auxiliary as auxi
from cleanData import CleanData
from splitData import SplitData
from itemBasedSim import ItemBasedSim
from extender import ExtendSim

from pyspark.sql import SQLContext

In [3]:
# Locations of the cached inputs, all resolved below a single data root.
# NOTE(review): the root is an absolute, machine-specific path; the notebook
# only runs where this directory exists.
path_root = "/home/jovyan/work/data"
# NOTE(review): the train split path is not read by any cell visible here.
path_pickle_train = join(path_root, "cache/two_domain/split_data/train")
path_pickle_test = join(path_root, "cache/two_domain/split_data/test")
# Baseline item-to-item similarities (loaded with sc.pickleFile below).
path_pickle_baseline_sim = join(path_root, "cache/two_domain/item_based_sim/base_sim")

In [4]:
# Load the cached RDDs from their pickle files.
# NOTE(review): pickleFile unpickles the stored objects; only point it at
# trusted data.
# NOTE(review): testRDD is not used by any cell visible here.
testRDD = sc.pickleFile(path_pickle_test)
item2item_simRDD = sc.pickleFile(path_pickle_baseline_sim)

In [5]:
sqlContext = SQLContext(sc)
itemsim = ItemBasedSim(method='cosine', num_atleast=50)

# DataFrame view of the similarity RDD so that it can be queried with SQL.
item2item_simDF = itemsim.build_sim_DF(item2item_simRDD)

item2item_simDF.registerTempTable("sim_table")
# Distinct id1 values of the rows whose label is 1, collected to the driver.
# Go through `.rdd` explicitly: `DataFrame.map` only exists in Spark 1.x,
# whereas `DataFrame.rdd.map` behaves the same on Spark 1.3+ and 2.x.
BB_item_list = sqlContext.sql(
    "SELECT DISTINCT id1 FROM sim_table WHERE label = 1").rdd.map(
    lambda line: line.id1).collect()
# Broadcast the id list so executors can read it without re-shipping it
# with every task.
BB_item_bd = sc.broadcast(BB_item_list)

item_simRDD = itemsim.get_item_sim(item2item_simRDD)

In [6]:
# presumably the number of neighbours kept per item — confirm in
# extender.ExtendSim
top_k = 10
extend_sim = ExtendSim(top_k)

# Cached because more than one downstream transformation reads this RDD.
classfied_items = extend_sim.find_knn_items(item_simRDD, BB_item_bd).cache()

In [7]:
def extract_siminfo(sc, classfied_items):
    """Split the classified kNN records by slot and broadcast lookup tables.

    arg:
        sc: context used to create the broadcast variables.
        classfied_items: records of the form
            iid, (BB_BB, BB_NB), (NB_BB, NB_NN)
            where either pair may be None.
    return:
        BB_info: cached (iid, (BB_BB, BB_NB)) records whose pair is not None.
        NB_info: cached (iid, (NB_BB, NB_NN)) records whose pair is not None.
        knn_BB_bd: broadcast of {iid: {neighbour iid: rest of the entry}}
            built from BB_info.
        knn_NB_bd: broadcast of the same kind of mapping built from NB_info.
    """
    def select_slot(slot):
        # keep (iid, record[slot]) for the records that have this slot set
        picked = classfied_items.map(lambda rec: (rec[0], rec[slot]))
        return picked.filter(lambda pair: pair[1] is not None).cache()

    def to_lookup(pair):
        # merge both neighbour lists into {neighbour iid: rest of the entry}
        neighbours = pair[1][0] + pair[1][1]
        return pair[0], dict((entry[0], entry[1:]) for entry in neighbours)

    BB_info = select_slot(1)
    NB_info = select_slot(2)

    # the tables are collected to the driver before being broadcast
    BB_items_knn = BB_info.map(to_lookup).collectAsMap()
    NB_items_knn = NB_info.map(to_lookup).collectAsMap()

    return (BB_info, NB_info,
            sc.broadcast(BB_items_knn), sc.broadcast(NB_items_knn))

# Build the per-slot kNN records and their broadcast lookup tables once;
# the later cells read these names as globals.
BB_info, NB_info, knn_BB_bd, knn_NB_bd = extract_siminfo(sc, classfied_items)

In [8]:
def combine_BB_withother_in_singledomain(iter_items):
    """Re-key every record by the BB items found among its neighbours.

    arg:
        iter_items: iterable of (iid, (NB_BB, NB_NN)) with
            NB_BB: [(BB iid, sim, mutu, frac_mutu)*]
            NB_NN: [(NN iid, sim, mutu, frac_mutu)*]
    yield:
        BB iid, [(iid, [NN iid*])]
        one pair per entry of NB_BB.
    """
    for record in iter_items:
        item_id, neighbours = record
        bb_neighbours, nn_neighbours = neighbours
        for bb_entry in bb_neighbours:
            # a fresh id list per emitted pair, so no list object is shared
            nn_ids = [nn_entry[0] for nn_entry in nn_neighbours]
            yield bb_entry[0], [(item_id, nn_ids)]

# Collect, per BB item, the (item, [NN ids]) entries of every record that
# lists it; list concatenation merges the single-entry lists emitted per
# record.
BB_other_intra = NB_info.mapPartitions(
    combine_BB_withother_in_singledomain).reduceByKey(lambda a, b: a + b).cache()

In [39]:
        def get_final_sim(paths):
            """Score every path using the broadcast kNN tables.

            Each consecutive pair of ids in a path is an edge. The edge's
            stored tuple is looked up in knn_BB_bd first and knn_NB_bd
            second, trying both orientations of the pair in each table.

            arg:
                paths: iterable of sequences of item ids.
            return:
                [((first iid, last iid), calculate_path_confidence(...))*],
                one entry per path, in input order.
            raise:
                KeyError: an edge is present in neither table. Previously
                    this either failed with an unbound local or silently
                    reused the tuple of the preceding edge.
            """
            knn_BB = knn_BB_bd.value
            knn_NB = knn_NB_bd.value

            def lookup_edge(iid1, iid2):
                """Return the stored tuple for the edge (iid1, iid2)."""
                # same priority as the original if/elif chain
                for table in (knn_BB, knn_NB):
                    for head, tail in ((iid1, iid2), (iid2, iid1)):
                        neighbours = table.get(head)
                        if neighbours is not None and tail in neighbours:
                            return neighbours[tail]
                raise KeyError(
                    "no similarity info for edge (%r, %r)" % (iid1, iid2))

            final_score = []
            # memoise edges: the paths passed in share most of their prefix
            edge_cache = {}
            for path in paths:
                edges = []
                for pair in zip(path[:-1], path[1:]):
                    if pair not in edge_cache:
                        edge_cache[pair] = lookup_edge(*pair)
                    edges.append(edge_cache[pair])
                sim_info = [edge[0] for edge in edges]
                mutu_info = [edge[1] for edge in edges]
                frac_mutu = [edge[2] for edge in edges]
                final_score.append(
                    ((path[0], path[-1]), calculate_path_confidence(
                        sim_info, mutu_info, frac_mutu)))
            return final_score

        def extend_BB_source(sourceRDD):
            """Link BB items of the source domain to BB items of the target domain.

            For every (iid, connections) record, each neighbour of iid in
            knn_BB_bd whose id contains "T:" produces
            ((neighbour, iid), connections), i.e. the key is
            (BB_target, BB_source).
            """
            def attach_targets(records):
                for source_iid, connections in records:
                    targets = [
                        neighbour
                        for neighbour in knn_BB_bd.value[source_iid].keys()
                        if "T:" in neighbour]
                    for target_iid in targets:
                        yield (target_iid, source_iid), connections
            return sourceRDD.mapPartitions(attach_targets)

        def extend_BB_target(rdd):
            """Link BB items of the target domain to BB items of the source domain.

            For every (iid, connections) record, each neighbour of iid in
            knn_BB_bd whose id contains "S:" produces
            ((iid, neighbour), connections), i.e. the key is
            (BB_target, BB_source).
            """
            def attach_sources(records):
                for target_iid, connections in records:
                    sources = [
                        neighbour
                        for neighbour in knn_BB_bd.value[target_iid].keys()
                        if "S:" in neighbour]
                    for source_iid in sources:
                        yield (target_iid, source_iid), connections
            return rdd.mapPartitions(attach_sources)
        
        def final_nonjoint_extend(nonjoint_BB):
            """Build and score the paths that start from a (target, source) pair.

            arg:
                nonjoint_BB: (target_iid, source_iid), source_info
                    where source_info is [(NB iid, [NN iid*])*]
            return:
                RDD with one get_final_sim(...) result per input record.
            """
            def score_partition(records):
                for iid_pair, source in records:
                    # every candidate path starts with the
                    # (target_iid, source_iid) pair itself
                    candidate_paths = [iid_pair]
                    for NB_iid, NN_iids in source:
                        via_nb = iid_pair + (NB_iid,)
                        candidate_paths.append(via_nb)
                        candidate_paths.extend(
                            via_nb + (NN_iid,) for NN_iid in NN_iids)
                    yield get_final_sim(candidate_paths)
            return nonjoint_BB.mapPartitions(score_partition)
        
        def calculate_path_confidence(sim_info, mutu_info, frac_mutu):
            """Calculate the similarity and the confidence of a path.

            arg:
                sim_info: similarity of every edge on the path.
                mutu_info: weight of every edge (paired with sim_info).
                frac_mutu: per-edge fractions that are multiplied together.
            return:
                (s_p, c_p) where
                s_p: average of sim_info weighted by mutu_info, or 0.0 when
                    the weights sum to zero.
                c_p: product of frac_mutu, or 0.0 for an empty path (an
                    empty path carries no evidence; previously this raised
                    a TypeError from reduce).
            """
            weighted_sim = sum(s * m for s, m in zip(sim_info, mutu_info))
            total_mutu = sum(mutu_info)
            s_p = 1.0 * weighted_sim / total_mutu if total_mutu else 0.0

            # explicit product: no dependency on `reduce` being imported
            # before this function runs
            fractions = iter(frac_mutu)
            try:
                c_p = next(fractions)
            except StopIteration:
                return s_p, 0.0
            for frac in fractions:
                c_p = c_p * frac
            return s_p, c_p
    
from functools import reduce

# Split the BB-keyed connections by the "S:" / "T:" marker found in the key.
BB_other_intra_source = BB_other_intra.filter(lambda l: "S:" in l[0])
BB_other_intra_target = BB_other_intra.filter(lambda l: "T:" in l[0])

# Both RDDs are re-keyed to (target iid, source iid) so that they can be
# joined on that pair.
extended_BB_source = extend_BB_source(BB_other_intra_source)
extended_BB_target = extend_BB_target(BB_other_intra_target)
# NOTE(review): joined_extended_BB is cached but not used by any cell
# visible here.
joined_extended_BB = extended_BB_source.join(extended_BB_target).cache()
# Score the paths built from the source-side connections only.
final_nonjoint_extended = final_nonjoint_extend(extended_BB_source)

In [None]:
    import numpy as np

    def get_final_extension(cross_extended):
        """Merge multiple paths between the same pair of items.

        If an item-item pair is connected by several paths, their s_p
        values are averaged, weighted by c_p, to get the final similarity.

        Args:
            cross_extended: RDD whose records are lists of the form
                [((iid1, iid2), (s_p, c_p))*]
        Returns:
            xsim: RDD of (iid1, [(iid2, sim)*])
        """
        def swap_info(line):
            """Re-key ((iid1, iid2), info) to (iid1, [(iid2,) + info])."""
            iids, info = line
            return iids[0], [(iids[1], ) + info]

        def get_sim(pairs):
            """Average of the similarities weighted by their certainties."""
            similarity = np.array([pair[0] for pair in pairs])
            certainty = np.array([pair[1] for pair in pairs])
            total_certainty = np.sum(certainty)
            if total_certainty == 0:
                # no path carries any certainty: report 0.0 instead of the
                # NaN produced by a 0 / 0 division
                return 0.0
            return 1.0 * similarity.dot(certainty) / total_certainty

        def merge(iter_items):
            for iid, info in iter_items:
                # group the (s_p, c_p) tuples by the second item id
                grouped = {}
                for pair in info:
                    grouped.setdefault(pair[0], []).append(pair[1:])
                yield iid, [
                    (key, get_sim(scores)) for key, scores in grouped.items()]

        # records are lists of pairs, so flatten them before re-keying
        return cross_extended.flatMap(lambda line: line).map(swap_info).reduceByKey(
            lambda a, b: a + b).mapPartitions(merge)

# final_nonjoint_extended.flatMap(lambda line: line).take(1)
# Smoke check: compute a single merged similarity record.
get_final_extension(final_nonjoint_extended).take(1)

In [None]:
# Debug lookup of one hard-coded item id in the broadcast NB table;
# raises KeyError when that id is not a key of the table.
knn_NB_bd.value['S:B00005K2YG']

In [None]:
    def get_final_extension(self, cross_extended):
        """Merge multiple paths between the same pair of items.

        If an item-item pair is connected by several paths, their s_p
        values are averaged, weighted by c_p, to get the final similarity.

        Args:
            cross_extended: RDD whose records are lists of the form
                [((iid1, iid2), (s_p, c_p))*]
        Returns:
            xsim: RDD of (iid1, [(iid2, sim)*])
        """
        def swap_info(line):
            """Re-key ((iid1, iid2), info) to (iid1, [(iid2,) + info])."""
            iids, info = line
            return iids[0], [(iids[1], ) + info]

        def get_sim(pairs):
            """Average of the similarities weighted by their certainties."""
            similarity = np.array([pair[0] for pair in pairs])
            certainty = np.array([pair[1] for pair in pairs])
            total_certainty = np.sum(certainty)
            if total_certainty == 0:
                # no path carries any certainty: report 0.0 instead of the
                # NaN produced by a 0 / 0 division
                return 0.0
            return 1.0 * similarity.dot(certainty) / total_certainty

        def merge(iter_items):
            for iid, info in iter_items:
                # group the (s_p, c_p) tuples by the second item id
                grouped = {}
                for pair in info:
                    grouped.setdefault(pair[0], []).append(pair[1:])
                yield iid, [
                    (key, get_sim(scores)) for key, scores in grouped.items()]

        # Flatten the per-record lists first and re-key with map:
        # flatMap(swap_info) would split each returned (key, value) tuple
        # into two separate records and break reduceByKey.
        return cross_extended.flatMap(lambda line: line).map(swap_info).reduceByKey(
            lambda a, b: a + b).mapPartitions(merge)
    


In [None]:
# Debug peek at one ((target iid, source iid), connections) record.
extended_BB_target.take(1)