# Frequent Itemset

## Overview of frequent itemset methods:
1. Brute Force
  1. Triangular Matrix
  2. Triples List
  
2. A priori

3. SON
  
## Dataset
We will be using http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html

We can take a sample of this dataset to facilitate development.

In [None]:
## Take the first 10000 lines as a small sample for development
!head -n 10000 usersha1-artmbid-artname-plays.tsv > sample.tsv

In [62]:
from pyspark import SparkContext
import collections
import itertools
import math
import bisect

## Task 1: Find artist pairs with support >= 10, using the Triangular Matrix method

In [50]:
# Input file handed to get_baskets_spark().
filename='sample.tsv'
# Minimum support: an itemset is kept when its count is >= threshold.
threshold=10

In [27]:
def process_line(text):
    """Parse one tab-separated line into a single two-field record.

    Empty fields are dropped before counting, so a line qualifies only when
    exactly four non-empty fields remain.

    Returns a one-element list holding the first two fields (presumably the
    user id and the artist id -- confirm against the dataset layout), or an
    empty list for every other line. The list-of-records shape lets callers
    feed this function to ``flatMap``.
    """
    fields = list(filter(None, text.split('\t')))
    return [fields[:2]] if len(fields) == 4 else []

def get_baskets_spark(filename):
    """Read ``filename`` with Spark and group its records into baskets.

    Each line is parsed by ``process_line``; the resulting two-field records
    are grouped by their first field, so every basket is a
    ``(key, iterable_of_values)`` pair as returned by
    ``groupByKey().collect()``.

    The pure-Python approach cannot be reused here because Spark reads the
    file in parallel. Grouping with ``groupByKey`` also means the input lines
    may appear in any order, which the pure-Python logic does not allow.

    Args:
        filename: Path of the tab-separated input file.

    Returns:
        A list with one ``(key, values)`` pair per distinct key.
    """
    # In Q1, Q2, Q3, Q4 Spark is only used for reading data.
    sc = SparkContext("local","PySpark Tutorial")
    try:
        # Use only 1 core
        baskets = (
            sc.textFile(filename, minPartitions=1)
            .flatMap(process_line)
            .groupByKey()
            .collect()
        )
    finally:
        # Always release the context, even when reading or parsing fails;
        # otherwise a leftover active context blocks creating the next one.
        sc.stop()
    return baskets

def get_item_dict(baskets):
    """Give every distinct item a dense integer id.

    Ids start at 0 and follow the order in which items are first seen while
    walking the baskets; the items of a basket are read from ``basket[1]``.

    Returns a dict mapping item -> id.
    """
    item_ids = {}
    for basket in baskets:
        for item in basket[1]:
            # setdefault leaves an already-numbered item untouched.
            item_ids.setdefault(item, len(item_ids))
    return item_ids

def inverse_dict(d):
    """Return a new dict with the keys and values of ``d`` swapped.

    When several keys share a value, the key that comes last in iteration
    order is the one that survives.
    """
    return dict(zip(d.values(), d.keys()))

In [58]:
# Load the baskets and give every distinct item a dense integer id.
baskets = get_baskets_spark(filename)
item_dict = get_item_dict(baskets)
item_dict_inv = inverse_dict(item_dict)
n = len(item_dict)

# One counter per unordered pair (i, j) with i < j, stored row by row in a
# flat list: the upper triangle of an n x n matrix without its diagonal.
n_pairs = n * (n - 1) // 2
tri_matrix = [0] * n_pairs
for _key, items in baskets:
    for first, second in itertools.combinations(items, 2):
        i, j = sorted((item_dict[first], item_dict[second]))
        # Cells of rows i..n-1 number (n-i)*(n-i-1)/2, so row i starts at
        # n_pairs minus that amount; (j - i - 1) is the offset inside row i.
        idx = n_pairs - (n - i) * (n - i - 1) // 2 + j - i - 1
        tri_matrix[idx] += 1

# Walk the flat counters, keep the frequent ones and map each flat index
# back to its (i, j) pair, then to the original item names.
frequent_itemset_list = []
for idx, count in enumerate(tri_matrix):
    if count < threshold:
        continue
    # Closed-form inverse of the row-start formula used when filling the matrix.
    i = int(n - 2 - math.floor(math.sqrt(-8 * idx + 4 * n * (n - 1) - 7) / 2.0 - 0.5))
    j = idx + i + 1 - n_pairs + (n - i) * (n - i - 1) // 2
    # Report each pair with its two item names in ascending order.
    pair = tuple(sorted((item_dict_inv[i], item_dict_inv[j])))
    frequent_itemset_list.append((pair, count))
# Most frequent pairs first; ties keep their flat-index order.
frequent_itemset_list.sort(key=lambda entry: entry[1], reverse=True)

In [59]:
print(len(frequent_itemset_list))
frequent_itemset_list

28


[(('a74b1b7f-71a5-4011-9441-d0b5e4122711',
   'cc197bad-dc9c-440d-a5b5-d52ba2e14234'),
  18),
 (('a74b1b7f-71a5-4011-9441-d0b5e4122711',
   'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d'),
  18),
 (('52074ba6-e495-4ef3-9bb4-0703888a9f68',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  14),
 (('a74b1b7f-71a5-4011-9441-d0b5e4122711',
   'f6f2326f-6b25-4170-b89d-e235b25508e8'),
  14),
 (('8f6bd1e4-fbe1-4f50-aa9b-94c450ec0f11',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  13),
 (('8c538f11-c141-4588-8ecb-931083524186',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  13),
 (('72c536dc-7137-4477-a521-567eeb840fa8',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  13),
 (('b071f9fa-14b0-4217-8e97-eb41da73f598',
   'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d'),
  12),
 (('9c9f1380-2516-4fc9-a3e6-f9f61941d090',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  12),
 (('1cc5adcd-1422-4b5c-a3cd-3ecd4f43f506',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  12),
 (('69ee3720-a7cb-4402-b48d-a02c366f2bcf',
   'a74b1b7f-71a5

## Task 2: Find artist pairs with support >= 10, using the Triples List method

In [67]:
class FirstList(collections.UserList):
    """List record that is ordered by its first element only.

    ``record < other`` delegates to the ``__lt__`` of the first element, so a
    sorted list of these records can be searched with ``bisect`` using a bare
    key value as the probe.
    """

    def __lt__(self, other):
        head = self.data[0]
        return head.__lt__(other)

In [68]:
# Load the baskets and give every distinct item a dense integer id.
baskets = get_baskets_spark(filename)
item_dict = get_item_dict(baskets)
item_dict_inv = inverse_dict(item_dict)
n = len(item_dict)

# Sorted list of [pair_key, count] records, where pair_key = n*i + j encodes
# the pair (i, j) with i <= j. Only pairs that actually occur get a record.
tuples = []
for _key, items in baskets:
    for first, second in itertools.combinations(items, 2):
        i, j = sorted((item_dict[first], item_dict[second]))
        idx = n * i + j
        idx_insert = bisect.bisect_left(tuples, idx)
        if idx_insert < len(tuples) and tuples[idx_insert][0] == idx:
            # The pair already has a record: bump its count.
            tuples[idx_insert][1] += 1
        else:
            # First sighting: insert at the position that keeps the list sorted.
            tuples.insert(idx_insert, FirstList([idx, 1]))

# Keep the frequent records and decode each key back into item names.
frequent_itemset_list = []
for idx, count in tuples:
    if count < threshold:
        continue
    i, j = divmod(idx, n)
    # Report each pair with its two item names in ascending order.
    pair = tuple(sorted((item_dict_inv[i], item_dict_inv[j])))
    frequent_itemset_list.append((pair, count))
# Most frequent pairs first; ties keep their key order.
frequent_itemset_list.sort(key=lambda entry: entry[1], reverse=True)

In [69]:
frequent_itemset_list

[(('a74b1b7f-71a5-4011-9441-d0b5e4122711',
   'cc197bad-dc9c-440d-a5b5-d52ba2e14234'),
  18),
 (('a74b1b7f-71a5-4011-9441-d0b5e4122711',
   'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d'),
  18),
 (('52074ba6-e495-4ef3-9bb4-0703888a9f68',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  14),
 (('a74b1b7f-71a5-4011-9441-d0b5e4122711',
   'f6f2326f-6b25-4170-b89d-e235b25508e8'),
  14),
 (('8f6bd1e4-fbe1-4f50-aa9b-94c450ec0f11',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  13),
 (('8c538f11-c141-4588-8ecb-931083524186',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  13),
 (('72c536dc-7137-4477-a521-567eeb840fa8',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  13),
 (('b071f9fa-14b0-4217-8e97-eb41da73f598',
   'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d'),
  12),
 (('9c9f1380-2516-4fc9-a3e6-f9f61941d090',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  12),
 (('1cc5adcd-1422-4b5c-a3cd-3ecd4f43f506',
   'a74b1b7f-71a5-4011-9441-d0b5e4122711'),
  12),
 (('69ee3720-a7cb-4402-b48d-a02c366f2bcf',
   'a74b1b7f-71a5

## Task 3: Find all frequent itemsets with support >= threshold (7 in the run below), using the A-Priori method

In [70]:
def get_item_counter(baskets):
    """Count how many times each item occurs across all baskets.

    The items of a basket are read from ``basket[1]``.

    Returns a ``collections.Counter`` mapping item -> number of occurrences.
    """
    return collections.Counter(
        item for basket in baskets for item in basket[1]
    )

def get_item_dict_threshold(item_counter,threshold):
    """Number the items whose count reaches ``threshold``.

    Items are visited in the iteration order of ``item_counter``; those with
    ``count >= threshold`` receive consecutive ids starting at 0.

    Returns a dict mapping item -> id.
    """
    frequent_items = (
        item for item, count in item_counter.items() if count >= threshold
    )
    return {item: position for position, item in enumerate(frequent_items)}

def apriori_method(baskets,threshold,method):
    """Run one pair-counting ``method`` restricted to frequent single items.

    First pass: count the single items and number those whose count is at
    least ``threshold``. Second pass: call
    ``method(baskets, threshold, item_dict)`` with that numbering and return
    whatever it returns.
    """
    frequent_items = get_item_dict_threshold(get_item_counter(baskets), threshold)
    return method(baskets, threshold, frequent_items)

def tuple_wrapper(t):
    """Return ``t`` unchanged if it is exactly a ``tuple``, else ``(t,)``.

    This lets single items and multi-item itemsets be handled uniformly as
    tuples. The check is on the exact type, so tuple subclasses get wrapped.
    """
    return t if type(t) is tuple else (t,)

def get_possible_k(item_dict, k):
    """Build the candidate itemsets of size ``k`` from known itemsets.

    Every unordered pair of keys of ``item_dict`` is merged (single items are
    treated as one-element tuples). A merge containing exactly ``k`` distinct
    items becomes a candidate.

    Returns a dict mapping ``frozenset(candidate items)`` to the two-element
    list ``[left_key, right_key]`` that produced it. When several key pairs
    give the same candidate, the last pair visited wins.
    """
    candidates = {}
    for left, right in itertools.combinations(item_dict.keys(), 2):
        merged = frozenset(tuple_wrapper(left)).union(tuple_wrapper(right))
        if len(merged) == k:
            candidates[merged] = [left, right]
    return candidates

def get_dict_from_frequent(frequent):
    """Number the entries of ``frequent`` in the order they are listed.

    Returns a dict mapping entry -> id, where the id is the size of the dict
    at the moment the entry is assigned.
    """
    index_of = {}
    for itemset in frequent:
        # len() is evaluated before the store, so a new entry gets the next free id.
        index_of[itemset] = len(index_of)
    return index_of

In [230]:
def triple_list_method(baskets, threshold, item_dict=None, k=2):
    """Find frequent itemsets of size ``k`` with a sorted triples list.

    Each counted itemset is represented by a pair of ``item_dict`` keys
    ``(a, b)`` and stored as one ``[n*i + j, count]`` record, where ``i <= j``
    are the ids of ``a`` and ``b`` and ``n = len(item_dict)``. Records are
    kept sorted by key and located with ``bisect``.

    Args:
        baskets: Sequence of ``(key, items)`` pairs. When ``item_dict`` is
            given and ``k == 2`` the sequence is modified IN PLACE: every
            basket is replaced by ``(key, [items present in item_dict])``.
            Later calls with a larger ``k`` therefore see the pruned baskets.
        threshold: Minimum count (inclusive) for an itemset to be reported.
        item_dict: Mapping itemset -> integer id. ``None`` means "number
            every item found in the baskets" (no A-Priori pruning).
        k: Size of the itemsets to count; the code indexes two components of
            every counted pair, so ``k >= 2`` is expected. For ``k >= 3``
            only the candidates produced by ``get_possible_k`` are counted.

    Returns:
        A list of ``(items, count)`` tuples with ``count >= threshold``,
        where ``items`` is the sorted tuple of the individual items, ordered
        by count in descending order.
    """
    if item_dict is None:
        item_dict = get_item_dict(baskets)
    elif k == 2:
        # A-Priori pruning: drop items that are not frequent on their own.
        for pos, basket in enumerate(baskets):
            kept = [item for item in basket[1] if item in item_dict]
            baskets[pos] = (basket[0], kept)

    item_dict_inv = inverse_dict(item_dict)
    n = len(item_dict)

    # For k >= 3, map each candidate frozenset to the two known itemsets
    # whose union forms it; None means every pair is counted directly.
    possible_k = get_possible_k(item_dict, k) if k >= 3 else None

    tuples = []
    for basket in baskets:
        for kpair in itertools.combinations(basket[1], k):
            if possible_k is None:
                pair = kpair
            else:
                pair = possible_k.get(frozenset(kpair))
                if pair is None:
                    # Not a candidate, so it cannot be frequent.
                    continue

            i = item_dict[pair[0]]
            j = item_dict[pair[1]]
            if i > j:
                i, j = j, i
            idx = n * i + j

            pos = bisect.bisect_left(tuples, idx)
            if pos < len(tuples) and tuples[pos][0] == idx:
                tuples[pos][1] += 1
            else:
                # insert() at len(tuples) appends, so one call covers both cases.
                tuples.insert(pos, FirstList([idx, 1]))

    frequent_itemset_list = []
    for idx, count in tuples:
        if count < threshold:
            continue
        i, j = divmod(idx, n)
        # Flatten the two components back into individual items.
        members = set()
        for part in (item_dict_inv[i], item_dict_inv[j]):
            members.update(tuple_wrapper(part))
        frequent_itemset_list.append((tuple(sorted(members)), count))
    frequent_itemset_list.sort(key=lambda entry: entry[1], reverse=True)
    return frequent_itemset_list

In [231]:
def apriori_all_method(baskets,threshold,method):
    """Find the frequent itemsets of every size, level by level.

    Level 1 keeps the single items whose count is at least ``threshold``.
    Each further level ``k`` calls ``method(baskets, threshold, item_dict, k)``
    where ``item_dict`` numbers every frequent itemset found so far; the loop
    stops at the first level that yields nothing. A progress line is printed
    after each successful level.

    Returns a list whose element ``k-1`` is the list of ``(itemset, count)``
    results for size ``k`` (level 1 is sorted by count, descending).
    """
    item_counter = get_item_counter(baskets)
    itemsets_1 = [
        (item, count) for item, count in item_counter.items() if count >= threshold
    ]
    itemsets_1.sort(key=lambda entry: entry[1], reverse=True)

    itemsets_list = [itemsets_1]
    # Running list of every frequent itemset found so far, of any size.
    frequent_list = [entry[0] for entry in itemsets_1]

    k = 2
    while True:
        item_dict = get_dict_from_frequent(frequent_list)
        itemsets = method(baskets, threshold, item_dict, k)
        if len(itemsets) == 0:
            break
        itemsets_list.append(itemsets)
        frequent_list.extend(entry[0] for entry in itemsets)
        k += 1
        print(f"k={k} initiated")

    return itemsets_list

In [345]:
# Reload the baskets, then run the level-wise A-Priori loop using the
# triples-list counter for every level.
baskets=get_baskets_spark(filename)
# Minimum support used for this run (itemsets with count >= 7 are kept).
threshold=7
itemsets_list=apriori_all_method(baskets,threshold,triple_list_method)

k=3 initiated
k=4 initiated


In [346]:
def print_itemsets(itemsets_list):
    """Print how many frequent itemsets were found for each size.

    ``itemsets_list[k-1]`` is expected to hold the itemsets of size ``k``.
    Sizes are reported from the largest down to 1.
    """
    for size in range(len(itemsets_list), 0, -1):
        found = itemsets_list[size - 1]
        print(f"# of frequent itemsets of size {size}: {len(found)}")
        # To list the itemsets themselves, loop over `found` here and print each one.

In [347]:
print_itemsets(itemsets_list)

# of frequent itemsets of size 3: 6
# of frequent itemsets of size 2: 128
# of frequent itemsets of size 1: 207


## Task 4: Find all frequent itemsets with support >= threshold (7 in the run below), using the SON method

In [362]:
def apriori_all_method(baskets, threshold, method, son=False, tot_baskets=0):
    """Find the frequent itemsets of every size, optionally as a SON worker.

    Level 1 keeps the single items whose count is at least the threshold.
    Each further level ``k`` calls ``method(baskets, threshold, item_dict, k)``
    where ``item_dict`` numbers every frequent itemset found so far; the loop
    stops at the first level that yields nothing. A progress line is printed
    after each successful level.

    Args:
        baskets: Iterable of ``(key, items)`` pairs. Anything that is not a
            list (e.g. a partition iterator) is materialised into a list.
        threshold: Minimum support count over the whole dataset.
        method: Counting function called as
            ``method(baskets, threshold, item_dict, k)``.
        son: When true, ``baskets`` is treated as one chunk of the data: the
            threshold is scaled by ``len(baskets) / tot_baskets`` (rounded
            down) and the flat list of locally frequent itemsets is returned.
        tot_baskets: Number of baskets in the whole dataset; required (and
            must be positive) when ``son`` is true.

    Returns:
        With ``son`` false, a list whose element ``k-1`` is the list of
        ``(itemset, count)`` results for size ``k``. With ``son`` true, a
        flat list of the frequent itemsets of all sizes, without counts.

    Raises:
        ValueError: If ``son`` is true and ``tot_baskets`` is not positive.
    """
    if not isinstance(baskets, list):
        baskets = list(baskets)
    if son:
        if tot_baskets <= 0:
            # The scaling below divides by tot_baskets; fail with a clear
            # message instead of an opaque ZeroDivisionError.
            raise ValueError("tot_baskets must be a positive basket count when son=True")
        # Scale the global threshold to this chunk's share of the baskets.
        threshold = math.floor(threshold * len(baskets) / tot_baskets)

    item_counter = get_item_counter(baskets)
    itemsets_1 = sorted(
        [(item, count) for item, count in item_counter.items() if count >= threshold],
        key=lambda entry: entry[1],
        reverse=True,
    )

    itemsets_list = [itemsets_1]
    # Running list of every frequent itemset found so far, of any size.
    frequent_list = [entry[0] for entry in itemsets_1]

    k = 2
    while True:
        item_dict = get_dict_from_frequent(frequent_list)
        itemsets = method(baskets, threshold, item_dict, k)
        if len(itemsets) == 0:
            break
        itemsets_list.append(itemsets)
        frequent_list.extend(entry[0] for entry in itemsets)
        k += 1
        print(f"k={k} initiated")

    return frequent_list if son else itemsets_list

In [267]:
def count_candidates(baskets, candidates):
    """Count in how many baskets each candidate itemset is fully contained.

    Args:
        baskets: Iterable of ``(key, items)`` pairs.
        candidates: Iterable of candidates; each is either a single item or
            a tuple of items.

    Returns:
        A ``dict_items`` view of ``(candidate, count)`` pairs. Candidates
        that occur in no basket are absent.
    """
    # Normalise every candidate to a tuple once, not once per basket.
    # Materialising the pairs also lets `candidates` be a one-shot iterable.
    candidate_members = [
        (candidate, tuple_wrapper(candidate)) for candidate in candidates
    ]

    item_counter = collections.defaultdict(int)
    for basket in baskets:
        items = frozenset(basket[1])
        for candidate, members in candidate_members:
            if items.issuperset(members):
                item_counter[candidate] += 1
    return item_counter.items()

In [372]:
# Input file and minimum support count for the SON run.
filename='sample.tsv'
threshold=7

In [373]:
# Keep the context open: the following cells run their work on the RDD itself.
sc = SparkContext("local","PySpark Tutorial")
# Build the baskets as an RDD of (key, values) pairs split over at least two
# partitions; nothing is collected to the driver here.
# NOTE(review): the chained assignment also rebinds `baskets` to this RDD --
# presumably unintended, confirm before relying on `baskets` afterwards.
baskets_rdd=baskets=sc.textFile(filename,minPartitions=2).flatMap(lambda x:process_line(x)).groupByKey()

In [374]:
# Total number of baskets; used to scale the support threshold per partition.
tot_baskets=baskets_rdd.count()

In [375]:
tot_baskets

205

In [376]:
# SON pass 1: run A-Priori on each partition with son=True, so the threshold
# is scaled to the partition's share of the baskets, then merge the locally
# frequent itemsets of all partitions into one de-duplicated candidate list.
candidates=baskets_rdd.mapPartitions(lambda x:apriori_all_method(x,threshold,triple_list_method,True,tot_baskets))\
            .distinct().collect()

In [377]:
# SON pass 2: count every candidate inside each partition, add the
# per-partition counts together and keep the candidates whose total count
# reaches the full threshold.
itemsets=baskets_rdd.mapPartitions(lambda x:count_candidates(x,candidates)).\
         reduceByKey(lambda a,b:a+b).filter(lambda x:x[1]>=threshold).collect()

In [378]:
# Bucket the frequent (itemset, count) results by the size of the itemset.
itemsets_dict = collections.defaultdict(list)
for itemset in itemsets:
    # Single items are not tuples; wrap them so len() gives their size (1).
    k = len(tuple_wrapper(itemset[0]))
    bucket = itemsets_dict[k]
    bucket.append(itemset)

In [379]:
itemsets_dict.keys()

dict_keys([1, 2, 3])

In [380]:
# (itemset size, number of frequent itemsets of that size) for each bucket.
summary=[(k,len(v)) for k,v in itemsets_dict.items()]
summary

[(1, 207), (2, 128), (3, 6)]

In [381]:
# Shut down the SparkContext opened for the SON cells.
sc.stop()