In [157]:
import pandas as pd
import numpy as np
from itertools import chain, combinations


In [29]:
# from itertools import islice  #  slice set

## data process

In [32]:
data = pd.read_csv("browsing.txt", header=None)

In [122]:
# change data df to [[matrix]]
datalist = []
for k,v in data.iterrows():
    vlist = v[0].split(' ')
    datalist.append([i for i in vlist if i])

In [227]:
## constant variables
S = 100 # min support
itemSuppDict = {} # support dictionary for frequent sets 


In [187]:
# generate candidate sets
def getCandSet(itemsets, k):
    return set([frozenset(set1.union(set2)) for set1 in itemsets for set2 in itemsets if len(set1.union(set2) ) == k])
    

In [163]:
# Prune non-frequent
def getFreqDict(itemCount):
    return {key:value for key,value in itemCount.items() if value >= S}

In [141]:
# count the support of candidate sets
def countSupport(itemsets, transet):
    """
    params itemsets: list
    params transet: df, browsing data
    return itemsetCount: dict
    """
    itemCount = {}
    for item in itemsets:
        itemCount[item] = sum([1 for t in transet if item.issubset(t)])
    return itemCount
        

In [204]:
# generate association rules
# compute confidence score

def subsets(arr):
    """ Returns non empty subsets of arr"""
    return chain(*[combinations(arr, i + 1) for i, a in enumerate(arr)])

def getAssociationRule(freqItemset, suppDict,k):
    """
    freqItemset: k-item frequent set
    suppDict: itemSuppDict
    return rules: Association Rule
    """
    rules = {}
    for item in freqItemset:
        _subsets = map(frozenset, [x for x in subsets(item) if len(x) < k])
        for i in _subsets:
            rightItem = i
            leftItem = item.difference(rightItem)
#             print(suppDict[item])
#             print(suppDict[leftItem])
            conf = suppDict[item]/suppDict[leftItem]
            r = ','.join(leftItem) + '|' + ','.join(rightItem)
            rules[r] = conf
    return rules
            

In [219]:
def getTopRule(rules, n):
    """
    params rules: {rule:conf}
    n: top n
    return top n rules
    """
    sortRules = sorted(rules.items(), key=lambda item: (-item[1],item[0]))
    return {k[0]:k[1] for k in list(sortRules)[:n]}
    

## k = 1

In [127]:
# k = 1
candSet1 = set()
for d in datalist:
    for i in d:
        candSet1.add(frozenset([i]))


In [142]:
candSetSupp1 = countSupport(candSet1, datalist)

In [144]:
freqSet1 = getFreqSet(candSetSupp1)

In [164]:
freqSetDict1 = getFreqDict(candSetSupp1)

In [228]:
# update itemSuppDict
itemSuppDict.update(freqSetDict1)

# k = 2


In [152]:
# k = 2
candSet2 = getCandSet(freqSet1,2)

In [155]:
candSetCount2 = countSupport(candSet2, datalist)

In [167]:
freqSetDict2 = getFreqDict(candSetCount2)

In [168]:
# update itemSuppDict
itemSuppDict.update(freqSetDict2)

In [221]:
freqSet2 = freqSetDict2.keys()

In [223]:
rules2 = getAssociationRule(freqSet2,itemSuppDict,2 )

In [226]:
getTopRule(rules2,5)

{'DAI93865|FRO40251': 1.0,
 'GRO85051|FRO40251': 0.999176276771005,
 'GRO38636|FRO40251': 0.9906542056074766,
 'ELE12951|FRO40251': 0.9905660377358491,
 'DAI88079|FRO40251': 0.9867256637168141}

# k = 3


In [188]:
# k = 3
def getKItemRule(prevFreqSet, suppDict, k, n):
    """
    prevFreqSet: previous frequent set
    suppDict: itemSuppDict
    k: k-item
    n: top n rules
    """
    candSet = getCandSet(prevFreqSet,k)
    candSetCount = countSupport(candSet, datalist)
    freqSetDict = getFreqDict(candSetCount)
    # update itemSuppDict
    suppDict.update(freqSetDict)
    freqSet = freqSetDict3.keys() 
    rules = getAssociationRule(freqSet,suppDict,k)
    print(getTopRule(rules,n))
    

In [190]:
getKItemRule(freqSet2, itemSuppDict, 3, 5)

{'DAI23334,ELE92920|DAI62779': 1.0,
 'DAI62779,DAI88079|FRO40251': 1.0,
 'DAI75645,GRO85051|FRO40251': 1.0,
 'FRO92469,ELE20847|FRO40251': 1.0,
 'GRO73461,GRO85051|FRO40251': 1.0,
 'GRO85051,DAI31081|FRO40251': 1.0,
 'GRO85051,DAI55911|FRO40251': 1.0,
 'GRO85051,ELE17451|FRO40251': 1.0,
 'GRO85051,ELE20847|FRO40251': 1.0,
 'GRO85051,ELE26917|FRO40251': 1.0}