In [18]:
import pandas as pd
import numpy as np

In [19]:
def support_val(db_df, prev_lst):
    """Count how many transactions contain each candidate itemset.

    Parameters
    ----------
    db_df : pandas.DataFrame
        Transaction database with an ``items`` column; every cell is an
        iterable holding the items of one transaction. The frame is only
        read, never modified.
    prev_lst : iterable of set
        Candidate itemsets. Each element must provide ``issubset``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, in input order, with columns ``itemset`` and
        ``sup``. ``sup`` is an absolute transaction count, not a fraction.
    """
    candidates = list(prev_lst)

    # Build each transaction's set once instead of once per candidate, and
    # iterate over values so a non-default index on db_df cannot break lookups.
    transactions = [set(items) for items in db_df['items']]

    counts = [
        sum(1 for transaction in transactions if itemset.issubset(transaction))
        for itemset in candidates
    ]

    return pd.DataFrame(list(zip(candidates, counts)), columns=['itemset', 'sup'])

In [20]:
def filter_freq_element(minsup,prev_candidate):
    """Return the rows of ``prev_candidate`` whose ``sup`` is at least ``minsup``.

    The original row labels are kept; callers reset the index themselves
    when they need positional labels.
    """
    meets_threshold = prev_candidate['sup'].ge(minsup)
    return prev_candidate.loc[meets_threshold]

In [21]:
def join_freq(itemset):
    """Build candidate itemsets by taking the union of every pair of rows.

    Parameters
    ----------
    itemset : pandas.DataFrame
        Frame with an ``itemset`` column. Each cell is either a single item
        given as a string (first pass) or a set of items (later passes).

    Returns
    -------
    list
        The distinct pairwise unions, in first-seen order.
    """
    # Normalise once, outside the pairing loops: a bare string is a
    # one-item set. Doing it per entry also covers a mix of strings and sets.
    members = [
        {entry} if isinstance(entry, str) else entry
        for entry in itemset['itemset']
    ]

    join_lst = []
    seen = set()  # frozenset keys give O(1) duplicate detection
    for position, left in enumerate(members):
        for right in members[position + 1:]:
            union_candidate = left.union(right)
            key = frozenset(union_candidate)
            if key not in seen:
                seen.add(key)
                join_lst.append(union_candidate)
    return join_lst

In [22]:
def confidence_val(db_df,rules,minconf):
    """Compute confidence(A -> B) = count(A and B) / count(A) for every rule.

    Parameters
    ----------
    db_df : pandas.DataFrame
        Transaction database passed through to ``support_val``.
    rules : list of dict
        One dict per itemset. Each key is the antecedent, its items joined
        with ``", "``; each value is the list of consequent items.
    minconf : float
        Rules whose confidence is below this value are dropped.

    Returns
    -------
    dict
        Maps ``"<antecedent>-><consequent items joined by ','>"`` to the
        rule's confidence, for rules with confidence >= ``minconf``.
    """
    confdict = {}
    for rule_group in rules:
        for antecedent_key, consequent in rule_group.items():
            antecedent = set(antecedent_key.split(", "))

            # support_val returns a one-row frame; .iloc[0] extracts the scalar
            # (calling int() on the Series itself is deprecated in pandas).
            antecedent_count = int(support_val(db_df, [antecedent])['sup'].iloc[0])
            joint_count = int(
                support_val(db_df, [antecedent | set(consequent)])['sup'].iloc[0]
            )

            # An antecedent that never occurs gives confidence 0, not a division error.
            conf = joint_count / antecedent_count if antecedent_count else 0

            if conf >= minconf:
                confdict[antecedent_key + "->" + ",".join(consequent)] = conf
    return confdict

In [23]:
def lift_val(db_df, confdict):
    """Compute lift(A -> B) = confidence(A -> B) / support(B) for every rule.

    ``support(B)`` here is the relative support, i.e. the fraction of
    transactions containing B. ``support_val`` returns an absolute count, so
    it is divided by the number of transactions first.

    Parameters
    ----------
    db_df : pandas.DataFrame
        Transaction database; its row count is the number of transactions.
    confdict : dict
        Maps rule strings of the form ``"a, b->c,d"`` to their confidence.

    Returns
    -------
    dict
        Maps the same rule strings to their lift. A consequent that occurs
        in no transaction gets lift 0 instead of raising ZeroDivisionError.
    """
    liftdict = {}
    n_transactions = len(db_df)
    for rule, confvalue in confdict.items():
        # Split on the "->" separator itself so that a lone ">" inside an
        # item name cannot be mistaken for the arrow.
        consequent = rule.partition("->")[2].split(",")

        # .iloc[0] extracts the scalar; int() on a Series is deprecated in pandas.
        consequent_count = int(support_val(db_df, [set(consequent)])['sup'].iloc[0])

        if consequent_count == 0:
            liftdict[rule] = 0
        else:
            liftdict[rule] = confvalue / (consequent_count / n_transactions)
    return liftdict
        

In [24]:
def association_rules(candidate):
    """Generate single-item-versus-rest rules for every itemset.

    For each item ``x`` of an itemset with remaining items ``rest`` (sorted),
    two rules are produced: ``x -> rest`` and ``rest -> x``.

    Parameters
    ----------
    candidate : pandas.DataFrame
        Frame with an ``itemset`` column of item collections.

    Returns
    -------
    list of dict
        One dict per row, in row order. Keys are antecedents (items joined
        with ``", "``), values are lists of consequent items. An itemset with
        fewer than two items gives an empty dict, because a rule needs a
        non-empty antecedent and a non-empty consequent.
    """
    rules = []
    for itemset in candidate['itemset']:
        # A bare string is one item; sorting it would split it into characters.
        items = [itemset] if isinstance(itemset, str) else sorted(itemset)

        dict_rules = {}
        if len(items) >= 2:
            for position, single in enumerate(items):
                rest = items[:position] + items[position + 1:]
                dict_rules[single] = rest
                dict_rules[", ".join(rest)] = [single]
        rules.append(dict_rules)
    return rules

In [25]:
def lift_conf_sup(db_df,liftdata, confdata, minsup):
    """Tabulate lift, confidence and support count for every rule, then print it.

    Parameters
    ----------
    db_df : pandas.DataFrame
        Transaction database passed through to ``support_val``.
    liftdata : dict
        Maps rule strings of the form ``"a, b->c,d"`` to their lift.
    confdata : dict
        Maps the same rule strings to their confidence.
    minsup : number
        Minimum support, on the same scale as the counts returned by
        ``support_val``; rules below it are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``rule``, ``lift``, ``conf``, ``sup``; sorted by rule text,
        with a fresh 0..n-1 index. The frame is also printed.
    """
    data = []
    for rule, liftvalue in liftdata.items():
        # Split on the "->" separator as a whole: cutting at the first "-"
        # would truncate any antecedent containing a hyphenated item name.
        antecedent, _, consequent = rule.partition("->")
        rule_items = set(antecedent.split(", ") + consequent.split(","))

        # .iloc[0] extracts the scalar; float() on a Series is deprecated in pandas.
        sup = float(support_val(db_df, [rule_items])['sup'].iloc[0])
        data.append((rule, liftvalue, confdata[rule], sup))

    # Rule strings are unique dict keys, so ordering by them alone is enough.
    data.sort(key=lambda row: row[0])

    all_data = pd.DataFrame(data, columns=['rule', 'lift', 'conf', 'sup'])
    all_data = filter_freq_element(minsup,all_data)
    all_data = all_data.reset_index(drop=True)
    print(all_data)
    return all_data

In [26]:
def _read_transactions(csv):
    """Read the basket CSV and return one sorted item list per non-empty row."""
    # NOTE(review): on_bad_lines='skip' drops rows that have more fields than
    # pandas expects; confirm the CSV is padded so no transaction is lost.
    dataset = pd.read_csv(csv, on_bad_lines='skip', header=None)

    transactions = []
    for _, row in dataset.iterrows():
        items = []
        for item in row:
            # A non-string cell (NaN padding) or a "total" cell ends the transaction.
            if not isinstance(item, str) or "total" in item:
                break
            items.append(item)
        # Rows without any item are skipped instead of raising KeyError.
        if items:
            transactions.append(sorted(items))
    return transactions


def main(minsup,minconf,minlift,csv):
    """Mine frequent itemsets level by level and print the rules of each level.

    Parameters
    ----------
    minsup : float
        Minimum support as a fraction of the number of transactions.
    minconf : float
        Minimum confidence a rule needs to be reported.
    minlift : float
        Accepted for interface compatibility; it is not applied to the
        results at the moment.
    csv : str
        Path of a header-less CSV file with one transaction per row.

    Returns
    -------
    None
        Results are printed by ``lift_conf_sup``, one table per level.
    """
    item_lst = _read_transactions(csv)
    df = pd.DataFrame({'items': item_lst})

    # Turn the fractional threshold into an absolute transaction count.
    minsup *= len(item_lst)

    # First pass: support count of every individual item.
    content = sorted({item for transaction in item_lst for item in transaction})
    count_content = [
        (item, sum(1 for transaction in item_lst if item in transaction))
        for item in content
    ]
    c1_df = pd.DataFrame(count_content, columns=["itemset", "sup"])
    freqset = filter_freq_element(minsup,c1_df)

    # Keep joining until no frequent set (0) or a single one (1) is left, or
    # until a pass prunes nothing.
    # NOTE(review): stopping when nothing was pruned can end the search before
    # larger itemsets are tried -- confirm this stop rule is intended.
    tempset = 0
    while len(freqset) not in (0, 1) and tempset != len(freqset):
        join = join_freq(freqset)
        candidate = support_val(df,join)
        freqset = filter_freq_element(minsup,candidate)
        tempset = len(candidate)

        frequent = freqset.reset_index(drop=True)
        rules = association_rules(frequent)
        conf = confidence_val(df,rules,minconf)
        lift = lift_val(df, conf)
        lift_conf_sup(df,lift,conf,minsup)
        print()

    # No extra pass is needed after the loop: a level that ends with exactly
    # one frequent itemset has already been reported inside the loop, and if
    # the loop never ran there are only single items, which yield no rules.

In [None]:
# Run the miner: 20% minimum support, 60% minimum confidence, no lift floor.
main(minsup=0.2, minconf=0.6, minlift=0, csv='supermarket.csv')