In [0]:
# Import libraries and declare main arrays
%pylab inline

import numpy as np
import pandas as pd
import dataiku
from dataiku import pandasutils as pdu
from itertools import combinations

# Rule-quality metrics, keyed by display name. Every entry is called
# positionally as f(sAC, sA, sC) where
#   sAC = support of the whole rule (antecedent and consequent together)
#   sA  = support of the antecedent
#   sC  = support of the consequent
# The entries are used both with scalars (threshold checks) and with NumPy
# arrays (one value per rule).
metric_dict = {
    "antecedent support": lambda sAC, sA, sC: sA,
    # Must return its own third argument: reading a global named sC here would
    # fail on a fresh kernel and silently return stale data otherwise.
    "consequent support": lambda sAC, sA, sC: sC,
    "confidence": lambda sAC, sA, sC: sAC/sA,
    "rule support": lambda sAC, sA, sC: sAC,
    # absolute gap between the rule confidence and the consequent support
    "conf.difference": lambda sAC, sA, sC: abs(metric_dict["confidence"](sAC, sA, sC)-sC),
    # 1 - (smaller of confidence and consequent support) / (larger of the two)
    "conf.ratio": lambda sAC, sA, sC: 1-np.minimum(sAC/sA, sC)/np.maximum(sAC/sA, sC),
    # confidence divided by the consequent support
    "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC)/sC
    }

# Metric columns added to the rules table, in the order they are displayed.
# Each name is looked up in metric_dict, so it has to be one of its keys.
columns_ordered = [
    "antecedent support",
    "consequent support",
    "confidence",
    "rule support",
    "lift",
    "conf.difference",
    "conf.ratio",
]


In [0]:
# Apriori function
def apriori(df, min_support=0, use_colnames=False):
    """Find frequent itemsets with a level-wise (Apriori-style) search.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction. Column 0 is never treated as an item (item
        indices start at 1). Every other column is assumed to hold 0/1 flags
        telling whether the item is present in the transaction, because an
        itemset counts as present when the row sum over its columns equals
        the itemset size.
    min_support : float, default 0
        Minimum fraction of rows an itemset must appear in to be kept.
    use_colnames : bool, default False
        If True, itemsets are returned as lists of column names instead of
        arrays of column indices.

    Returns
    -------
    pandas.DataFrame
        Columns 'support' and 'itemsets', one row per frequent itemset,
        ordered by increasing itemset size, with a fresh 0..n-1 index.
    """
    X = df.values
    n_rows = float(X.shape[0])

    # Column 0 is excluded by slicing rather than by filtering on
    # "support < 1": that filter also removed any item present in every row,
    # which left the boolean mask shorter than the index array.
    item_idx = np.arange(1, X.shape[1])
    item_support = np.sum(X[:, 1:], axis=0) / n_rows
    is_frequent = item_support >= min_support
    support_dict = {1: item_support[is_frequent]}
    itemset_dict = {1: item_idx[is_frequent].reshape(-1, 1)}

    max_itemset = 1

    # Grow the itemsets one item at a time until a level yields nothing.
    while max_itemset:
        next_max_itemset = max_itemset + 1
        # Candidates are all combinations of the items that still appear in
        # at least one frequent itemset of the current size.
        combin = combinations(np.unique(itemset_dict[max_itemset].flatten()),
                              r=next_max_itemset)
        frequent_items = []
        frequent_items_support = []

        for c in combin:
            # a row contains the itemset when all of its flags are set
            together = X[:, list(c)].sum(axis=1) == len(c)
            support = together.sum() / n_rows
            if support >= min_support:
                frequent_items.append(c)
                frequent_items_support.append(support)

        if frequent_items:
            itemset_dict[next_max_itemset] = np.array(frequent_items)
            support_dict[next_max_itemset] = np.array(frequent_items_support)
            max_itemset = next_max_itemset
        else:
            max_itemset = 0

    # Stack the per-size results into one table.
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([i for i in itemset_dict[k]])
        all_res.append(pd.concat((support, itemsets), axis=1))

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        # translate column indices back to the column names of df
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: [mapping[i]
                                                      for i in x])
    res_df = res_df.reset_index(drop=True)
    return res_df



In [0]:
# Example: load a DSS dataset as a Pandas dataframe (change the dataset name if on a different project)
# NOTE(review): apriori() skips column 0 and sums every other column, so the table
# presumably has an ID first column followed by 0/1 item flags -- confirm against the dataset.
mydataset = dataiku.Dataset("Shopping_tabular")
df = mydataset.get_dataframe()
# preview the first rows as the cell output
df.head()

In [0]:
# Script parameters -- edit the values in this cell, then re-run the cells below.
metric = "lift"          # metric_dict key compared against min_threshold
min_threshold = 1        # a rule is kept only if the chosen metric is >= this value
min_confidence = 0.75    # minimum confidence of a rule
min_rulesupport = 0      # minimum rule support, passed to apriori() (enter 0 to use antecedent support + other metric)
min_support = 0.1        # minimum antecedent support of a rule
max_antecedent = 5       # maximum number of items allowed in the antecedent

# call the apriori function first to get the frequent combinations of products (= itemsets);
# min_rulesupport is the support threshold and itemsets come back as lists of column names
freq = apriori(df, min_rulesupport, use_colnames=True)

keys = freq['itemsets'].values
values = freq['support'].values
# otypes is given explicitly: without it np.vectorize has to call the function
# on the first element to guess the output type and raises ValueError when no
# itemset was found (empty input).
frozenset_vect = np.vectorize(lambda x: frozenset(x), otypes=[object])
# lookup table: frozenset of item names -> support of that itemset
frequent_items_dict = dict(zip(frozenset_vect(keys), values))


In [0]:
# prepare buckets to collect the rules that pass every threshold
rule_antecedents = []
rule_consequent = []
rule_supports = []

# nsize will store total number of tickets (rows of df)
nsize = len(df.index)

# iterate over all frequent itemsets; sAC is the support of the whole itemset
for k, sAC in frequent_items_dict.items():
    # Only rules with a single-item consequent are kept, so the antecedent
    # always holds len(k)-1 items and smaller antecedents need not be enumerated.
    n_antecedent = len(k) - 1
    if n_antecedent < 1 or n_antecedent > max_antecedent:
        continue

    for c in combinations(k, r=n_antecedent):
        antecedent = frozenset(c)
        consequent = k.difference(antecedent)
        sA = frequent_items_dict[antecedent]
        sC = frequent_items_dict[consequent]

        # check thresholds: chosen metric, antecedent support, confidence
        if (metric_dict[metric](sAC, sA, sC) >= min_threshold
                and metric_dict['antecedent support'](sAC, sA, sC) >= min_support
                and metric_dict['confidence'](sAC, sA, sC) >= min_confidence):
            # number of tickets that contain the antecedent
            instances = sA * nsize
            # comma-separated labels, built with join rather than by slicing
            # the repr of a list, so any column name is rendered correctly
            rule_antecedents.append(",".join(str(item) for item in c))
            rule_consequent.append(",".join(str(item) for item in consequent))
            rule_supports.append([len(antecedent), len(consequent), instances, sAC, sA, sC])


In [0]:
# if rules were found, generate metrics from rule_supports array
if len(rule_supports)==0:
    print ("EMPTY: No rules were found with the criteria specified.")
    # define an empty table with the usual columns so that later cells neither
    # fail nor show a table left over from a previous run
    dfrules = pd.DataFrame(columns=["antecedents", "consequent", "instances"]
                           + columns_ordered + ["#antec", "#conseq", "ruleid"])
else:
    # Transpose into a separate array (one row per quantity, one column per
    # rule). rule_supports itself is left untouched so the cell can be re-run.
    rule_support_rows = np.array(rule_supports, dtype=float).T
    numbantec, numbconseq, instances, sAC, sA, sC = rule_support_rows

    # create the final pandas dataframe: dfrules, to store rule list
    dfrules = pd.DataFrame({"antecedents": rule_antecedents,
                            "consequent": rule_consequent,
                            "instances": instances},
                           columns=["antecedents", "consequent", "instances"])

    # add other metrics (computed for all rules at once)
    for m in columns_ordered:
        dfrules[m] = metric_dict[m](sAC, sA, sC)

    # add number of antecedents and number of consequents
    dfrules["#antec"] = numbantec
    dfrules["#conseq"] = numbconseq

    # sort by one metric
    dfrules = dfrules.sort_values('confidence', ascending=False)

    # assign rule number: the index still holds the pre-sort position, so
    # ruleid is the 1-based discovery order, not the rank in the sorted table
    dfrules['ruleid'] = dfrules.index + 1

    print (str(len(dfrules)) + " rules found")

In [0]:
# show rules: the bare expression is rendered as the output of this cell
dfrules

## dfrules