# Imports

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import itertools

# Load Data

In [2]:
# The file has no header row; every line is read into a single "products"
# column holding one comma-separated basket string.
df = pd.read_csv(
    "../datasets/GroceryStoreDataSet.csv",
    names=['products'],
)
df.head()

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"


# Prepare Data for Apriori Algorithm

In [3]:
# Split every basket string into its list of product names.
transactions = [basket.split(',') for basket in df["products"]]

# One-hot encode the baskets: one row per basket, one 0/1 column per product.
# TransactionEncoder is already imported in the imports cell at the top.
te = TransactionEncoder()
te_data = te.fit(transactions).transform(transactions)

# NOTE: `df` is rebound here from the raw baskets to the encoded table, so this
# cell reads df["products"] only on its first run; re-run the load cell before
# executing it again.
df = pd.DataFrame(te_data, columns=te.columns_).astype(int)

df

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,1,1,1,0,0
4,1,0,0,0,0,0,0,1,0,0,1
5,0,1,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0,1,0,0,1
7,1,0,1,0,0,0,0,1,0,0,1
8,0,0,1,0,0,0,1,1,0,0,1
9,0,0,1,0,0,0,0,0,1,0,0


# Algorithm Step by Step (own implementation)
## First Iteration
1. Find support values for each product. We divide item frequency by row number.
2. Filter out frequency lower than 20%

In [4]:
# Support of a single product = number of baskets containing it / number of baskets.
item_support = df.sum() / df.shape[0]
first = pd.DataFrame(item_support, columns=["Support"])
first = first.sort_values(by="Support", ascending=False)

# Keep only the products that appear in at least 20% of the baskets.
first = first[first["Support"] >= 0.20]
first

Unnamed: 0,Support
BREAD,0.65
COFFEE,0.4
BISCUIT,0.35
TEA,0.35
CORNFLAKES,0.3
SUGER,0.3
MAGGI,0.25
MILK,0.25
BOURNVITA,0.2


## Second Iteration
1. Find every possible product pair
2. Calculate their support value
3. Keep only the pairs whose support is greater than 10%

In [5]:
# Candidate pairs: every 2-combination of the products that survived the
# first iteration, stored as lists so they can be used as column selectors.
second = [list(pair) for pair in itertools.combinations(first.index, 2)]
second[:10]

[['BREAD', 'COFFEE'],
 ['BREAD', 'BISCUIT'],
 ['BREAD', 'TEA'],
 ['BREAD', 'CORNFLAKES'],
 ['BREAD', 'SUGER'],
 ['BREAD', 'MAGGI'],
 ['BREAD', 'MILK'],
 ['BREAD', 'BOURNVITA'],
 ['BREAD', 'COCK'],
 ['BREAD', 'JAM']]

In [6]:
# Finding support values
# Support of a pair = share of baskets that contain *every* product of the pair.
value = []
for pair in second:
    # Row-wise sum over the pair's columns equals the number of products in
    # the pair only when the basket contains all of them.
    contains_all = df[pair].sum(axis=1) == len(pair)
    value.append(int(contains_all.sum()) / df.shape[0])

secondIteration = pd.DataFrame(value, columns = ["Support"])
secondIteration["index"] = [tuple(pair) for pair in second]
secondIteration['length'] = secondIteration['index'].map(len)
secondIteration = secondIteration.set_index("index").sort_values("Support", ascending = False)

# Elimination by support value: the comparison is strict, so a pair with
# exactly 10% support is dropped as well.
secondIteration = secondIteration[secondIteration.Support > 0.1]
secondIteration

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(TEA, MAGGI)",0.2,2
"(BREAD, TEA)",0.2,2
"(BREAD, SUGER)",0.2,2
"(BREAD, MILK)",0.2,2
"(BREAD, BISCUIT)",0.2,2
"(COFFEE, CORNFLAKES)",0.2,2
"(COFFEE, SUGER)",0.2,2
"(BREAD, COFFEE)",0.15,2
"(BREAD, MAGGI)",0.15,2
"(BREAD, BOURNVITA)",0.15,2


## Apriori Iterations Function
Function that generalizes the algorithm to n iterations

In [7]:
def ar_iterations(data, num_iter = 1, support_value = 0.1, iterationIndex = None):
    """Evaluate one level of a hand-rolled Apriori search.

    Parameters
    ----------
    data : pandas.DataFrame
        Basket table with one row per transaction and one column per item.
        The support computation assumes 0/1 encoded cells (an itemset counts
        as present in a row when the row-wise sum over its columns equals the
        number of items).
    num_iter : int, default 1
        Size of the itemsets to score: 1 for single items, 2 for pairs,
        anything else for ``num_iter``-sized itemsets.
    support_value : float, default 0.1
        Support threshold. The comparison is strict: only itemsets whose
        support is greater than this value are returned.
    iterationIndex : iterable of tuples or lists, optional
        Itemsets kept by the previous level (for example the index of the
        previous result). Only read when ``num_iter`` is neither 1 nor 2, in
        which case it is required: the candidates are all ``num_iter``-sized
        combinations of the items that occur in it.

    Returns
    -------
    pandas.DataFrame
        Columns ``Support`` and ``length``, sorted by ``Support`` in
        descending order. The index holds item labels for ``num_iter == 1``
        and item tuples otherwise.

    Raises
    ------
    TypeError
        If ``num_iter`` is neither 1 nor 2 and ``iterationIndex`` is missing.
    """
    n_rows = data.shape[0]

    def support_table(candidates):
        """Score every candidate itemset and drop those not above the threshold."""
        supports = []
        for itemset in candidates:
            # A row contains the itemset when all of its columns are set.
            hits = data[itemset].sum(axis=1) == len(itemset)
            supports.append(int(hits.sum()) / n_rows)
        # Bind results
        table = pd.DataFrame(supports, columns = ["Support"])
        table["index"] = [tuple(itemset) for itemset in candidates]
        table["length"] = table["index"].map(len)
        table = table.set_index("index").sort_values("Support", ascending = False)
        # Elimination by support value (strict comparison)
        return table[table.Support > support_value]

    # First iteration: support of every single item, computed from `data`
    # (the argument), not from any notebook-level frame.
    first = (data.sum() / n_rows).to_frame(name = "Support")
    first = first.sort_values("Support", ascending = False)
    # .assign returns a new frame, which avoids writing into a filtered slice.
    first = first[first.Support > support_value].assign(length = 1)

    if num_iter == 1:
        return first

    # Second iteration: all pairs of the frequent single items.
    if num_iter == 2:
        pairs = [list(pair) for pair in itertools.combinations(first.index, 2)]
        return support_table(pairs)

    # All other iterations: combine the items seen in the previous level.
    if iterationIndex is None:
        raise TypeError("iterationIndex is required when num_iter is greater than 2")
    # dict.fromkeys de-duplicates while keeping first-seen order, so the item
    # order inside each candidate is the same on every run.
    items = list(dict.fromkeys(itertools.chain.from_iterable(iterationIndex)))
    candidates = [list(combo) for combo in itertools.combinations(items, num_iter)]
    return support_table(candidates)

# Association Rules

## Apriori

In [8]:
# Frequent itemsets from mlxtend's Apriori on the encoded basket table,
# using a minimum support of 0.1 and product names instead of column indices.
freq_items = apriori(
    df,
    min_support=0.1,
    use_colnames=True,
    verbose=1,
)
freq_items.sort_values(by="support", ascending=False)

Processing 108 combinations | Sampling itemset size 4


Unnamed: 0,support,itemsets
2,0.65,(BREAD)
4,0.4,(COFFEE)
0,0.35,(BISCUIT)
10,0.35,(TEA)
5,0.3,(CORNFLAKES)
9,0.3,(SUGER)
7,0.25,(MAGGI)
8,0.25,(MILK)
30,0.2,"(COFFEE, SUGER)"
34,0.2,"(TEA, MAGGI)"


## Association Rules
- Antecedent support is the probability of a transaction containing the antecedent products on their own.
- Consequent support is the probability of a transaction containing the consequent products on their own.
- Support is the share of transactions that contain both the antecedent and the consequent products.
- Confidence indicates how often the rule has been found to be true, i.e. how often the consequent appears in transactions that contain the antecedent.
- Lift is the ratio of the observed support to the support expected if the antecedent and the consequent were independent.

In [9]:
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.5.
df_ar = association_rules(freq_items, metric="confidence", min_threshold=0.5)

# Show only rules with support above 0.15 and confidence above 0.5,
# most confident first.
is_strong = (df_ar["support"] > 0.15) & (df_ar["confidence"] > 0.5)
df_ar[is_strong].sort_values(by="confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
19,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
9,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05
14,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
16,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
0,(BISCUIT),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667
10,(TEA),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667
18,(TEA),(MAGGI),0.35,0.25,0.2,0.571429,2.285714,0.1125,1.75
