In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules, apriori
from collections import Counter

In [2]:
# Load the grocery transactions; the file is read with a single named column, "products".
# NOTE(review): absolute, machine-specific path — consider a configurable relative path.
csv_path = "C:/Users/ASUS/Downloads/GroceryStoreDataSet.csv"
df = pd.read_csv(csv_path, sep=",", names=["products"])
df.head()

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"


### 1) Prepare the data to run apriori algorithm

In [3]:
# Split each comma-separated "products" string into a list of item names,
# giving one list per transaction.
data = [basket.split(",") for basket in df["products"].tolist()]
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

In [4]:
# Fit the encoder on the transactions, then encode them as a boolean array
# whose columns follow the item order stored in te.columns_.
te = TransactionEncoder()
te.fit(data)
te_data = te.transform(data)

display(te_data)

array([[ True, False,  True, False, False, False, False, False,  True,
        False, False],
       [ True, False,  True, False, False,  True, False, False,  True,
        False, False],
       [False,  True,  True, False, False, False, False, False, False,
        False,  True],
       [False, False,  True, False, False, False,  True,  True,  True,
        False, False],
       [ True, False, False, False, False, False, False,  True, False,
        False,  True],
       [False,  True,  True, False, False, False, False, False, False,
        False,  True],
       [False, False, False, False, False,  True, False,  True, False,
        False,  True],
       [ True, False,  True, False, False, False, False,  True, False,
        False,  True],
       [False, False,  True, False, False, False,  True,  True, False,
        False,  True],
       [False, False,  True, False, False, False, False, False,  True,
        False, False],
       [ True, False, False,  True,  True,  True, False, Fal

In [5]:
# Label the encoded array with the item names learned by the encoder.
item_names = te.columns_
data_x = pd.DataFrame(data=te_data, columns=item_names)

display(data_x.head())

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,True,False,True,False,False,False,False,False,True,False,False
1,True,False,True,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,False,True,False,False,True


### 2) Select frequent itemsets with min support of 10%

In [6]:
# Mine the itemsets whose support is at least 10%, reported by item name
# (use_colnames=True) rather than by column index.
frequent_items = apriori(
    data_x,
    min_support=0.1,
    use_colnames=True,
)
frequent_items

Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.15,(COCK)
4,0.4,(COFFEE)
5,0.3,(CORNFLAKES)
6,0.1,(JAM)
7,0.25,(MAGGI)
8,0.25,(MILK)
9,0.3,(SUGER)


### 3) Find out which item (not itemset) is the most frequent in the frequent itemset and how many times it appears

In [7]:
# Keep only the frequent itemsets that consist of a single item.
ones = [itemset for itemset in frequent_items["itemsets"] if len(itemset) == 1]
ones

[frozenset({'BISCUIT'}),
 frozenset({'BOURNVITA'}),
 frozenset({'BREAD'}),
 frozenset({'COCK'}),
 frozenset({'COFFEE'}),
 frozenset({'CORNFLAKES'}),
 frozenset({'JAM'}),
 frozenset({'MAGGI'}),
 frozenset({'MILK'}),
 frozenset({'SUGER'}),
 frozenset({'TEA'})]

In [8]:
# Rows of frequent_items whose itemset is one of the single-item sets.
is_single_item = frequent_items["itemsets"].isin(ones)
new = frequent_items[is_single_item]
new

Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.15,(COCK)
4,0.4,(COFFEE)
5,0.3,(CORNFLAKES)
6,0.1,(JAM)
7,0.25,(MAGGI)
8,0.25,(MILK)
9,0.3,(SUGER)


#### Counts follow from the definition of support:
    Support = (Number of transactions in which X appears) / (Total number of transactions)
    Count   = Support * (Total number of transactions)

In [9]:
# Support = (transactions containing the item) / (total transactions), so
# multiplying support by the number of transactions recovers the raw count.
# The total is taken from the encoded frame (one row per transaction) instead
# of a hard-coded 20, and the result is rounded to an integer count.
n_transactions = len(data_x)
item_counts = (new["support"] * n_transactions).round().astype(int)

# Use a fresh name for the column list: rebinding `data` here would clobber the
# transaction list that the encoding cell reads, breaking a re-run of that cell.
count_columns = [new["itemsets"], item_counts]
headers = ["itemsets", "counts"]
final = pd.concat(count_columns, axis=1, keys=headers)
final.sort_values("counts", ascending=False)

Unnamed: 0,itemsets,counts
2,(BREAD),13.0
4,(COFFEE),8.0
0,(BISCUIT),7.0
10,(TEA),7.0
5,(CORNFLAKES),6.0
9,(SUGER),6.0
7,(MAGGI),5.0
8,(MILK),5.0
1,(BOURNVITA),4.0
3,(COCK),3.0


So, the answer is BREAD: it appears in 13 of the 20 transactions.

### 4) Print association rules using already obtained frequent itemsets filtered by confidence above 70%

In [10]:
# Derive association rules from the frequent itemsets, keeping those whose
# confidence is at least 0.7.
rules = association_rules(
    frequent_items,
    metric="confidence",
    min_threshold=0.7,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BOURNVITA),(BREAD),0.2,0.65,0.15,0.75,1.153846,0.02,1.4
1,(JAM),(BREAD),0.1,0.65,0.1,1.0,1.538462,0.035,inf
2,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
3,(COCK),(COFFEE),0.15,0.4,0.15,1.0,2.5,0.09,inf
4,(JAM),(MAGGI),0.1,0.25,0.1,1.0,4.0,0.075,inf
5,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
6,"(BISCUIT, MILK)",(BREAD),0.1,0.65,0.1,1.0,1.538462,0.035,inf
7,"(COCK, BISCUIT)",(COFFEE),0.1,0.4,0.1,1.0,2.5,0.06,inf
8,"(BISCUIT, COFFEE)",(COCK),0.1,0.15,0.1,1.0,6.666667,0.085,inf
9,"(COCK, BISCUIT)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf


### 5) Print the rule that has lowest confidence 

In [11]:
# Rule(s) with the lowest confidence. Series.min() is vectorized and, unlike the
# built-in min(), does not raise on an empty rules frame (the result is then empty).
lowest_confidence = rules["confidence"].min()
rules[rules["confidence"] == lowest_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BOURNVITA),(BREAD),0.2,0.65,0.15,0.75,1.153846,0.02,1.4


### 6) Print the rules which have lift > 3 and consequent support > 20%

In [12]:
# Rules with lift above 3 whose consequent support is at least 0.2 (20%).
# NOTE(review): the consequent-support bound is inclusive (>=) — confirm that is intended.
strong_lift = rules["lift"] > 3
popular_consequent = rules["consequent support"] >= 0.2
rules[strong_lift & popular_consequent]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(JAM),(MAGGI),0.1,0.25,0.1,1.0,4.0,0.075,inf
9,"(COCK, BISCUIT)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf
11,"(BISCUIT, COFFEE)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf
12,"(TEA, BISCUIT)",(MAGGI),0.1,0.25,0.1,1.0,4.0,0.075,inf
15,"(BREAD, JAM)",(MAGGI),0.1,0.25,0.1,1.0,4.0,0.075,inf
19,"(COCK, BISCUIT, COFFEE)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf
23,"(COCK, BISCUIT)","(COFFEE, CORNFLAKES)",0.1,0.2,0.1,1.0,5.0,0.08,inf


### 7) Print the rules whose antecedent itemset contains coffee

In [13]:
# Rules whose antecedent itemset contains the item "coffee" (case-insensitive).
# Membership is tested item by item rather than by substring-matching the
# stringified frozenset, which would also match any item name that merely
# contains "coffee".
has_coffee = rules["antecedents"].map(
    lambda items: any(item.lower() == "coffee" for item in items)
).astype(bool)  # keep a boolean mask even when there are no rules
rules[has_coffee]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,"(BISCUIT, COFFEE)",(COCK),0.1,0.15,0.1,1.0,6.666667,0.085,inf
11,"(BISCUIT, COFFEE)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf
19,"(COCK, BISCUIT, COFFEE)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf
21,"(COCK, COFFEE, CORNFLAKES)",(BISCUIT),0.1,0.35,0.1,1.0,2.857143,0.065,inf
22,"(BISCUIT, COFFEE, CORNFLAKES)",(COCK),0.1,0.15,0.1,1.0,6.666667,0.085,inf
25,"(BISCUIT, COFFEE)","(COCK, CORNFLAKES)",0.1,0.1,0.1,1.0,10.0,0.09,inf


### 8) Please write down your suggestions for the business based on the results that you have. State as well the metrics, which helped you to construct your suggestions. 

* Bread is the best-selling item in this shop (it appears in 13 of 20 purchases). Coffee is second (8 of 20).
* Bread has association rules with the following items and itemsets: BOURNVITA, JAM, MILK, (BISCUIT, MILK), (TEA, BOURNVITA), (MAGGI, JAM) - see Appendix 1.
* MAGGI and CORNFLAKES are the items most strongly associated with other items and itemsets (based on lift and consequent support). This means they should always be surrounded by those other items for better sales.
* Coffee shows an association with cock, cornflakes and biscuit. Placing these items close to the coffee ordering counter can be a good strategy to attract customers to buy them.
* Based on the confidence value, BOURNVITA has the lowest association (probability of being bought with other items). Still, it is associated with bread sales (lift > 1). The shop should either promote it with discounts to strengthen its association with bread, or simply stop selling this product.

#### Appendix 1. Some helper queries.

In [14]:
# Rules whose lift exceeds 1.
rules.loc[rules["lift"] > 1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BOURNVITA),(BREAD),0.2,0.65,0.15,0.75,1.153846,0.02,1.4
1,(JAM),(BREAD),0.1,0.65,0.1,1.0,1.538462,0.035,inf
2,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
3,(COCK),(COFFEE),0.15,0.4,0.15,1.0,2.5,0.09,inf
4,(JAM),(MAGGI),0.1,0.25,0.1,1.0,4.0,0.075,inf
5,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
6,"(BISCUIT, MILK)",(BREAD),0.1,0.65,0.1,1.0,1.538462,0.035,inf
7,"(COCK, BISCUIT)",(COFFEE),0.1,0.4,0.1,1.0,2.5,0.06,inf
8,"(BISCUIT, COFFEE)",(COCK),0.1,0.15,0.1,1.0,6.666667,0.085,inf
9,"(COCK, BISCUIT)",(CORNFLAKES),0.1,0.3,0.1,1.0,3.333333,0.07,inf


**Lift** – To overcome the limitation of the confidence measure, lift adjusts the confidence for the popularity of both items. In mathematical terms, lift is defined as:

Lift({X} => {Y}) = Confidence({X} => {Y}) / Support(Y)

If the lift is greater than 1, Y is more likely to be bought when X is bought, while a value less than 1 indicates that Y is less likely to be bought when X is bought. A lift value near 1 indicates that X and Y appear together about as often as would be expected if they were independent, i.e. there is no association between them.
