##Apriori Algorithm

Apriori algorithm used frequent itemsets to generate association rules. It is based on the concept that a subset of frequent items must also be frequent item.

*   frequent itemset: itemset whose support value is greater than a treshold value
*   treshold vaue: every marketing team in an organization have minimum treshold value for their products.

There are 3 ways to measure association which are;


*   Support: to determine the frequency of item that have been bought. it can be used to filter out items that have been bought less frequent
*   Confidence: to determine how often item A & B are bought together given A occur
*   Lift: to determine strenght of rules. If lift is more than 1, it determines that item B is more likely to be bought if item A is bought.




In [1]:
import pandas as pd
import numpy as np
import os
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import mlxtend as ml

In [2]:
##read data
dt=pd.read_csv('/content/drive/MyDrive/LinkedIn/Learning_Data_Analytics/Market Basket/basket_analysis.csv')

#check total records
num_record=len(dt)
print(num_record)

999


In [3]:
dt.head()

Unnamed: 0.1,Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [4]:
#drop unrelated column
dt1=dt.drop('Unnamed: 0', axis=1)

In [5]:
dt1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Apple         999 non-null    bool 
 1   Bread         999 non-null    bool 
 2   Butter        999 non-null    bool 
 3   Cheese        999 non-null    bool 
 4   Corn          999 non-null    bool 
 5   Dill          999 non-null    bool 
 6   Eggs          999 non-null    bool 
 7   Ice cream     999 non-null    bool 
 8   Kidney Beans  999 non-null    bool 
 9   Milk          999 non-null    bool 
 10  Nutmeg        999 non-null    bool 
 11  Onion         999 non-null    bool 
 12  Sugar         999 non-null    bool 
 13  Unicorn       999 non-null    bool 
 14  Yogurt        999 non-null    bool 
 15  chocolate     999 non-null    bool 
dtypes: bool(16)
memory usage: 15.7 KB


In [6]:
##check missing value
dt1.isnull().sum()

Apple           0
Bread           0
Butter          0
Cheese          0
Corn            0
Dill            0
Eggs            0
Ice cream       0
Kidney Beans    0
Milk            0
Nutmeg          0
Onion           0
Sugar           0
Unicorn         0
Yogurt          0
chocolate       0
dtype: int64

In [7]:
#check support value
apriori(dt1, min_support=0.15)

Unnamed: 0,support,itemsets
0,0.383383,(0)
1,0.384384,(1)
2,0.420420,(2)
3,0.404404,(3)
4,0.407407,(4)
...,...,...
131,0.191191,"(12, 14)"
132,0.188188,"(12, 15)"
133,0.184184,"(13, 14)"
134,0.186186,"(13, 15)"


In [8]:
#check number of rules
print('Number of rules with 0.15 minimum support:',len(apriori(dt1, min_support=0.15)))

Number of rules with 0.15 minimum support: 136


In [9]:
frequent_itemset=apriori(dt1, min_support=0.15, use_colnames=True)
frequent_itemset['length'] = frequent_itemset['itemsets'].apply(lambda x: len(x))
frequent_itemset
print(frequent_itemset)

      support              itemsets  length
0    0.383383               (Apple)       1
1    0.384384               (Bread)       1
2    0.420420              (Butter)       1
3    0.404404              (Cheese)       1
4    0.407407                (Corn)       1
..        ...                   ...     ...
131  0.191191       (Sugar, Yogurt)       2
132  0.188188    (chocolate, Sugar)       2
133  0.184184     (Unicorn, Yogurt)       2
134  0.186186  (Unicorn, chocolate)       2
135  0.198198   (chocolate, Yogurt)       2

[136 rows x 3 columns]


In [10]:
#check top 25 frequest item bought by customers
frequent_itemset.sort_values('support', ascending=False)[1:26]

Unnamed: 0,support,itemsets,length
2,0.42042,(Butter),1
14,0.42042,(Yogurt),1
7,0.41041,(Ice cream),1
12,0.409409,(Sugar),1
8,0.408408,(Kidney Beans),1
4,0.407407,(Corn),1
9,0.405405,(Milk),1
3,0.404404,(Cheese),1
11,0.403403,(Onion),1
10,0.401401,(Nutmeg),1


In [11]:
#filter the length of item to more than and equal to 2 with support more than and equal to 0.15
frequent_itemset[(frequent_itemset['length'] >= 2) &
                   (frequent_itemset['support'] >= 0.15)]

Unnamed: 0,support,itemsets,length
16,0.154154,"(Bread, Apple)",2
17,0.188188,"(Butter, Apple)",2
18,0.162162,"(Cheese, Apple)",2
19,0.186186,"(Apple, Corn)",2
20,0.179179,"(Dill, Apple)",2
...,...,...,...
131,0.191191,"(Sugar, Yogurt)",2
132,0.188188,"(chocolate, Sugar)",2
133,0.184184,"(Unicorn, Yogurt)",2
134,0.186186,"(Unicorn, chocolate)",2


In [12]:
# filter item by Association rules
rules1 = association_rules(frequent_itemset, metric="confidence", min_threshold=0.30)

In [13]:
print('Number of rules created with 0.15 supports and 0.30 minimum tresholds:', len(rules1))

Number of rules created with 0.15 supports and 0.30 minimum tresholds: 240


In [14]:
#Display top 10 confidence of itemsets
rules1_top10=rules1.sort_values('confidence', ascending=False)[1:11]

In [15]:
print(rules1_1)

        antecedents     consequents  ...  leverage  conviction
66      (Ice cream)        (Butter)  ...  0.034662    1.170579
54          (Bread)        (Yogurt)  ...  0.031590    1.165228
208     (chocolate)          (Milk)  ...  0.040365    1.192021
148          (Dill)     (chocolate)  ...  0.031306    1.157157
69   (Kidney Beans)        (Butter)  ...  0.030499    1.147905
92         (Cheese)  (Kidney Beans)  ...  0.035038    1.171583
73         (Nutmeg)        (Butter)  ...  0.029441    1.144884
67         (Butter)     (Ice cream)  ...  0.034662    1.162571
182     (Ice cream)     (chocolate)  ...  0.029246    1.140467
185          (Milk)  (Kidney Beans)  ...  0.033628    1.163081

[10 rows x 9 columns]


In [16]:
#download rules1
from google.colab import files
rules1.to_csv('rules1.csv')
files.download('rules1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Interpretation based on top 3 Confidence amount:


1.   The probability of customers to buy ice cream and butter together is 21% (support).The probability of customers to buy ice cream when they bought butter is 50% (confidence). Butter is 1.20 times to be bought when the customers bought ice cream.
2.   The probability of customers to buy bread and yogurt together is 19% (support).The probability of customers to buy bread when they bought yogurt is 50% (confidence). Yogurt is 1.20 times to be bought when the customers bought bread.
3.   The probability of customers to buy chocolate and milk together is 21% (support).The probability of customers to buy chocolate when they bought milk is 50% (confidence). Milk is 1.24 times to be bought when the customers bought chocolate.





Depending on the purpose of the project, one can also use Lift as the main rules of the association rules.

In [17]:
rules2=association_rules(frequent_itemset,metric='lift', min_threshold=1)
print(rules2)

     antecedents  consequents  ...  leverage  conviction
0        (Bread)      (Apple)  ...  0.006788    1.029482
1        (Apple)      (Bread)  ...  0.006788    1.029610
2       (Butter)      (Apple)  ...  0.027006    1.116289
3        (Apple)     (Butter)  ...  0.027006    1.138354
4       (Cheese)      (Apple)  ...  0.007120    1.029393
..           ...          ...  ...       ...         ...
233     (Yogurt)    (Unicorn)  ...  0.020477    1.086680
234    (Unicorn)  (chocolate)  ...  0.022089    1.108705
235  (chocolate)    (Unicorn)  ...  0.022089    1.093902
236  (chocolate)     (Yogurt)  ...  0.021024    1.094184
237     (Yogurt)  (chocolate)  ...  0.021024    1.094608

[238 rows x 9 columns]


In [18]:
#print top 10 lift
rules2_top10=rules2.sort_values('lift', ascending=False)[1:11]
print(rules2_1)

        antecedents     consequents  ...  leverage  conviction
206     (chocolate)          (Milk)  ...  0.040365    1.192021
93   (Kidney Beans)        (Cheese)  ...  0.035038    1.168284
92         (Cheese)  (Kidney Beans)  ...  0.035038    1.171583
208        (Nutmeg)         (Onion)  ...  0.033269    1.161336
209         (Onion)        (Nutmeg)  ...  0.033269    1.159785
183          (Milk)  (Kidney Beans)  ...  0.033628    1.163081
182  (Kidney Beans)          (Milk)  ...  0.033628    1.160740
66      (Ice cream)        (Butter)  ...  0.034662    1.170579
67         (Butter)     (Ice cream)  ...  0.034662    1.162571
140          (Dill)         (Onion)  ...  0.031477    1.152648

[10 rows x 9 columns]


In [19]:
rules2.to_csv('rules2.csv')
files.download('rules2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
##check top 5 item are oftenly bought by customers when they bought Butter by using rules 1
rules1[rules1['antecedents'] == {'Butter'}].sort_values(['confidence'], ascending=False)[1:6]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
68,(Butter),(Kidney Beans),0.42042,0.408408,0.202202,0.480952,1.177626,0.030499,1.139764
82,(Butter),(chocolate),0.42042,0.421421,0.202202,0.480952,1.141262,0.025028,1.114693
70,(Butter),(Milk),0.42042,0.405405,0.198198,0.471429,1.162857,0.027757,1.124909
72,(Butter),(Nutmeg),0.42042,0.401401,0.198198,0.471429,1.174457,0.029441,1.132484
74,(Butter),(Onion),0.42042,0.403403,0.197197,0.469048,1.162726,0.027598,1.123635
