# Apriori Algorithm in Association Rule Learning

In [8]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import re

In [2]:
# Load the raw retail transactions export using pandas' default (comma) parsing.
# The rows themselves appear to be semicolon-separated, so each record is read
# as a single string column.
transactions_csv = 'Retail Transactions Dataset-4.csv'
df = pd.read_csv(transactions_csv)
df.head()

Unnamed: 0,"Invoice;""product 1"";""Orders"";""Sales value"""
0,"131506;""Product 20"";1;40"
1,"131506;""Product 21"";1;80"
2,"131507;""Product 11"";1;80"
3,"131508;""Product 19"";1;32"
4,"131509;""Product 31"";1;9"


In [7]:
# Count how many columns pandas produced when it parsed the file.
len(df.columns)

1

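The DataFrame has only one column, which means that all fields were read into a single column — both the header and the data rows — because the values are separated by semicolons rather than commas. We therefore need to split each row into separate columns.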

In [11]:
# Each raw record is one string shaped like:  131506;"Product 20";1;40
# i.e. invoice;"product";orders;sales value.
# The pattern is compiled once, as a raw string so that \d reaches the regex
# engine untouched instead of being treated as a string escape.
RECORD_PATTERN = re.compile(r'(\d*);"(.*)";(.*);(.*)')

Invoice = []
Product = []
Orders = []
Sales_value = []

for position, row in enumerate(df.iloc[:, 0]):
    # Match each row a single time and unpack all four groups together.
    match = RECORD_PATTERN.search(row)
    if match is None:
        # Fail with the offending row instead of an opaque AttributeError on None.
        raise ValueError(
            f"Row {position} does not match the expected "
            f"invoice;\"product\";orders;value layout: {row!r}"
        )
    invoice, product, orders, value = (field.lower() for field in match.groups())

    Invoice.append(invoice)
    Product.append(product)
    Orders.append(orders)
    Sales_value.append(value)

# Named `columns` rather than `dict` so the builtin is not shadowed for the
# rest of the notebook session.
columns = {'Invoice': Invoice, 'Product_ID': Product, 'Orders': Orders, 'Sales_value': Sales_value}

df_clean = pd.DataFrame(columns)
df_clean.head()


Unnamed: 0,Invoice,Product_ID,Orders,Sales_value
0,131506,product 20,1,40
1,131506,product 21,1,80
2,131507,product 11,1,80
3,131508,product 19,1,32
4,131509,product 31,1,9


In [15]:
# Inspect the dtype of every column of df_clean.
df_clean.dtypes

Invoice        object
Product_ID     object
Orders         object
Sales_value    object
dtype: object

`Orders` should be numeric, so we convert its data type to `int`.

In [25]:
# Convert the Orders column from strings to integers; other columns are left as they are.
df_clean = df_clean.astype({'Orders': int})

In [27]:
# List the distinct values present in the Orders column.
df_clean['Orders'].unique()

array([1, 2])

Next, we need to consolidate the items so that each row represents one transaction (invoice) and each column represents one product.

In [28]:
# Total quantity ordered for every (invoice, product) pair.
orders_per_item = df_clean.groupby(['Invoice', 'Product_ID'])['Orders'].sum()

# Spread products across the columns: one row per invoice, one column per product.
basket_wide = orders_per_item.unstack()

# Products absent from an invoice become 0; Invoice stays the row index.
df_t = basket_wide.reset_index().fillna(0).set_index('Invoice')
df_t

Product_ID,product 1,product 10,product 11,product 12,product 13,product 14,product 15,product 16,product 17,product 18,...,product 22,product 23,product 24,product 25,product 26,product 27,product 28,product 29,product 30,product 31
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1306797,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306799,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306800,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306824,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1306825,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647987,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
647988,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
647989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
647990,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We need to make sure that any positive value is converted to 1 and that anything less than or equal to 0 is set to 0.

In [29]:
def encode_units(x):
    """Collapse a purchased quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Quantity of one product on one invoice.

    Returns
    -------
    int
        1 when ``x`` is strictly positive, otherwise 0.
    """
    # A single comparison covers every input. The earlier pair of checks
    # (x <= 0, x >= 1) fell through and returned None for values strictly
    # between 0 and 1 and for NaN.
    return 1 if x > 0 else 0
# Binarise every cell with encode_units, one column at a time. Series.map is
# used instead of DataFrame.applymap, which is deprecated since pandas 2.1;
# the element-wise result is the same.
df_t = df_t.apply(lambda column: column.map(encode_units))
df_t

Product_ID,product 1,product 10,product 11,product 12,product 13,product 14,product 15,product 16,product 17,product 18,...,product 22,product 23,product 24,product 25,product 26,product 27,product 28,product 29,product 30,product 31
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1306797,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1306799,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1306800,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1306824,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1306825,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647987,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
647988,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
647989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
647990,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


• Generate frequent itemsets that have a support value of at least 5%.

• Generate the rules whose confidence is at least 20%.

In [33]:
# Thresholds used below: minimum support for an itemset and minimum
# confidence for a rule.
MIN_SUPPORT = 0.05
MIN_CONFIDENCE = 0.2

frequent_itemsets = apriori(df_t, min_support=MIN_SUPPORT, use_colnames=True)
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(product 20),(product 11),0.188832,0.151269,0.069036,0.365591,2.416829,0.040471,1.33783
1,(product 11),(product 20),0.151269,0.188832,0.069036,0.456376,2.416829,0.040471,1.492148
2,(product 15),(product 12),0.11269,0.253807,0.093401,0.828829,3.265586,0.064799,4.359337
3,(product 12),(product 15),0.253807,0.11269,0.093401,0.368,3.265586,0.064799,1.403971
4,(product 20),(product 12),0.188832,0.253807,0.052792,0.27957,1.101505,0.004865,1.03576
5,(product 12),(product 20),0.253807,0.188832,0.052792,0.208,1.101505,0.004865,1.024201


Based on the result above, we can conclude that:

1. Products 15 and 12 should be placed together. A customer who purchases product 15 is more likely to also buy product 12 (confidence ≈ 0.83) than a customer who purchases product 12 is to also buy product 15 (confidence ≈ 0.37).
2. Products 20 and 11 should be placed together. A customer who purchases product 11 is more likely to also buy product 20 (confidence ≈ 0.46) than a customer who purchases product 20 is to also buy product 11 (confidence ≈ 0.37).
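3. Products 20 and 12 should be placed together. A customer who purchases product 20 is more likely to also buy product 12 (confidence ≈ 0.28) than a customer who purchases product 12 is to also buy product 20 (confidence ≈ 0.21). Note that the lift for this pair is only about 1.10, so this association is much weaker than the other two.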
4. Of the three product combinations above, the association between products 15 and 12 is the strongest (lift ≈ 3.27).

Four possible use cases of association rule mining:

1. This method could be used to optimize large-scale logistics operations.
2. It is also used in designing routes for airplanes with layovers.
3. It could be used to design large-scale supermarkets based on customer buying patterns.
4. It could also be used in online recommendation systems.