In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# This project applies association rule mining to supermarket transaction data to find underlying patterns in what customers buy.
# The Apriori algorithm identifies relationships between items that are frequently bought together.

# https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python

In [None]:
!pip install apyori
from apyori import apriori

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5975 sha256=5bc3dba876f7a95ee637bae09a8e3bfa62a400058397beb6e6172cf5cdc41aa6
  Stored in directory: /root/.cache/pip/wheels/cb/f6/e1/57973c631d27efd1a2f375bd6a83b2a616c4021f24aab84080
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [None]:
# Expose Google Drive to the Colab runtime; the dataset is read from this mount point.
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive


In [None]:
# Dataset: every shopping list recorded by a grocery store over 7 days.
#   - 7500 shopping lists, one per row
#   - each shopping list holds at most 20 items

# Location of the CSV file on the mounted Google Drive
dataset_path = (
    "/content/gdrive/MyDrive/assets/Grocery_Shopping_Lists/store_data.csv"
)

In [None]:
# The CSV has no header row: every line, including the first, is one shopping list.
# header=None keeps pandas from consuming the first list as column names, which would
# silently drop one transaction and label the columns with its items.
df = pd.read_csv(dataset_path, header=None)
df.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


In [None]:
# Replace empty cells (NaN) with the sentinel 0 so they can be filtered out later.
# The result is rebound to `df` instead of using inplace=True, so the frame object
# produced by the loading cell is not mutated behind the reader's back.
df = df.fillna(0)
df.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
2,turkey,avocado,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
3,mineral water,milk,energy bar,whole wheat rice,green tea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,low fat yogurt,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0


In [None]:
# Data Pre-Processing

# Convert the pandas dataframe into a list of lists (one inner list per row), which is
# the transaction format passed to apriori. The outer list has as many entries as the
# dataframe has rows.

num_of_cols = len(df.columns)
num_of_rows = len(df.index)

# Materialise the cell values once. Accessing df.values inside the loops would rebuild
# the array for every single cell on a frame with mixed column dtypes.
cell_values = df.values

records = []

for i in range(num_of_rows):

    # Every cell is converted to a string; the fill sentinel shows up as '0' in text
    # columns and as '0.0' in float columns, and both are dropped.
    row_items = [str(cell_values[i, j]) for j in range(num_of_cols)]
    records.append([item for item in row_items if item != '0' and item != '0.0'])

# Show only a sample: displaying every transaction would flood the notebook output.
records[:5]

In [None]:
# Execute the Apriori algorithm

# min_support    = minimum support of an item set (fraction of all shopping lists that contain it)
# min_confidence = minimum confidence of a rule A -> B, i.e. support(A and B) / support(A)
# min_lift       = minimum lift of a rule A -> B, i.e. confidence(A -> B) / support(B);
#                  a lift above 1 means B is bought more often when A is in the basket

# If we want to include rules for items bought at least 5 times a day, then over a week
# they would have 35 expected purchases. The minimum support is therefore roughly
# 35 purchases out of 7500 shopping lists (about 0.0047); 0.0045 is used as a slightly
# more permissive threshold.

# NOTE: apyori has no `min_length` option (it reads min_support, min_confidence, min_lift
# and max_length; other keyword arguments are ignored), so none is passed. Single-item
# results have an empty antecedent and a lift of 1, so min_lift=2 already excludes them.
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=2)

# apriori returns a lazy generator; materialise it so the results can be indexed and reused.
association_results = list(association_rules)

In [None]:
# How many results the rule search returned
num_results = len(association_results)
print(num_results)

# Each entry bundles an item set with its support and the statistics of the rule
# derived from it; show the first entry as an example.
first_result = association_results[0]
print(first_result)

143
RelationRecord(items=frozenset({'burgers', 'almonds'}), support=0.0052, ordered_statistics=[OrderedStatistic(items_base=frozenset({'almonds'}), items_add=frozenset({'burgers'}), confidence=0.2565789473684211, lift=2.9424191211974895)])


In [None]:
# Print the statistics of every rule in a readable form.
#
# Each result is a RelationRecord named tuple with the fields
#   items              - the item set the rule(s) were derived from
#   support            - support of that item set
#   ordered_statistics - list of OrderedStatistic(items_base, items_add, confidence, lift),
#                        one per rule "items_base -> items_add" that passed the thresholds
#
# All ordered statistics are printed (an item set can yield more than one rule), and the
# rule is shown with its direction, which the unordered item set alone does not convey.

for record in association_results:

    for statistic in record.ordered_statistics:

        # Sort the items so the printed order is stable between runs
        antecedent = sorted(statistic.items_base)
        consequent = sorted(statistic.items_add)

        print("Rule: " + str(antecedent) + " -> " + str(consequent))
        print("Support: " + str(record.support))
        print("Confidence: " + str(statistic.confidence))
        print("Lift: " + str(statistic.lift))
        print("=====================================")

Rule: 
['burgers', 'almonds']
Support: 0.0052
Confidence: 0.2565789473684211
Lift: 2.9424191211974895
Rule: 
['burgers', 'ham']
Support: 0.0056
Confidence: 0.21105527638190955
Lift: 2.420358674104467
Rule: 
['milk', 'cereals']
Support: 0.007066666666666666
Confidence: 0.27461139896373055
Lift: 2.118915115460884
Rule: 
['chicken', 'light cream']
Support: 0.004533333333333334
Confidence: 0.2905982905982906
Lift: 4.843304843304844
Rule: 
['chocolate', 'tomato sauce']
Support: 0.005066666666666666
Confidence: 0.3584905660377358
Lift: 2.1876967007998527
Rule: 
['mushroom cream sauce', 'escalope']
Support: 0.005733333333333333
Confidence: 0.30069930069930073
Lift: 3.7903273197390845
Rule: 
['pasta', 'escalope']
Support: 0.005866666666666667
Confidence: 0.37288135593220345
Lift: 4.700185158809287
Rule: 
['extra dark chocolate', 'mineral water']
Support: 0.005733333333333333
Confidence: 0.47777777777777775
Lift: 2.0052229061742213
Rule: 
['pancakes', 'fresh tuna']
Support: 0.005066666666666666