In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn import preprocessing
#Transform transactions into a one-hot encoded NumPy array. 
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Data Preparation

In [3]:
# First we import the dataset that will be used for the training and testing process
df = pd.read_csv('data/Groceries_dataset.csv')
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [4]:
# Label Encoding Process

# AsTo help make the digestion of the data easier, we will begin to encode the values into discrete forms.

In [5]:
# Encode the dates into numbers
le = preprocessing.LabelEncoder()
le.fit(df['Date'])
le_dict = dict(zip(le.classes_, le.transform(le.classes_)))

In [6]:
# Now that we have converted the dates to discrete numbers we do a sense check to determine if that has been properly coded.
dict(list(le_dict.items())[0:5])
# We see that it has and then proceed forward.

{'01-01-2014': 0,
 '01-01-2015': 1,
 '01-02-2014': 2,
 '01-02-2015': 3,
 '01-03-2014': 4}

In [7]:
categorical_df = pd.DataFrame()
categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [8]:
#categorical_df['Date'] = le.dict
new_df = pd.DataFrame.from_dict(le_dict, orient='index')
new_df.head()

Unnamed: 0,0
01-01-2014,0
01-01-2015,1
01-02-2014,2
01-02-2015,3
01-03-2014,4


In [10]:
merged_df = pd.merge(df,new_df,left_on='Date',right_index=True)
merged_df.tail()

Unnamed: 0,Member_number,Date,itemDescription,0
36938,3559,12-11-2014,napkins,284
37264,3287,12-11-2014,cleaner,284
37269,1312,12-11-2014,soft cheese,284
37989,3914,12-11-2014,specialty bar,284
38216,2111,12-11-2014,domestic eggs,284


In [11]:
# We then merge the 'date' column into the table.
merged_df.columns.values[3] = 'date'
merged_df.head()

Unnamed: 0,Member_number,Date,itemDescription,date
0,1808,21-07-2015,tropical fruit,493
12,1997,21-07-2015,frankfurter,493
14,4736,21-07-2015,butter,493
213,3812,21-07-2015,sausage,493
318,4429,21-07-2015,grapes,493


In [12]:
# This can be deleted as we no longer need it, the Date column is suitable.
# Come back to this as it is currently not working.
merged_df.drop(columns=['Date'],axis=0)
merged_df.head()

Unnamed: 0,Member_number,Date,itemDescription,date
0,1808,21-07-2015,tropical fruit,493
12,1997,21-07-2015,frankfurter,493
14,4736,21-07-2015,butter,493
213,3812,21-07-2015,sausage,493
318,4429,21-07-2015,grapes,493


In [13]:
# We then do a groupby function focusing on the Date as the primary key.
basket = (merged_df
          .groupby(['Date', 'itemDescription'])['itemDescription']
          .count().unstack().reset_index().fillna(0)
          .set_index('Date'))

In [14]:
basket

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01-01-2014,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,4.0,0.0
01-01-2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0
01-02-2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,3.0,0.0
01-02-2015,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,9.0,6.0,0.0
01-03-2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,3.0,0.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31-07-2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2.0,0.0,0.0,0.0,6.0,1.0,0.0
31-08-2014,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0,3.0,0.0
31-08-2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,3.0,0.0
31-10-2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0


In [15]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

basket_sets = basket.applymap(encode_units)

In [76]:
# NOTE: Minimum Support will only function at 0.15, would recommend further investigation as to why this is the case. If proceeding to <0.15, the system goes into a loop.
frequent_itemsets = apriori(basket_sets, min_support=0.15, use_colnames=True)

In [75]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(bottled beer),(UHT-milk),0.578297,0.365385,0.21978,0.380048,1.04013,0.00848,1.023652
1,(UHT-milk),(bottled beer),0.365385,0.578297,0.21978,0.601504,1.04013,0.00848,1.058237
2,(brown bread),(UHT-milk),0.54533,0.365385,0.20467,0.375315,1.027178,0.005415,1.015896
3,(UHT-milk),(brown bread),0.365385,0.54533,0.20467,0.56015,1.027178,0.005415,1.033695
4,(butter),(UHT-milk),0.506868,0.365385,0.197802,0.390244,1.068036,0.0126,1.040769


In [70]:
# This will help tell us how many were designed by the system.
rules_list = list(rules)
print(len(rules_list))

9


In [71]:
rules_list[0]

'antecedents'

TypeError: apriori() got an unexpected keyword argument 'min_lift'