In [1]:
from math import ceil

import pandas as pd

from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

from eclat_algo import eclat, assoc_rules

import os.path as osp

## Read and transform data

In [2]:
data = pd.read_csv(osp.join('..', 'Data', 'processed.csv'))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38006 entries, 0 to 38005
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38006 non-null  int64 
 1   itemDescription  38006 non-null  object
 2   Date             38006 non-null  int64 
 3   Month            38006 non-null  int64 
 4   Year             38006 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 1.4+ MB


In [8]:
filtered_data = data[data['Year'] == 2015]
# filtered_data = data[data['Year'] == 2014]
# filtered_data = data

transformed_df = filtered_data.copy(deep=True)
transformed_df['Tid'] = transformed_df['Member_number'].astype('str') + '-' +\
                        transformed_df['Date'].astype('str') + '-' +\
                        transformed_df['Month'].astype('str') + '-' +\
                        transformed_df['Year'].astype('str')
transformed_df.drop(['Member_number', 'Date', 'Month', 'Year'], axis=1, inplace=True)
transformed_df

Unnamed: 0,itemDescription,Tid
0,tropical fruit,1808-21-7-2015
1,whole milk,2552-5-1-2015
2,pip fruit,2300-19-9-2015
3,other vegetables,1187-12-12-2015
4,whole milk,3037-1-2-2015
...,...,...
36404,chocolate,1185-24-8-2015
36405,newspapers,4656-25-12-2015
36406,canned vegetables,2741-11-9-2015
36407,coffee,2654-17-8-2015


## Frequent Itemset Mining

In [9]:
minSup = .005
minConf = .1

no_transacts = len(transformed_df['Tid'].unique())
minFreq = ceil(no_transacts * minSup)
minFreq

35

In [10]:
my_results = eclat(transformed_df, minSup, 'itemDescription')

In [11]:
sorted_results = my_results.sort_values('frequencies')
k_set = sorted_results[sorted_results['itemsets'].apply(len) >= 2]
print(f"""Number of frequent itemsets: {len(sorted_results)}
Number of frequent k-itemset (k >=2): {len(k_set)}""")
k_set.iloc[:-6:-1]

Number of frequent itemsets: 186
Number of frequent k-itemset (k >=2): 93


Unnamed: 0,itemsets,frequencies,support
185,"[other vegetables, whole milk]",153,0.021913
184,"[rolls/buns, whole milk]",150,0.021484
182,"[sausage, whole milk]",120,0.017187
175,"[yogurt, whole milk]",119,0.017044
179,"[soda, whole milk]",107,0.015325


## Association Rules Mining

In [17]:
rules = assoc_rules(k_set['itemsets'].values, sorted_results)
lift_ge1 = rules[rules['lift'] >= 1]
print(f"""Number of association rules generated: {len(rules)}
Number of rules with lift >= 1: {len(lift_ge1)}""")
lift_ge1.sort_values(['support', 'confidence'], ascending=False)

Number of association rules generated: 186
Number of rules with lift >= 1: 38


Unnamed: 0,rule,support,confidence,lift
160,"((soda,), (sausage,))",0.011601,0.115549,1.115857
161,"((sausage,), (soda,))",0.011601,0.112033,1.115857
158,"((yogurt,), (sausage,))",0.010885,0.114458,1.105318
159,"((sausage,), (yogurt,))",0.010885,0.105118,1.105318
144,"((frankfurter,), (other vegetables,))",0.010026,0.142857,1.007504
145,"((other vegetables,), (frankfurter,))",0.010026,0.070707,1.007504
130,"((shopping bags,), (whole milk,))",0.009023,0.221053,1.134011
131,"((whole milk,), (shopping bags,))",0.009023,0.046289,1.134011
118,"((citrus fruit,), (yogurt,))",0.008307,0.109641,1.15288
119,"((yogurt,), (citrus fruit,))",0.008307,0.087349,1.15288
