In [1]:
from math import ceil

import pandas as pd

from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

from eclat_algo import eclat, find_conf

import os.path as osp

## Read and transform data

In [2]:
# Load the processed CSV; the path is resolved relative to the current
# working directory.
csv_path = osp.join('..', 'Data', 'processed.csv')
data = pd.read_csv(csv_path)

# Summarise column names, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38006 entries, 0 to 38005
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38006 non-null  int64 
 1   itemDescription  38006 non-null  object
 2   Date             38006 non-null  int64 
 3   Month            38006 non-null  int64 
 4   Year             38006 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 1.4+ MB


In [121]:
# Choose which slice of the data to mine. The whole dataset is used here;
# alternative slices that can be swapped in instead:
#   data[(data['Month'] >= 10) & (data['Month'] <= 12)]
#   data[(data['Month'] >= 7) & (data['Month'] <= 12)]
#   data[data['Year'] == 2015]
#   data[data['Year'] == 2014]
#   data[data['Month'] == 2]
filtered_data = data

# A transaction id is the member number and the three date fields joined with
# '-' (e.g. "1808-21-7-2015"). Rows sharing a Tid are grouped into a single
# transaction by later cells.
key_columns = ['Member_number', 'Date', 'Month', 'Year']
key_parts = [filtered_data[column].astype('str') for column in key_columns]
transaction_ids = key_parts[0].str.cat(key_parts[1:], sep='-')

# Build a new frame instead of mutating in place: drop the key columns, which
# are now folded into Tid, and append the id column.
transformed_df = filtered_data.drop(columns=key_columns).assign(Tid=transaction_ids)
transformed_df

Unnamed: 0,itemDescription,Tid
0,tropical fruit,1808-21-7-2015
1,whole milk,2552-5-1-2015
2,pip fruit,2300-19-9-2015
3,other vegetables,1187-12-12-2015
4,whole milk,3037-1-2-2015
...,...,...
38001,sliced cheese,4471-8-10-2014
38002,candy,2022-23-2-2014
38003,cake bar,1097-16-4-2014
38004,fruit/vegetable juice,1510-3-12-2014


## Frequent Itemset Mining

In [122]:
# Mining thresholds: minimum support and (presumably) minimum confidence,
# both expressed as fractions.
minSup, minConf = .01, .1

# Convert the relative support threshold into an absolute count: the number
# of distinct transaction ids times minSup, rounded up.
no_transacts = transformed_df['Tid'].nunique(dropna=False)
minFreq = ceil(minSup * no_transacts)
minFreq

150

In [123]:
# Gather every transaction's items into one list per Tid.
transactions = transformed_df.groupby('Tid').agg(list)

# One-hot encode the item lists into a transaction x item table, with one
# column per distinct item, as required by mlxtend's frequent-pattern miners.
baskets = transactions['itemDescription'].values
encoder = TransactionEncoder()
array = encoder.fit(baskets).transform(baskets)
fp_df = pd.DataFrame(array, columns=encoder.columns_)

In [124]:
# Run mlxtend's FP-Growth on the one-hot table with the same support
# threshold; its output is compared with the local ECLAT results below.
test_result = fpgrowth(fp_df, min_support=minSup, use_colnames=True)

In [125]:
# Run the project's own ECLAT implementation (imported from `eclat_algo`) on
# the long-format frame with the same support threshold as FP-Growth.
# NOTE(review): the third argument presumably names the item column, and the
# result is used below as a frame with 'itemsets' and 'frequencies' columns;
# confirm against `eclat`'s signature.
my_results = eclat(transformed_df, minSup, 'itemDescription')

In [126]:
# Order the ECLAT itemsets by ascending frequency and keep only those that
# contain at least two items. Both frames are reused by the rule-mining cell.
sorted_results = my_results.sort_values(by='frequencies')
has_multiple_items = sorted_results['itemsets'].map(len) >= 2
two_set = sorted_results[has_multiple_items]

# Last four rows in reverse order, i.e. the four most frequent multi-item sets.
two_set.tail(4).iloc[::-1]

Unnamed: 0,itemsets,frequencies,support
68,"[other vegetables, whole milk]",222,0.014837
67,"[rolls/buns, whole milk]",209,0.013968
65,"[soda, whole milk]",174,0.011629
64,"[yogurt, whole milk]",167,0.011161


In [127]:
# Same view for the FP-Growth result: sort by ascending support, keep the
# itemsets with two or more items, and show the four best-supported ones.
sorted_test = test_result.sort_values(by='support')
multi_item_test = sorted_test[sorted_test['itemsets'].map(len) >= 2]
multi_item_test.tail(4).iloc[::-1]

Unnamed: 0,support,itemsets
68,0.014837,"(whole milk, other vegetables)"
66,0.013968,"(whole milk, rolls/buns)"
65,0.011629,"(whole milk, soda)"
64,0.011161,"(yogurt, whole milk)"


## Association Rule Mining

In [131]:
# Derive association rules from every frequent itemset with two or more
# items, using the project's `find_conf` helper. The resulting table carries a
# 'lift' column alongside frequencies, support and confidence.
rules = find_conf(two_set['itemsets'].values, sorted_results)

# The lift filter used to be a bare expression whose result was thrown away,
# so it had no effect on what the cell showed. Keep the filtered frame in a
# variable and report how many rules pass, so the check is actually visible.
positive_lift_rules = rules[rules['lift'] >= 1]
print(f"{len(positive_lift_rules)} of {len(rules)} rules have lift >= 1")

# NOTE(review): `minConf` is defined in an earlier cell but is not applied
# here; confirm whether the rules should also be filtered by confidence.

# Display the complete rule table.
rules

Unnamed: 0,rule,frequencies,support,confidence,lift
0,"((rolls/buns,), (other vegetables,))",158,0.010559,0.09599,0.786154
1,"((other vegetables,), (rolls/buns,))",158,0.010559,0.086481,0.786154
2,"((yogurt,), (whole milk,))",167,0.011161,0.129961,0.82294
3,"((whole milk,), (yogurt,))",167,0.011161,0.070673,0.82294
4,"((soda,), (whole milk,))",174,0.011629,0.119752,0.758296
5,"((whole milk,), (soda,))",174,0.011629,0.073635,0.758296
6,"((rolls/buns,), (whole milk,))",209,0.013968,0.126974,0.804028
7,"((whole milk,), (rolls/buns,))",209,0.013968,0.088447,0.804028
8,"((other vegetables,), (whole milk,))",222,0.014837,0.121511,0.76943
9,"((whole milk,), (other vegetables,))",222,0.014837,0.093948,0.76943
