In [2]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn import preprocessing
#Transform transactions into a one-hot encoded NumPy array. 
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [3]:
# Data Preparation

In [4]:
# First we import the dataset that will be used for the training and testing process
df = pd.read_csv('data/Groceries_dataset.csv')
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [5]:
# Label Encoding Process

# AsTo help make the digestion of the data easier, we will begin to encode the values into discrete forms.

In [6]:
# Encode the dates into numbers
le = preprocessing.LabelEncoder()
le.fit(df['Date'])
le_dict = dict(zip(le.classes_, le.transform(le.classes_)))

In [7]:
# Now that we have converted the dates to discrete numbers we do a sense check to determine if that has been properly coded.
dict(list(le_dict.items())[0:5])
# We see that it has and then proceed forward.

{'01-01-2014': 0,
 '01-01-2015': 1,
 '01-02-2014': 2,
 '01-02-2015': 3,
 '01-03-2014': 4}

In [8]:
categorical_df = pd.DataFrame()
categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [9]:
#categorical_df['Date'] = le.dict
new_df = pd.DataFrame.from_dict(le_dict, orient='index')
new_df.head()

Unnamed: 0,0
01-01-2014,0
01-01-2015,1
01-02-2014,2
01-02-2015,3
01-03-2014,4


In [10]:
# new_df.rename['0']

In [11]:
merged_df = pd.merge(df,new_df,left_on='Date',right_index=True)
merged_df.tail()

Unnamed: 0,Member_number,Date,itemDescription,0
36938,3559,12-11-2014,napkins,284
37264,3287,12-11-2014,cleaner,284
37269,1312,12-11-2014,soft cheese,284
37989,3914,12-11-2014,specialty bar,284
38216,2111,12-11-2014,domestic eggs,284


In [12]:
merged_df.columns.values[3] = 'date'
merged_df.head()

Unnamed: 0,Member_number,Date,itemDescription,date
0,1808,21-07-2015,tropical fruit,493
12,1997,21-07-2015,frankfurter,493
14,4736,21-07-2015,butter,493
213,3812,21-07-2015,sausage,493
318,4429,21-07-2015,grapes,493


In [23]:
# Come back to this as it is currently not working.
merged_df.drop(columns=['Date'],axis=0)
merged_df.head()

Unnamed: 0,Member_number,Date,itemDescription,date
0,1808,21-07-2015,tropical fruit,493
12,1997,21-07-2015,frankfurter,493
14,4736,21-07-2015,butter,493
213,3812,21-07-2015,sausage,493
318,4429,21-07-2015,grapes,493


In [48]:
# We then do a groupby function focusing on the Date as the primary key.
basket = (merged_df
          .groupby(['Date', 'itemDescription'])['itemDescription']
          .sum().unstack().reset_index().fillna(0)
          .set_index('Date'))

In [49]:
basket

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01-01-2014,Instant food productsInstant food products,0,0,0,0,0,0,0,0,berries,...,0,0,waffleswaffles,whipped/sour creamwhipped/sour cream,0,0,0,whole milkwhole milk,yogurtyogurtyogurtyogurt,0
01-01-2015,0,0,0,0,0,0,0,0,beef,0,...,turkey,0,0,whipped/sour cream,0,white bread,0,whole milkwhole milkwhole milk,0,0
01-02-2014,0,0,0,0,0,0,0,0,beefbeef,0,...,0,0,waffleswaffles,whipped/sour cream,0,white bread,0,whole milk,yogurtyogurtyogurt,0
01-02-2015,0,UHT-milk,0,artif. sweetener,0,0,0,0,beefbeef,0,...,0,0,waffles,0,0,white bread,0,whole milkwhole milkwhole milkwhole milkwhole ...,yogurtyogurtyogurtyogurtyogurtyogurt,0
01-03-2014,0,0,0,0,0,0,0,0,0,berries,...,0,0,waffles,whipped/sour creamwhipped/sour creamwhipped/so...,0,white bread,0,whole milk,yogurt,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31-07-2015,0,0,0,0,0,0,0,0,0,0,...,0,vinegar,0,whipped/sour creamwhipped/sour cream,0,0,0,whole milkwhole milkwhole milkwhole milkwhole ...,yogurt,0
31-08-2014,0,UHT-milk,0,0,0,0,0,0,beef,0,...,0,0,waffles,0,0,0,white wine,whole milkwhole milkwhole milkwhole milk,yogurtyogurtyogurt,0
31-08-2015,0,0,0,0,0,0,0,0,0,berries,...,0,0,0,whipped/sour cream,whisky,0,0,whole milkwhole milkwhole milk,yogurtyogurtyogurt,0
31-10-2014,0,0,0,0,0,0,0,0,beef,0,...,0,0,0,0,0,0,white wine,whole milkwhole milk,yogurt,0


In [50]:
# We then convert the column values into numerical
basket.columns


Index(['Instant food products', 'UHT-milk', 'abrasive cleaner',
       'artif. sweetener', 'baby cosmetics', 'bags', 'baking powder',
       'bathroom cleaner', 'beef', 'berries',
       ...
       'turkey', 'vinegar', 'waffles', 'whipped/sour cream', 'whisky',
       'white bread', 'white wine', 'whole milk', 'yogurt', 'zwieback'],
      dtype='object', name='itemDescription', length=167)