In [51]:
# for basic operations
import numpy as np
import pandas as pd

# for visualizations
import matplotlib.pyplot as plt
import squarify
import seaborn as sns
# switch matplotlib to its bundled 'fivethirtyeight' style sheet
plt.style.use('fivethirtyeight')

# for defining path
import os

# for market basket analysis
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


**Sliding windows**

In [52]:
# Load the raw transactions; header=None tells pandas the file has no header
# row, so the first line is read as data.
data = pd.read_csv(
    'dataset.csv',
    header=None,
)

# (rows, columns) of the freshly loaded frame
data.shape

(7501, 20)

In [53]:
# first five rows of the raw transactions
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [54]:
# last five rows of the raw transactions
data.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,
7500,eggs,frozen smoothie,yogurt cake,low fat yogurt,,,,,,,,,,,,,,,,


In [55]:
# ten randomly chosen transactions; random_state pins the draw so the same
# rows are shown every time the notebook is re-run
data.sample(10, random_state = 42)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
5675,turkey,spaghetti,avocado,,,,,,,,,,,,,,,,,
2619,spaghetti,french fries,escalope,,,,,,,,,,,,,,,,,
3504,mineral water,chocolate,,,,,,,,,,,,,,,,,,
4129,burgers,french wine,eggs,barbecue sauce,cottage cheese,melons,fresh bread,white wine,,,,,,,,,,,,
886,turkey,frozen vegetables,avocado,cake,light cream,cooking oil,chicken,chocolate bread,mashed potato,,,,,,,,,,,
1516,french fries,mayonnaise,,,,,,,,,,,,,,,,,,
1223,fresh tuna,spaghetti,mineral water,salmon,,,,,,,,,,,,,,,,
2031,fromage blanc,eggs,french fries,,,,,,,,,,,,,,,,,
4625,meatballs,,,,,,,,,,,,,,,,,,,
214,olive oil,cookies,mushroom cream sauce,,,,,,,,,,,,,,,,,


In [56]:
# summary statistics for every column of the raw frame
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,7501,5747,4389,3345,2529,1864,1369,981,654,395,256,154,87,47,25,8,4,4,3,1
unique,115,117,115,114,110,106,102,98,88,80,66,50,43,28,19,8,3,3,3,1
top,mineral water,mineral water,mineral water,mineral water,green tea,french fries,green tea,green tea,green tea,green tea,low fat yogurt,green tea,green tea,green tea,magazines,chocolate,frozen smoothie,protein bar,mayonnaise,olive oil
freq,577,484,375,201,153,107,96,67,57,31,22,15,8,4,3,1,2,2,1,1


## Data Visualizations

In [57]:
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud

# Word cloud of the item listed first in each transaction.
# The frequencies come from the whole column: str(data[0]) would only hold
# pandas' truncated text repr of the Series, not the column's contents.
first_item_freqs = data[0].value_counts().to_dict()

plt.rcParams['figure.figsize'] = (15, 15)
wordcloud = WordCloud(background_color = 'yellow', width = 1200,  height = 1200, max_words = 121)
wordcloud = wordcloud.generate_from_frequencies(first_item_freqs)

# the cloud must be drawn onto the axes explicitly; plt.show() alone renders nothing
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Most Popular First Items', fontsize = 20)
plt.show();

In [58]:

# occurrence count of each first-position item, most frequent first
item_counts = data[0].value_counts()

# keep the 50 most frequent as a one-column frame and list their names
y = item_counts.head(50).to_frame()
y.index

Index(['mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables',
       'spaghetti', 'shrimp', 'grated cheese', 'eggs', 'cookies',
       'french fries', 'herb & pepper', 'ground beef', 'tomatoes', 'milk',
       'escalope', 'fresh tuna', 'red wine', 'ham', 'cake', 'green tea',
       'whole wheat pasta', 'pancakes', 'soup', 'muffins', 'energy bar',
       'olive oil', 'champagne', 'pepper', 'avocado', 'butter',
       'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 'chicken',
       'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie',
       'yogurt cake', 'salmon', 'hot dogs', 'dessert wine', 'honey', 'cereals',
       'candy bars', 'yams', 'strawberries', 'oil', 'tomato sauce'],
      dtype='object')

In [59]:
import networkx as nx

# constant label shared by every row; used as the edge source below
data['food'] = 'Food'

# rows whose index label lies between -1 and 15 (inclusive)
food_rows = data.truncate(before = -1, after = 15)

# one edge per row, from the 'food' column value to that row's column-0 item;
# edge_attr = True attaches the remaining columns as edge attributes
food = nx.from_pandas_edgelist(food_rows, source = 'food', target = 0, edge_attr = True)

In [60]:
import warnings
warnings.filterwarnings('ignore')

# spring_layout starts from random positions; a fixed seed keeps the layout
# identical across kernel restarts
pos = nx.spring_layout(food, seed = 42)

# NOTE(review): np.linspace(0, 15, 1) yields the single value 0.0, so this is one
# RGBA colour from the low end of Wistia. If a gradient was intended, the
# arguments were probably meant to be (0, 1, 15) -- confirm before changing.
color = plt.cm.Wistia(np.linspace(0, 15, 1))


In [61]:
# constant label column; with edge_attr = True it is carried as an edge attribute
data['secondchoice'] = 'Second Choice'

# Rows with index labels up to 15, minus baskets that have no second item:
# a NaN in column 1 would otherwise be added to the graph as a node.
second_rows = data.truncate(before = -1, after = 15).dropna(subset = [1])

# NOTE(review): the edge source is still the 'food' column, not 'secondchoice' --
# confirm this is intended
secondchoice = nx.from_pandas_edgelist(second_rows, source = 'food', target = 1, edge_attr = True)

In [62]:
import warnings
warnings.filterwarnings('ignore')

# fixed seed so the randomly initialised spring layout is the same on every run
pos = nx.spring_layout(secondchoice, seed = 42)

# NOTE(review): np.linspace(0, 15, 1) yields the single value 0.0, i.e. one RGBA
# colour from the low end of Blues; (0, 1, 15) may have been intended -- confirm.
color = plt.cm.Blues(np.linspace(0, 15, 1))


In [63]:
# constant label column; with edge_attr = True it is carried as an edge attribute
data['thirdchoice'] = 'Third Choice'

# Rows with index labels up to 10, minus baskets that have no third item:
# a NaN in column 2 would otherwise be added to the graph as a node.
third_rows = data.truncate(before = -1, after = 10).dropna(subset = [2])

# NOTE(review): this graph is built from column 2 (the third item) but is stored
# under the name `secondchoice`, replacing the second-choice graph. The name is
# kept because the layout cell that follows reads `secondchoice`.
secondchoice = nx.from_pandas_edgelist(third_rows, source = 'food', target = 2, edge_attr = True)

In [64]:
import warnings
warnings.filterwarnings('ignore')


# fixed seed so the randomly initialised spring layout is the same on every run
pos = nx.spring_layout(secondchoice, seed = 42)

# NOTE(review): np.linspace(0, 15, 1) yields the single value 0.0, i.e. one RGBA
# colour from the low end of Reds; (0, 1, 15) may have been intended -- confirm.
color = plt.cm.Reds(np.linspace(0, 15, 1))


## Data Preprocessing

In [65]:
# making each customers shopping items an identical list
trans = []
for i in range(0, 7501):
    trans.append([str(data.values[i,j]) for j in range(0, 20)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

(7501, 20)


## Using Transaction Sliding windows

In [66]:
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# fit the encoder on the transactions, then encode them;
# te.columns_ supplies the labels for the encoded columns
te = TransactionEncoder()
encoded = te.fit(trans).transform(trans)
data = pd.DataFrame(encoded, columns = te.columns_)

# (rows, columns) of the encoded frame
data.shape

(7501, 121)

In [67]:
import warnings
warnings.filterwarnings('ignore')

# Working with every encoded column at once would be messy,
# so keep only the 40 item columns listed here.
selected_items = [
    'mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables',
    'spaghetti', 'shrimp', 'grated cheese', 'eggs', 'cookies',
    'french fries', 'herb & pepper', 'ground beef', 'tomatoes', 'milk',
    'escalope', 'fresh tuna', 'red wine', 'ham', 'cake',
    'green tea', 'whole wheat pasta', 'pancakes', 'soup', 'muffins',
    'energy bar', 'olive oil', 'champagne', 'avocado', 'pepper',
    'butter', 'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 'chicken',
    'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie', 'yogurt cake',
]
data = data[selected_items]

# (rows, columns) after the selection
data.shape

(7501, 40)

In [68]:
# column labels that remain after the selection
data.columns

Index(['mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables',
       'spaghetti', 'shrimp', 'grated cheese', 'eggs', 'cookies',
       'french fries', 'herb & pepper', 'ground beef', 'tomatoes', 'milk',
       'escalope', 'fresh tuna', 'red wine', 'ham', 'cake', 'green tea',
       'whole wheat pasta', 'pancakes', 'soup', 'muffins', 'energy bar',
       'olive oil', 'champagne', 'avocado', 'pepper', 'butter',
       'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 'chicken',
       'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie',
       'yogurt cake'],
      dtype='object')

In [69]:
# first five rows of the one-hot encoded frame
data.head(5)

Unnamed: 0,mineral water,burgers,turkey,chocolate,frozen vegetables,spaghetti,shrimp,grated cheese,eggs,cookies,...,butter,parmesan cheese,whole wheat rice,low fat yogurt,chicken,vegetables mix,pickles,meatballs,frozen smoothie,yogurt cake
0,True,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,True,False,False,True,False
1,False,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


## Frequent item set

In [70]:
from mlxtend.frequent_patterns import apriori

# Items and itemsets with a support of at least 0.01 (1% of transactions),
# reported with item names instead of column positions.
apriori(data, use_colnames = True, min_support = 0.01)


Unnamed: 0,support,itemsets
0,0.238368,(mineral water)
1,0.087188,(burgers)
2,0.062525,(turkey)
3,0.163845,(chocolate)
4,0.095321,(frozen vegetables)
...,...,...
204,0.010132,"(eggs, ground beef, mineral water)"
205,0.013065,"(eggs, milk, mineral water)"
206,0.011065,"(ground beef, milk, mineral water)"
207,0.010532,"(chocolate, eggs, spaghetti)"


## Selecting and Filtering the Results

In [73]:
# mine again with a 0.07 support threshold and record how many items each set holds
frequent_itemsets = apriori(data, use_colnames = True, min_support = 0.07)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.238368,(mineral water),1
1,0.087188,(burgers),1
2,0.163845,(chocolate),1
3,0.095321,(frozen vegetables),1
4,0.17411,(spaghetti),1
5,0.071457,(shrimp),1
6,0.179709,(eggs),1
7,0.080389,(cookies),1
8,0.170911,(french fries),1
9,0.098254,(ground beef),1


In [74]:
# two-item sets with a support of at least 0.01
# NOTE(review): frequent_itemsets was mined with min_support = 0.07, so the 0.01
# bound cannot exclude anything here -- confirm the intended threshold
is_pair = frequent_itemsets['length'] == 2
has_support = frequent_itemsets['support'] >= 0.01
frequent_itemsets[is_pair & has_support]

Unnamed: 0,support,itemsets,length


In [75]:
# single-item sets with a support of at least 0.01
is_single = frequent_itemsets['length'] == 1
has_support = frequent_itemsets['support'] >= 0.01
frequent_itemsets[is_single & has_support]

Unnamed: 0,support,itemsets,length
0,0.238368,(mineral water),1
1,0.087188,(burgers),1
2,0.163845,(chocolate),1
3,0.095321,(frozen vegetables),1
4,0.17411,(spaghetti),1
5,0.071457,(shrimp),1
6,0.179709,(eggs),1
7,0.080389,(cookies),1
8,0.170911,(french fries),1
9,0.098254,(ground beef),1


## Association Mining

In [76]:
# rows whose itemset is exactly {eggs, mineral water}
wanted = {'eggs', 'mineral water'}
frequent_itemsets.loc[frequent_itemsets['itemsets'] == wanted]


Unnamed: 0,support,itemsets,length


In [77]:
# rows whose itemset is exactly {mineral water}
wanted = {'mineral water'}
frequent_itemsets.loc[frequent_itemsets['itemsets'] == wanted]


Unnamed: 0,support,itemsets,length
0,0.238368,(mineral water),1


In [78]:
# rows whose itemset is exactly {milk}
wanted = {'milk'}
frequent_itemsets.loc[frequent_itemsets['itemsets'] == wanted]


Unnamed: 0,support,itemsets,length
10,0.129583,(milk),1


In [79]:
# rows whose itemset is exactly {chicken}
wanted = {'chicken'}
frequent_itemsets.loc[frequent_itemsets['itemsets'] == wanted]


Unnamed: 0,support,itemsets,length


In [80]:
# rows whose itemset is exactly {frozen vegetables}
wanted = {'frozen vegetables'}
frequent_itemsets.loc[frequent_itemsets['itemsets'] == wanted]


Unnamed: 0,support,itemsets,length
3,0.095321,(frozen vegetables),1


In [81]:
# rows whose itemset is exactly {chocolate}
wanted = {'chocolate'}
frequent_itemsets.loc[frequent_itemsets['itemsets'] == wanted]


Unnamed: 0,support,itemsets,length
2,0.163845,(chocolate),1
