In [1]:
# Import the required librairies.
# for basic operations
import numpy as np
import pandas as pd

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# for market basket analysis
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Loading and exploring the data
# reading the dataset
data = pd.read_csv('store_data.csv', header = None)

In [2]:
# Load data and display the first 7 rows of the table.
print('Displaying the first 7 rows of the table ')
data.head(7)

Displaying the first 7 rows of the table 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
5,low fat yogurt,,,,,,,,,,,,,,,,,,,
6,whole wheat pasta,french fries,,,,,,,,,,,,,,,,,,


In [3]:
# How many products is purchased during the week the data was recorded?
week = data.head(7)
week_count = sum(week.count())
print('How many products is purchased during the week the data was recorded? =>',week_count)

How many products is purchased during the week the data was recorded? => 34


In [4]:
# Count the number of unique products in the store 
# making each customers shopping items an identical list
trans = []
for i in range(0, 7501):
    trans.append([str(data.values[i,j]) for j in range(0, 20)])
    
# conveting it into an numpy array
trans = np.array(trans)

from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
data = te.fit_transform(trans)
data = pd.DataFrame(data, columns = te.columns_)

#unique count
unique = data.columns
print('Count of the number of unique products in the store :',len(unique))

Count of the number of unique products in the store : 121


In [5]:
from apyori import apriori as apriori1
#Load data 
data1 = pd.read_csv('store_data.csv',header=None)
shape = data1.shape
# Reshaping the data: The Apyori library we are using requires our dataset to be in the form of a list of lists.
trans1 = []
for i in range(0, shape[0]):
    trans1.append([str(data1.values[i,j]) for j in range(0, shape[1])])
# conveting it into an numpy array
trans1 = np.array(trans1)
print('List of list store data',trans1)

List of list store data [['shrimp' 'almonds' 'avocado' ... 'frozen smoothie' 'spinach'
  'olive oil']
 ['burgers' 'meatballs' 'eggs' ... 'nan' 'nan' 'nan']
 ['chutney' 'nan' 'nan' ... 'nan' 'nan' 'nan']
 ...
 ['chicken' 'nan' 'nan' ... 'nan' 'nan' 'nan']
 ['escalope' 'green tea' 'nan' ... 'nan' 'nan' 'nan']
 ['eggs' 'frozen smoothie' 'yogurt cake' ... 'nan' 'nan' 'nan']]


In [6]:
# compute the number of rules that can be generated if we proceed by brute force approach?


In [7]:
# Sort the data proportionally to the number of items in each transaction (largest transactions first).
data['count']=data.count(axis=1)
data = data.sort_values(by=['count'],ascending=False)
data.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini,count
0,False,True,True,False,True,False,False,False,False,False,...,True,False,False,True,False,False,True,False,False,121
4997,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,121
5009,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,121
5008,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,121
5007,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,121


In [8]:
#We are only interested in finding rules for the items that have certain minimum occurence (e.g. support) 
#and have a minimum value for co-occurrence with other items (e.g. confidence). 
#The first task is therefore to make a good strategy for setting optimal minimum values for support and confidence. 
#Try to argue your strategy.
from apyori import apriori as apriori1
#Load data 
data1 = pd.read_csv('store_data.csv',header=None)
shape = data1.shape
# making each customers shopping items an identical list
trans = []
for i in range(0, shape[0]):
    trans.append([str(data1.values[i,j]) for j in range(0, shape[1])])
# conveting it into an numpy array
trans = np.array(trans)
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
data1 = te.fit_transform(trans)
data1 = pd.DataFrame(data1, columns = te.columns_)
# association_rules(records)
#building the apriori model
association_rules  = apriori1(trans, min_support = 0.003, min_confidence = 0.2, min_lift = 3, min_length = 2)
association_results = list(association_rules)
#getting the number of rules
print(len(association_results))
l1=[]
for i in range(0,len(association_results)-1):
    pair = association_results[i]
    list1 =[pair.items,pair.support,pair.ordered_statistics[0][2],pair.ordered_statistics[0][3]]
    l1.append(list1)
df = pd.DataFrame(l1,columns=['Rule','Support','Confidence','Lift'])
edge_list = list(df['Rule'])
df

160


Unnamed: 0,Rule,Support,Confidence,Lift
0,"(chicken, light cream)",0.004533,0.290598,4.843951
1,"(mushroom cream sauce, escalope)",0.005733,0.300699,3.790833
2,"(escalope, pasta)",0.005866,0.372881,4.700812
3,"(honey, fromage blanc)",0.003333,0.245098,5.164271
4,"(herb & pepper, ground beef)",0.015998,0.323450,3.291994
...,...,...,...,...
154,"(spaghetti, ground beef, milk, mineral water, ...",0.004399,0.200000,3.348661
155,"(spaghetti, ground beef, olive oil, mineral wa...",0.003066,0.216981,3.632981
156,"(spaghetti, ground beef, mineral water, pancak...",0.003066,0.211009,3.532991
157,"(spaghetti, tomatoes, ground beef, mineral wat...",0.003066,0.261364,4.376091


In [14]:
#Generate the frequent itemsets and the rules considering the obtained support and confidence thresholds.
sort_data = pd.read_csv('store_data.csv', header = None)

transactions = []
for i in range(0, shape[0]):
    transactions.append([str(sort_data.values[i,j]) for j in range(0, shape[1])])  
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori as app
from mlxtend.frequent_patterns import association_rules
import time
support_threshold = 0.004
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
# apriori
from mlxtend.frequent_patterns import apriori as app
frequent_itemsets = app(df, min_support=0.003, use_colnames=True)
print(frequent_itemsets)
#dataframe with the itemsets


       support                                           itemsets
0     0.020397                                          (almonds)
1     0.008932                                (antioxydant juice)
2     0.004666                                        (asparagus)
3     0.033329                                          (avocado)
4     0.004533                                      (babies food)
...        ...                                                ...
2881  0.003066  (spaghetti, ground beef, mineral water, pancak...
2882  0.003066  (spaghetti, tomatoes, ground beef, mineral wat...
2883  0.003333   (spaghetti, milk, olive oil, mineral water, nan)
2884  0.003066      (spaghetti, milk, mineral water, shrimp, nan)
2885  0.003333    (spaghetti, tomatoes, milk, mineral water, nan)

[2886 rows x 2 columns]


In [10]:
#Display the generated rules. How many they are? Try to analyze these associations.


In [11]:
# Before visualizing the results, display the rules in a more clear way; in a table showing the support, 
#the confidence, and lift for each rule.



In [12]:
# For the visualization task, you can choose to answer one of the two following questions:
# We can visualize rulesets as graphs, with items as nodes and connections as arcs between them. 
#This help to find interesting (subjective) patterns from the data. To achieve this goal, 
#use NetworkXpython library.
import networkx as nx
G=nx.Graph()
G.add_nodes_from(y.index)
edge_list = [list(x) for x in edge_list]
edge_list
G.add_edges_from(edge_list)

NameError: name 'y' is not defined

In [None]:
# Draw a discrete heat map for the most frequent binary items. Use many colors, 
# for example; the red cells present the items having the highest co-occurrence, 
#and white cells present those having the lowest co-occurrence. You can use pandas, matplotlib and
#seaborn visualization library.


In [None]:
#Apply the algorithm apriroi and fp-growth implemented in frequent_patterns library. 
#Then, find the time spent to generate the rules by the two apriori implementations and fp-growth. 
#Based on these runtimes, compare the effectiveness and complexity of the two algorithms.
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori as apriori1
from mlxtend.frequent_patterns import fpgrowth

fpgrowth(data1, min_support=0.1)

te = TransactionEncoder()
te_ary = te.fit(trans).transform(trans)
df = pd.DataFrame(te_ary, columns=te.columns_)

%timeit -n 100 -r 10 apriori1(df, min_support=0.6)

%timeit -n 100 -r 10 apriori1(df, min_support=0.6, low_memory=True)

%timeit -n 100 -r 10 fpgrowth(df, min_support=0.6)