In [118]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction import DictVectorizer

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from efficient_apriori import apriori as origin_apriori

In [122]:
# Read the transactions file. header=None makes pandas treat the first line
# as data and label the columns with integers.
market = pd.read_csv("Market_Basket_Optimisation.csv", header=None)

# Empty single-column frame that will receive one joined item string per row.
market_hot_encoded = pd.DataFrame(columns=["items"])

# Preview both frames, one after the other.
print(market.head(), market_hot_encoded.head(), sep="\n")

              0          1           2                 3             4   \
0         shrimp    almonds     avocado    vegetables mix  green grapes   
1        burgers  meatballs        eggs               NaN           NaN   
2        chutney        NaN         NaN               NaN           NaN   
3         turkey    avocado         NaN               NaN           NaN   
4  mineral water       milk  energy bar  whole wheat rice     green tea   

                 5     6               7             8             9   \
0  whole weat flour  yams  cottage cheese  energy drink  tomato juice   
1               NaN   NaN             NaN           NaN           NaN   
2               NaN   NaN             NaN           NaN           NaN   
3               NaN   NaN             NaN           NaN           NaN   
4               NaN   NaN             NaN           NaN           NaN   

               10         11     12     13             14      15  \
0  low fat yogurt  green tea  honey  sala

In [123]:
# Strip stray whitespace around item names. This is done column-wise through
# the vectorised .str accessor instead of re-assigning every row via iloc.
# Columns that pandas parsed as numeric (e.g. entirely empty ones) carry no
# text, so they are passed through untouched.
market = market.apply(
    lambda col: col if pd.api.types.is_numeric_dtype(col) else col.str.strip()
)

# Collapse each row into one "|"-terminated string of its distinct values,
# keeping first-seen order (no sorting is applied). Missing cells are rendered
# as the literal token "nan", exactly as str(NaN) produces, so later steps can
# still find and discard that token.
item_strings = [
    "".join(str(value) + "|" for value in pd.unique(basket))
    for basket in market.to_numpy(dtype=object)
]

# Build the frame in a single step rather than enlarging it one cell at a time.
market_hot_encoded = pd.DataFrame({"items": item_strings}, index=market.index)

market_hot_encoded.head()


Unnamed: 0,items
0,shrimp|almonds|avocado|vegetables mix|green gr...
1,burgers|meatballs|eggs|nan|
2,chutney|nan|
3,turkey|avocado|nan|
4,mineral water|milk|energy bar|whole wheat rice...


In [124]:
# One-hot encode the "|"-separated item strings: one indicator column per
# distinct token. The "nan" column is the placeholder produced by missing
# cells; errors="ignore" keeps the drop from raising KeyError when no row
# contained a missing value and that column therefore does not exist.
market_hot_encoded = (
    market_hot_encoded["items"]
    .str.get_dummies(sep="|")
    .drop(columns="nan", errors="ignore")
)
market_hot_encoded.head()

Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,bramble,brownies,bug spray,burger sauce,burgers,butter,cake,candy bars,carrots,cauliflower,cereals,champagne,chicken,chili,chocolate,chocolate bread,chutney,cider,clothes accessories,cookies,cooking oil,corn,cottage cheese,cream,dessert wine,eggplant,eggs,energy bar,energy drink,escalope,extra dark chocolate,flax seed,french fries,french wine,fresh bread,fresh tuna,fromage blanc,frozen smoothie,frozen vegetables,gluten free bar,...,melons,milk,mineral water,mint,mint green tea,muffins,mushroom cream sauce,napkins,nonfat milk,oatmeal,oil,olive oil,pancakes,parmesan cheese,pasta,pepper,pet food,pickles,protein bar,red wine,rice,salad,salmon,salt,sandwich,shallot,shampoo,shrimp,soda,soup,spaghetti,sparkling water,spinach,strawberries,strong cheese,tea,tomato juice,tomato sauce,tomatoes,toothpaste,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


## Mlxtend Apriori

In [94]:
# Mine frequent itemsets with a minimum support of 0.02.
# use_colnames=True labels the returned itemsets with the DataFrame's column
# names instead of column indices.
itemsets = apriori(
    market_hot_encoded,
    min_support=0.02,
    use_colnames=True,
).sort_values(by="support", ascending=False)

# Show the itemsets ordered from highest to lowest support.
print(itemsets)

     support                           itemsets
34  0.238368                    (mineral water)
13  0.179709                             (eggs)
44  0.174110                        (spaghetti)
17  0.170911                     (french fries)
9   0.163845                        (chocolate)
..       ...                                ...
0   0.020397                          (almonds)
80  0.020264   (mineral water, frozen smoothie)
98  0.020131  (mineral water, whole wheat rice)
78  0.020131           (pancakes, french fries)
67  0.020131       (mineral water, cooking oil)

[103 rows x 2 columns]


In [97]:
# Derive association rules from the frequent itemsets, filtering on the lift
# metric with a threshold of 2, then list them from highest to lowest lift.
rules = association_rules(
    itemsets,
    metric="lift",
    min_threshold=2,
).sort_values(by="lift", ascending=False)
print(rules)

     antecedents    consequents  antecedent support  consequent support  \
0    (spaghetti)  (ground beef)            0.174110            0.098254   
1  (ground beef)    (spaghetti)            0.098254            0.174110   

    support  confidence      lift  leverage  conviction  
0  0.039195    0.225115  2.291162  0.022088    1.163716  
1  0.039195    0.398915  2.291162  0.022088    1.373997  
