In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pyfpgrowth

pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')
sns.set()

In [2]:
def parse_file(filename):
    # open file and read the content of the file
    with open(filename,'r') as data:
        content = data.read()
        # get unique items in the dataset
        items = content.replace('\n',',').split(',')
        items.remove('')  #remove blank rows/transaction
        items = set(items)
        
        # get all line by line transactions
        all_lines = content.split('\n')
        all_lines.remove('')  #remove blank rows/transaction
        no_of_line = len(all_lines)
        print('There are {} unique items in the dataset'.format(len(items)))
        
        # create a dataframe where if Item is present in Transaction in dataset then its value is 1
        raw_data = pd.DataFrame(data=0,index=range(0,no_of_line),columns=items)
        for index,line in enumerate(all_lines):
            print("Parsing {}/{} transaction".format(index,no_of_line),end='\r')
            item_list = line.strip().split(',')
            raw_data.loc[index,item_list] = 1   
    return raw_data

In [3]:
raw_data = parse_file('groceries.csv')
raw_data.sample(3)

There are 169 unique items in the dataset
Parsing 9834/9835 transaction

Unnamed: 0,packaged fruit/vegetables,meat spreads,soft cheese,beef,potted plants,prosecco,flower (seeds),sauces,baby cosmetics,brown bread,liquor,semi-finished bread,cereals,dish cleaner,organic sausage,baby food,pork,cookware,yogurt,sausage,make up remover,misc. beverages,male cosmetics,grapes,frozen potato products,long life bakery product,bottled beer,sparkling wine,pickled vegetables,rum,mayonnaise,cocoa drinks,fish,canned vegetables,chocolate marshmallow,soap,oil,brandy,margarine,red/blush wine,coffee,turkey,specialty fat,cling film/bags,napkins,salt,flour,canned beer,specialty cheese,specialty chocolate,...,roll products,cake bar,domestic eggs,honey,snack products,light bulbs,whisky,bathroom cleaner,condensed milk,photo/film,tea,chicken,curd cheese,pasta,baking powder,bottled water,berries,kitchen utensil,cream,hygiene articles,candles,dessert,frozen meals,whipped/sour cream,frozen vegetables,white bread,ice cream,canned fish,soups,preservation products,herbs,zwieback,tidbits,spread cheese,liquor (appetizer),cleaner,curd,beverages,rice,mustard,sliced cheese,liver loaf,shopping bags,white wine,canned fruit,ready soups,frozen dessert,processed cheese,vinegar,salad dressing
9209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8675,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2385,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
raw_data.shape

(9835, 169)

In [5]:
# create copy ofr raw_data
raw_data_copy = raw_data.copy()

### Top 10 products

In [6]:
# Top 10 products
raw_data.sum().nlargest(10)

whole milk          2513
other vegetables    1903
rolls/buns          1809
soda                1715
yogurt              1372
bottled water       1087
root vegetables     1072
tropical fruit      1032
shopping bags        969
sausage              924
dtype: int64

### Lowest 10 Products

In [7]:
# Lowest 10 Products
raw_data.sum().nsmallest(10)

baby food                1
sound storage medium     1
preservation products    2
bags                     4
kitchen utensil          4
baby cosmetics           6
frozen chicken           6
toilet cleaner           7
make up remover          8
whisky                   8
dtype: int64

### finding items which makes 80% of the sales

In [8]:
# finding items which makes 80% of the sales
total_purchase = raw_data.sum().sum()
item_sale_cumulative = raw_data.sum().div(total_purchase).mul(100).sort_values(ascending=False).cumsum()
item_sale_cumulative.head(3)

whole milk           5.794729
other vegetables    10.182858
rolls/buns          14.354232
dtype: float64

In [9]:
sales_eighty_per = item_sale_cumulative[item_sale_cumulative <= 80]
sales_eighty_per.tail(5)

butter milk      77.383725
specialty bar    78.004012
beverages        78.594323
ham              79.184633
meat             79.770332
dtype: float64

In [10]:
print("There are {} items which account for 80% of the sales".format(sales_eighty_per.shape[0]))
unique_item_per = round(sales_eighty_per.shape[0]/len(raw_data.columns),4)*100
print("Only {}% out of item all the items  account for 80% of the sales".format(unique_item_per))

There are 53 items which account for 80% of the sales
Only 31.36% out of item all the items  account for 80% of the sales


### To reduce the size of the dataset we will only consider the transactions with these 53 items which account for 80% of sales

In [11]:
# consider only 53 items for further analysis
subset = raw_data[sales_eighty_per.index].copy()

# remove rows all zeros value
subset = subset.loc[subset.sum(axis=1)!=0,:].copy()

subset.sample(3)

Unnamed: 0,whole milk,other vegetables,rolls/buns,soda,yogurt,bottled water,root vegetables,tropical fruit,shopping bags,sausage,pastry,citrus fruit,bottled beer,newspapers,canned beer,pip fruit,fruit/vegetable juice,whipped/sour cream,brown bread,domestic eggs,frankfurter,margarine,coffee,pork,butter,curd,beef,napkins,chocolate,frozen vegetables,chicken,white bread,cream cheese,waffles,salty snack,long life bakery product,dessert,sugar,UHT-milk,berries,hamburger meat,hygiene articles,onions,specialty chocolate,candy,misc. beverages,frozen meals,oil,butter milk,specialty bar,beverages,ham,meat
9161,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4276,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7654,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
print("Orignal Transaction dataset shape {}".format(raw_data.shape))
print("Reduce Transaction dataset shape {}".format(subset.shape))

Orignal Transaction dataset shape (9835, 169)
Reduce Transaction dataset shape (9372, 53)


### Creating Cross Sell Grid Matrix where row is Primary Product and columns is other products purhcase with primary

In [14]:
# creating empty cross sell matrix 
cross_sell_grid = pd.DataFrame(index=subset.columns,columns=subset.columns,data=0)

# add primary count for the count of reference product considered/index
cross_sell_grid.insert(loc = 0, column='Primary_Count', value=0)

# filling the cross sell matrix
for item in subset.columns:
    sales = subset[subset[item]==1]
    
    # adding count for reference product
    cross_sell_grid.at[item,'Primary_Count'] = sales[item].sum()
    
    # adding count for all other products i.e. secondary product
    count = sales.sum()
    cross_sell_grid.loc[item,count.index] = count
    
    # making column of primary product 0 as it is already counted in Primary_Count column
    cross_sell_grid.at[item,item] = 0

In [15]:
cross_sell_grid

Unnamed: 0,Primary_Count,whole milk,other vegetables,rolls/buns,soda,yogurt,bottled water,root vegetables,tropical fruit,shopping bags,sausage,pastry,citrus fruit,bottled beer,newspapers,canned beer,pip fruit,fruit/vegetable juice,whipped/sour cream,brown bread,domestic eggs,frankfurter,margarine,coffee,pork,butter,curd,beef,napkins,chocolate,frozen vegetables,chicken,white bread,cream cheese,waffles,salty snack,long life bakery product,dessert,sugar,UHT-milk,berries,hamburger meat,hygiene articles,onions,specialty chocolate,candy,misc. beverages,frozen meals,oil,butter milk,specialty bar,beverages,ham,meat
whole milk,2513,0,736,557,394,551,338,481,416,241,294,327,300,201,269,87,296,262,317,248,295,202,238,184,218,271,257,209,194,164,201,173,168,162,125,110,133,135,148,39,116,145,126,119,79,81,69,97,111,114,64,67,113,98
other vegetables,1903,736,0,419,322,427,244,466,353,228,265,222,284,159,190,89,257,207,284,184,219,162,194,132,213,197,169,194,142,125,175,176,135,135,99,106,105,114,106,80,101,136,94,140,60,68,55,74,98,102,55,51,90,98
rolls/buns,1809,557,419,0,377,338,238,239,242,192,301,206,165,134,194,111,137,143,144,124,154,189,145,108,111,132,99,134,115,116,100,95,64,98,90,49,78,67,69,63,65,85,58,67,55,70,46,48,50,75,55,53,68,68
soda,1715,394,322,377,0,269,285,183,205,242,239,207,126,167,144,136,131,181,114,124,122,111,100,98,117,87,80,80,118,133,85,82,101,67,94,92,75,97,72,75,72,57,69,52,62,85,72,61,46,45,71,47,49,54
yogurt,1372,551,427,338,269,0,226,254,288,150,193,174,213,91,151,53,177,184,204,143,141,110,140,96,94,144,170,115,121,91,122,82,89,122,74,61,86,97,68,73,104,64,72,71,47,54,44,61,52,84,44,54,66,52
bottled water,1087,338,244,238,285,226,0,154,182,108,118,88,133,155,111,79,104,140,86,81,90,72,101,72,73,88,60,61,85,57,61,52,43,58,42,43,42,51,47,72,40,29,56,58,39,36,52,31,41,37,27,30,29,30
root vegetables,1072,481,466,239,183,254,154,0,207,126,147,108,174,95,113,40,153,118,168,100,141,100,109,72,134,127,107,171,98,63,114,107,78,74,65,49,52,57,63,48,65,61,53,93,33,42,30,38,69,50,21,31,36,50
tropical fruit,1032,416,353,242,205,288,182,207,0,133,137,130,196,81,116,30,201,135,136,105,112,93,92,70,84,98,101,75,99,80,86,63,86,71,60,55,62,62,47,49,66,42,66,56,33,53,45,54,46,54,32,42,53,33
shopping bags,969,241,228,192,242,150,108,126,133,0,154,117,96,55,68,112,92,105,78,91,89,81,56,92,63,49,53,48,71,80,41,49,73,55,54,59,53,61,35,46,49,39,51,42,47,44,42,46,30,38,34,30,38,33
sausage,924,294,265,301,239,193,118,147,137,154,0,123,111,77,79,62,106,99,89,105,94,99,70,68,64,85,75,55,66,65,59,52,71,55,49,52,53,58,36,34,49,51,43,36,24,36,22,37,40,36,23,27,49,52


# Naive cross sell recommendations basis cross sell product grid count

In [18]:
# function for getting cross sell recommendations basis cross grid count
def cross_sell_recommend(cross_sell_grid, primary_item):
    primary = cross_sell_grid.loc[primary_item,:].copy()
    primary.drop('Primary_Count',inplace=True)
    xsell_recommend = primary[primary > 0].nlargest(3).copy()
    return xsell_recommend

In [19]:
# get cross sell recommendations for soda
xsell_recommend = cross_sell_recommend(cross_sell_grid,'soda')
xsell_recommend

whole milk          394
rolls/buns          377
other vegetables    322
Name: soda, dtype: int64

In [24]:
# get cross sell recommendations for other candy
xsell_recommend = cross_sell_recommend(cross_sell_grid,'candy')
xsell_recommend

soda          85
whole milk    81
rolls/buns    70
Name: candy, dtype: int64

### Association Rule Mining using FP Growth

In [25]:
# prepraring data for FP Growth : Getting Items purchased sequence for each transactions
subset.columns[subset.loc[0,:] > 0]

Index(['citrus fruit', 'margarine'], dtype='object')

In [26]:
transactions = []
for idx,row in subset.iterrows():
    item_purchase = subset.columns[row > 0].tolist()
    transactions.append(item_purchase)  

In [27]:
# Setting mimium support threshold to be 2% of the transactions
support_threshold_per = 0.5
support_threshold_count = int(support_threshold_per*subset.shape[0]/100)
support_threshold_count

46

In [28]:
patterns = pyfpgrowth.find_frequent_patterns(transactions, support_threshold_count)
frequent_pattern = pd.DataFrame.from_dict(patterns,
                                          orient='index',
                                          columns=['Support_Count']).sort_values(by='Support_Count',ascending=False)
frequent_pattern.head(10)

Unnamed: 0,Support_Count
"(whole milk,)",2513
"(other vegetables,)",1903
"(other vegetables, whole milk)",736
"(rolls/buns, whole milk)",557
"(whole milk, yogurt)",551
"(root vegetables, whole milk)",481
"(other vegetables, root vegetables)",466
"(other vegetables, yogurt)",427
"(other vegetables, rolls/buns)",419
"(tropical fruit, whole milk)",416


From Frequent Pattern we can see that (other vegetables, whole milk) are the most frequent product brought together

In [29]:
frequent_pattern.shape

(869, 1)

In [30]:
confidence_threshold = 0.6
rules = pyfpgrowth.generate_association_rules(patterns, confidence_threshold)
association_rule = pd.DataFrame.from_dict(rules,orient='index',
                                          columns=['Secondary_Product',
                                                   'Confidence']).sort_values(by='Confidence',ascending=False)
association_rule

Unnamed: 0,Secondary_Product,Confidence
"(curd, domestic eggs)","(whole milk,)",0.734375
"(butter, curd)","(whole milk,)",0.716418
"(other vegetables, pip fruit, root vegetables)","(whole milk,)",0.675
"(rolls/buns, root vegetables, yogurt)","(whole milk,)",0.647887
"(citrus fruit, root vegetables, whole milk)","(other vegetables,)",0.633333
"(citrus fruit, other vegetables, yogurt)","(whole milk,)",0.626667
"(other vegetables, pip fruit, yogurt)","(whole milk,)",0.625
"(domestic eggs, pip fruit)","(whole milk,)",0.623529
"(curd, pip fruit)","(whole milk,)",0.623377
"(domestic eggs, margarine)","(whole milk,)",0.621951


There is around 73% chance that whole milk will be bought with curd and  domestic eggs. Hence it there can three strategy that can be used here:
1. Put Whole Milk near Curd and Domestic Eggs 
2. Bundle Whole Milk, Curd, Domestic Eggs as a single product offering
3. Put Whole Milk at the opposite end of Curd and Domestic Eggs so that Customer will be force to travel to the other end and in the process may some additional product