In [25]:
import numpy as np
import pandas as pd

In [26]:
# relative path of the groceries purchase log (CSV)
path = "Groceries_Dataset.csv"

# load the CSV into a DataFrame, then preview its first 15 rows
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=15)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
5,4941,14-02-2015,rolls/buns
6,4501,08-05-2015,other vegetables
7,3803,23-12-2015,pot plants
8,2762,20-03-2015,whole milk
9,4119,12-02-2015,tropical fruit


In [27]:
# count the distinct product names in 'itemDescription' (missing values are not counted)
unique_items_count = df['itemDescription'].nunique(dropna=True)
print(f"There are {unique_items_count} unique items in the dataset")

There are 167 unique items in the dataset


This dataset contains repeated 'Member_number' values, which means a single customer has bought multiple items,    
so by creating a transaction matrix (also known as a one-hot encoded matrix) we can see which items each customer purchased.       

In [28]:
# Build a customer x item table:
#   rows    -> each unique customer id ('Member_number')
#   columns -> each unique item ('itemDescription')
#   values  -> crosstab counts how many times the customer bought the item
#
# The counts are then collapsed to a one-hot encoding:
#   count > 0 -> 1 (purchased), count == 0 -> 0 (not purchased).
# This is done with a vectorized comparison instead of an element-wise
# Python lambda: it is much faster on a wide frame and does not depend on
# DataFrame.map, which only exists in pandas >= 2.1.
transaction_matrix = (
    pd.crosstab(df['Member_number'], df['itemDescription'])
    .gt(0)
    .astype('int64')
)
transaction_matrix

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1001,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4999,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0


In [29]:
# item x item co-occurrence counts: entry (i, j) is the number of customers
# whose one-hot row has both item i and item j set
occurrence = np.matmul(transaction_matrix.T.to_numpy(), transaction_matrix.to_numpy())

# an item paired with itself is not a useful pair, so zero the main diagonal
np.fill_diagonal(occurrence, 0)

# wrap the raw numpy array in a DataFrame labelled by item name on both axes
df_occurrence = pd.DataFrame(
    occurrence,
    index=transaction_matrix.columns,
    columns=transaction_matrix.columns,
)

df_occurrence

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
itemDescription,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Instant food products,0,4,0,0,0,0,1,0,9,3,...,3,2,5,8,0,8,3,31,17,2
UHT-milk,4,0,3,3,0,1,12,0,41,29,...,5,1,25,54,0,25,17,158,91,9
abrasive cleaner,0,3,0,0,0,0,0,0,4,2,...,1,0,1,4,0,3,1,9,9,0
artif. sweetener,0,3,0,0,1,0,0,0,3,4,...,3,0,2,4,0,2,0,18,4,0
baby cosmetics,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
white bread,8,25,3,2,0,0,9,2,49,36,...,10,4,26,84,1,0,25,186,111,8
white wine,3,17,1,0,0,0,4,2,26,16,...,3,5,14,29,1,25,0,92,64,2
whole milk,31,158,9,18,0,0,59,12,250,153,...,37,22,148,311,8,186,92,0,587,36
yogurt,17,91,9,4,1,2,46,4,150,95,...,20,11,92,187,2,111,64,587,0,22


How is this system going to work?   
Get the list of items a customer has purchased and look them up in the occurrence dataframe.   
Select the columns of the occurrence matrix that correspond to the purchased items.  
Sum those co-occurrence values for every other item and return the top 3 as the recommended items.   
   

difference between lift and confidence method:
imagine we have 1000 transactions (milk appears in 400, eggs appear in 300, eggs and milk together in 200)

confidence: "when customers buy milk, how often do they also buy eggs?" 
answer: P(eggs|milk) = count(milk & eggs) / count(milk) = 200/400 = 0.5    

lift: "are milk and eggs purchased together more often than random chance?" 
answer: P(eggs|milk) / P(eggs) = (200/400) / (300/1000) = 1.67

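lift > 1: buyers of item A are more likely than average to buy item B    
lift = 1: the two items are bought independently, so the purchase gives no basis for a recommendation    
lift < 1: buyers of item A are less likely than average to buy item B    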

In [30]:
def item_recommendation(purchased_item, df_occurrence, n=3, method='lift', transactions=None):
    """Recommend the items that co-occur most strongly with a customer's purchases.

    Parameters
    ----------
    purchased_item : str or iterable of str
        Name(s) of the item(s) the customer already bought. Every name must be
        a column label of ``df_occurrence``. A single name may be passed as a
        plain string.
    df_occurrence : pandas.DataFrame
        Item x item co-occurrence counts, labelled by item name on both axes.
    n : int, default 3
        Number of recommendations to return.
    method : str, default 'lift'
        ``'lift'`` divides each co-occurrence sum by the co-occurrence expected
        if the items were bought independently. Any other value (including
        ``'confidence'``) ranks by the raw co-occurrence sums: confidence would
        divide every candidate by the same count of the purchased items, which
        cannot change the ordering.
    transactions : pandas.DataFrame, optional
        One-hot customer x item matrix used to compute item popularity for the
        lift score. When omitted, the notebook-level ``transaction_matrix`` is
        used, which keeps existing calls working.

    Returns
    -------
    list
        Up to ``n`` item labels, best score first, never containing an item
        listed in ``purchased_item``.
    """
    # accept a bare item name as well as a collection of names; a list is
    # required so that pandas selects columns instead of looking up one key
    if isinstance(purchased_item, str):
        purchased_item = [purchased_item]
    else:
        purchased_item = list(purchased_item)

    # for every item, sum its co-occurrence counts with the purchased items
    score = df_occurrence[purchased_item].sum(axis=1)

    if method == 'lift':
        if transactions is None:
            # backward-compatible fallback to the notebook-level matrix
            transactions = transaction_matrix

        item_popularity = transactions.sum(axis=0)    # total purchases per item

        # (item X popularity) * (purchased item popularity) / total transactions
        expected_occurrence = (
            item_popularity * item_popularity[purchased_item].sum() / len(transactions)
        )

        # "1e-10" prevents division by zero
        score = score / (expected_occurrence + 1e-10)

    # exclude items the customer has already bought
    score = score[~score.index.isin(purchased_item)]

    # labels of the n highest-scoring items
    return score.nlargest(n).index.tolist()

In [31]:
# Example 1: Customer bought 'baby cosmetics'
# (the printed label now names the item that is actually passed to the function)
print("\nRecommendations for customers who bought 'baby cosmetics':")
print(item_recommendation(['baby cosmetics'], df_occurrence))

# Example 2: Customer bought multiple items
print("\nRecommendations for customers who bought 'whole milk' and 'tropical fruit':")
print(item_recommendation(['whole milk', 'tropical fruit'], df_occurrence))


Recommendations for customers who bought 'whole milk':
['skin care', 'artif. sweetener', 'cake bar']

Recommendations for customers who bought 'whole milk' and 'tropical fruit':
['whisky', 'bathroom cleaner', 'meat spreads']


Resources:  
converting to transaction matrix:   
https://stackoverflow.com/questions/20574257/constructing-a-co-occurrence-matrix-in-python-pandas   

lift and confidence method:     
https://www.thedataschool.co.uk/liu-zhang/understanding-lift-for-market-basket-analysis/    