In [166]:
import os
from collections import Counter
from itertools import product

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pyvis
from matplotlib import pyplot as plt

In [167]:
# Remove the cap on how many columns pandas shows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [168]:
# Working directory of the running kernel; the data file paths are built from it.
current_directory = os.getcwd()

In [169]:
# Load the SKU table and the transaction table from the data/ folder located
# under the working directory. Both files are read as comma-separated.
skus_data = pd.read_csv(f"{current_directory}/data/skus.csv", delimiter = ",")
transactions_data = pd.read_csv(f"{current_directory}/data/transactions.csv", delimiter = ",")

# Hierarchical categories

In [170]:
# skus_data_plot = skus_data[(skus_data["DEPARTMENT"]=="Services")]
# hierarchical_categories = skus_data_plot[["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"]].drop_duplicates()
# fig = px.treemap(hierarchical_categories, path=["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"])
# fig.write_image(f"{current_directory}/data/services_department.png")
# fig.show()


In [171]:
# skus_data_plot = skus_data[(skus_data["DEPARTMENT"]=="Weighed")]
# hierarchical_categories = skus_data_plot[["DEPARTMENT", "CATEGORY"]].drop_duplicates()
# fig = px.treemap(hierarchical_categories, path=["DEPARTMENT", "CATEGORY"])
# fig.write_image(f"{current_directory}/data/weighed_department.png")
# fig.show()


In [172]:
# skus_data_plot = skus_data[(skus_data["DEPARTMENT"]=="Sm.Animal")]
# hierarchical_categories = skus_data_plot[["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"]].drop_duplicates()
# fig = px.treemap(hierarchical_categories, path=["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"])
# fig.write_image(f"{current_directory}/data/Sm.Animal_department.png")
# fig.show()

# Creation of a cluster name (CLUSTER_NAME) for each SKU

In [173]:
def create_cluster_id(row):
    """Return the cluster label for one SKU row.

    The label is the row's DEPARTMENT, with two exceptions: "Weighed" rows
    use their CATEGORY and "Services" rows use their SUBCATEGORY1. Whatever
    label results, "Bird", "Wildbird" and "Dom.Bird" are all reported as
    "Bird".

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys "DEPARTMENT", "CATEGORY" and
        "SUBCATEGORY1" (e.g. a DataFrame row).

    Returns
    -------
    The cluster label for the row.
    """
    department = row["DEPARTMENT"]
    category = row["CATEGORY"]
    subcategory_1 = row["SUBCATEGORY1"]

    if department == "Services":
        label = subcategory_1
    elif department == "Weighed":
        label = category
    else:
        label = department

    # All bird-related labels are merged into a single cluster.
    bird_aliases = ("Bird", "Wildbird", "Dom.Bird")
    return "Bird" if label in bird_aliases else label

# Label every SKU with its cluster by applying create_cluster_id to each row.
# The function is passed directly; no placeholder column or lambda wrapper is
# needed because the assignment creates the column.
skus_data["CLUSTER_NAME"] = skus_data.apply(create_cluster_id, axis=1)


In [174]:
# List the distinct cluster labels that were assigned to the SKUs.
skus_data["CLUSTER_NAME"].unique()

array(['Dog', 'Cat', 'Bird', 'Misc.', 'Sm.Animal', 'Pond', 'Reptile'],
      dtype=object)

# Merge SKUs and transactions

In [175]:
# Left-join the SKU attributes (including CLUSTER_NAME) onto every transaction
# row, matching on SKU; transaction rows without a matching SKU are kept.
# NOTE(review): transactions_data is rebound to the merged frame, so re-running
# this cell would merge the SKU columns a second time — run it once per kernel.
transactions_data = transactions_data.merge(skus_data, on = "SKU", how = "left")

# Data cleaning
### Remove Misc. department

In [176]:
# Report how much of the data belongs to the "Misc." department before dropping
# it: first the share of transaction rows, then the share of revenue.
is_misc = transactions_data["DEPARTMENT"] == "Misc."
misc_rows = transactions_data[is_misc]
print(len(misc_rows) / len(transactions_data))
print(misc_rows["REVENUE"].sum() / transactions_data["REVENUE"].sum())

# Drop the "Misc." department from both tables.
transactions_data = transactions_data[~is_misc]
skus_data = skus_data[skus_data["DEPARTMENT"] != "Misc."]

0.1204745346696666
0.0021075037906296847



### Small Animals


In [177]:
# Share of transaction rows and share of revenue that belong to "Sm.Animal".
small_animal_rows = transactions_data[transactions_data["DEPARTMENT"] == "Sm.Animal"]
(
    len(small_animal_rows) / len(transactions_data),
    small_animal_rows["REVENUE"].sum() / transactions_data["REVENUE"].sum(),
)

(0.03813953488372093, np.float64(0.03252841760760317))

# SKU analysis
### Aggregate per sku and count total number of transactions and revenue generated

In [178]:
# One row per SKU: number of transaction rows and total revenue for that SKU.
item_purchases = transactions_data.groupby("SKU", as_index=False).agg(
    SKU_TOTAL_TRANSACTION=("TRANSACTION_ID", "count"),
    SKU_TOTAL_REVENUE=("REVENUE", "sum"),
)

In [179]:
# One row per cluster: number of transaction rows and total revenue for that cluster.
clusters_statistics = transactions_data.groupby("CLUSTER_NAME", as_index=False).agg(
    CLUSTER_TOTAL_TRANSACTION=("TRANSACTION_ID", "count"),
    CLUSTER_TOTAL_REVENUE=("REVENUE", "sum"),
)

In [180]:
# Attach the per-SKU totals, then the totals of the SKU's cluster, to each SKU.
# Left joins keep SKUs that never appear in a transaction (their totals are NaN).
skus_data = (
    skus_data
    .merge(item_purchases, how="left", on="SKU")
    .merge(clusters_statistics, how="left", on="CLUSTER_NAME")
)

In [181]:
# SUPPORT: the SKU's share of all counted transaction rows.
# SUPPORT_CLUSTER: the SKU's share of the transaction rows of its own cluster.
sku_transactions = skus_data["SKU_TOTAL_TRANSACTION"]
skus_data["SUPPORT"] = sku_transactions.div(sku_transactions.sum())
skus_data["SUPPORT_CLUSTER"] = sku_transactions.div(skus_data["CLUSTER_TOTAL_TRANSACTION"])

In [182]:
# transactions_data = transactions_data.merge(item_purchases, on = "SKU", how = "left")
# transactions_data = transactions_data.merge(clusters_statistics, on = "CLUSTER_NAME", how = "left")

# Model implementation

### Count number of times a pair of items is bought together

In [183]:
# Create all possible pairs
# Every ordered (SKU_1, SKU_2) combination of the known SKUs, in sorted order.
pairs = list(product(skus_data["SKU"].sort_values(), repeat=2))

# One row per ordered pair, with a co-purchase counter starting at zero.
pairs_df = pd.DataFrame(pairs, columns=["SKU_1", "SKU_2"]).assign(TRANSACTION_COUNT=0)

# A SKU paired with itself carries no information, so those rows are dropped.
# Both (a, b) and (b, a) are kept.
pairs_df = pairs_df.loc[pairs_df["SKU_1"].ne(pairs_df["SKU_2"])]

In [184]:
# Collect the distinct SKUs bought in each transaction.
skus_per_transaction = (
    transactions_data
    .groupby(["TRANSACTION_ID"])
    .agg({"SKU": lambda skus: set(skus)})
    .reset_index()
)

# Store each basket as a sorted list so the pair order is deterministic.
skus_per_transaction["SKU"] = skus_per_transaction["SKU"].map(sorted)

# All ordered pairs within each basket (self-pairs included).
skus_per_transaction["AllPairs"] = skus_per_transaction["SKU"].map(
    lambda basket: list(product(basket, repeat=2))
)

In [185]:
# Count, in a single pass, how many baskets contain each ordered (SKU_1, SKU_2)
# pair. This replaces a full boolean scan of pairs_df for every pair of every
# transaction with one dictionary lookup per row of pairs_df.
pair_counts = Counter(
    pair
    for basket_pairs in skus_per_transaction["AllPairs"]
    for pair in basket_pairs
)

# Pairs never bought together get 0 (Counter returns 0 for missing keys).
# Self-pairs and pairs involving SKUs absent from pairs_df are counted above
# but never looked up, so they are ignored. The column is assigned rather than
# incremented, which makes re-running this cell safe.
pairs_df["TRANSACTION_COUNT"] = [
    pair_counts[pair] for pair in zip(pairs_df["SKU_1"], pairs_df["SKU_2"])
]

In [186]:
# item_purchases1 = item_purchases.copy()
# item_purchases1.columns = [str(x)+"_1" for x in item_purchases1.columns]

# item_purchases2 = item_purchases.copy()
# item_purchases2.columns = [str(x)+"_2" for x in item_purchases2.columns]

# pairs_df = pairs_df.merge(item_purchases1, on = "SKU_1", how = "left")
# pairs_df = pairs_df.merge(item_purchases2, on = "SKU_2", how = "left")

In [187]:
# Two renamed copies of the SKU table: every column gets a "_1" / "_2" suffix
# so the attributes of both sides of a pair can sit in the same row.
skus_data1 = skus_data.add_suffix("_1")
skus_data2 = skus_data.add_suffix("_2")

# Attach the attributes of SKU_1, then those of SKU_2, to each pair.
pairs_df = (
    pairs_df
    .merge(skus_data1, how="left", on="SKU_1")
    .merge(skus_data2, how="left", on="SKU_2")
)

In [188]:
# pairs_df["RATIO_1"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.SKU_TOTAL_TRANSACTION_1 if x.SKU_TOTAL_TRANSACTION_1 != 0 else 0, axis = 1)
# pairs_df["RATIO_2"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.SKU_TOTAL_TRANSACTION_2 if x.SKU_TOTAL_TRANSACTION_2 != 0 else 0, axis = 1)
# pairs_df["MAX_RATIO"] = pairs_df.apply(lambda x : max(x.RATIO_1, x.RATIO_2), axis = 1)

# Manual thresholds

In [189]:
def filter_nbre_transactions(df, threshold_transactions):
    """Keep the pairs whose two SKUs each reach a minimum transaction count.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with the columns "SKU_TOTAL_TRANSACTION_1" and
        "SKU_TOTAL_TRANSACTION_2".
    threshold_transactions : number
        Minimum (inclusive) transaction count required on both sides.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` where both counts are >= the threshold. Rows with
        a missing count on either side are dropped, since NaN fails the
        comparison. The input frame is not modified.
    """
    first_is_frequent = df["SKU_TOTAL_TRANSACTION_1"] >= threshold_transactions
    second_is_frequent = df["SKU_TOTAL_TRANSACTION_2"] >= threshold_transactions
    return df[first_is_frequent & second_is_frequent]

In [190]:
# pairs_df_filtred = filter_nbre_transactions(pairs_df, 5)
# tt = pairs_df_filtred[(pairs_df_filtred["MAX_RATIO"]>=0.05)]
# tt = tt[(tt["TRANSACTION_COUNT"]>=2)]

# tt["SKU_1"].nunique()

# ll = tt.groupby(["SKU_1"]).agg(
#     nbre_halo_effect = ("SKU_2", "count")
# ).reset_index()

# ll["nbre_halo_effect"].value_counts().sort_index()

### Same cluster?

In [191]:
# 1 when both SKUs of the pair belong to the same cluster, 0 otherwise.
# Vectorized column comparison instead of a row-by-row apply; a missing cluster
# on either side compares as "not equal" and therefore yields 0.
pairs_df["SAME_CLUSTER"] = (
    pairs_df["CLUSTER_NAME_1"] == pairs_df["CLUSTER_NAME_2"]
).astype("int64")

# Association-rule metrics (Apriori-style): confidence, lift and conviction

In [192]:
# Metrics for the rule SKU_1 -> SKU_2:
#   CONFIDENCE = co-purchase count / transaction count of SKU_1
#   LIFT       = CONFIDENCE / support of SKU_2 within its cluster
#   CONVICTION = (1 - support of SKU_2 within its cluster) / (1 - CONFIDENCE)
# NOTE(review): a CONFIDENCE of exactly 1 makes the CONVICTION denominator zero;
# presumably that yields inf (or NaN) — confirm this is acceptable downstream.
confidence = pairs_df["TRANSACTION_COUNT"].div(pairs_df["SKU_TOTAL_TRANSACTION_1"])
consequent_support = pairs_df["SUPPORT_CLUSTER_2"]

pairs_df["CONFIDENCE"] = confidence
pairs_df["LIFT"] = confidence.div(consequent_support)
pairs_df["CONVICTION"] = (1 - consequent_support).div(1 - confidence)

In [193]:
# Start from pairs whose two SKUs each have at least 5 transactions.
frequent_pairs = filter_nbre_transactions(pairs_df, 5)

# A pair is kept only if it satisfies every rule-quality condition below.
passes_all_thresholds = (
    (frequent_pairs["LIFT"] > 1.0)
    & (frequent_pairs["CONFIDENCE"] > 0.1)
    & (frequent_pairs["TRANSACTION_COUNT"] >= 2)
    & (frequent_pairs["CONVICTION"] >= 1.10)
    & (frequent_pairs["SAME_CLUSTER"] == 1)
)
pairs_df_filtred = frequent_pairs[passes_all_thresholds]

In [194]:
# Display the pairs that remain after filtering.
pairs_df_filtred

Unnamed: 0,SKU_1,SKU_2,TRANSACTION_COUNT,ITEM_DESCRIPTION_1,DEPARTMENT_1,CATEGORY_1,SUBCATEGORY1_1,SUBCATEGORY2_1,BRAND_1,ITEM_WEIGHT_1,ITEM_UNIT_OF_MEASURE_1,CLUSTER_NAME_1,SKU_TOTAL_TRANSACTION_1,SKU_TOTAL_REVENUE_1,CLUSTER_TOTAL_TRANSACTION_1,CLUSTER_TOTAL_REVENUE_1,SUPPORT_1,SUPPORT_CLUSTER_1,ITEM_DESCRIPTION_2,DEPARTMENT_2,CATEGORY_2,SUBCATEGORY1_2,SUBCATEGORY2_2,BRAND_2,ITEM_WEIGHT_2,ITEM_UNIT_OF_MEASURE_2,CLUSTER_NAME_2,SKU_TOTAL_TRANSACTION_2,SKU_TOTAL_REVENUE_2,CLUSTER_TOTAL_TRANSACTION_2,CLUSTER_TOTAL_REVENUE_2,SUPPORT_2,SUPPORT_CLUSTER_2,SAME_CLUSTER,CONFIDENCE,LIFT,CONVICTION
985,9515,9517,2,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,17.0,134.4005,3398,15333.3354,0.003953,0.005003,Frozen F/F Chicken,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,5.0,53.9400,3398,15333.3354,0.001163,0.001471,1,0.117647,79.952941,1.131666
986,9515,9518,4,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,17.0,134.4005,3398,15333.3354,0.003953,0.005003,Frozen F/F Lamb,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,8.0,72.8190,3398,15333.3354,0.001860,0.002354,1,0.235294,99.941176,1.304614
1092,9515,36027,3,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,17.0,134.4005,3398,15333.3354,0.003953,0.005003,Dog Biscuits,Weighed,Dog,Biscuits,Na,Weighed,,,Dog,90.0,242.2891,3398,15333.3354,0.020930,0.026486,1,0.176471,6.662745,1.182124
1344,9515,56439,2,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,17.0,134.4005,3398,15333.3354,0.003953,0.005003,Rabbit Ears Packed,Dog,Treat,Natural,Other,T. Forrest,0.180,KG,Dog,7.0,59.9336,3398,15333.3354,0.001628,0.002060,1,0.117647,57.109244,1.130999
1966,9517,9515,2,Frozen F/F Chicken,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,5.0,53.9400,3398,15333.3354,0.001163,0.001471,Frozen F/F Beef,Dog,Food,Frozen,Adult,Natures Menu,2.000,KG,Dog,17.0,134.4005,3398,15333.3354,0.003953,0.005003,1,0.400000,79.952941,1.658328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241514,66971,63417,2,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,17.0,19.8389,3398,15333.3354,0.003953,0.005003,Salmon Mince,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,17.0,47.4350,3398,15333.3354,0.003953,0.005003,1,0.117647,23.515571,1.127663
241515,66971,63418,2,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,17.0,19.8389,3398,15333.3354,0.003953,0.005003,Turkey Mince,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,34.0,63.0080,3398,15333.3354,0.007907,0.010006,1,0.117647,11.757785,1.121993
241541,66971,65783,2,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,17.0,19.8389,3398,15333.3354,0.003953,0.005003,Green Beef Tripe,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,25.0,44.7502,3398,15333.3354,0.005814,0.007357,1,0.117647,15.990588,1.124995
241542,66971,65784,3,Turkey Goose N Cranberry,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,17.0,19.8389,3398,15333.3354,0.003953,0.005003,Pork Mince,Dog,Food,Frozen,Adult,Raw And Simple,0.454,KG,Dog,10.0,14.5885,3398,15333.3354,0.002326,0.002943,1,0.176471,59.964706,1.210712


In [195]:
# For each SKU_1, keep at most its three partner SKUs with the highest LIFT.
pairs_ranked_by_lift = pairs_df_filtred.sort_values(
    by=["SKU_1", "LIFT"],
    ascending=[True, False],
)
pairs_df_top3 = pairs_ranked_by_lift.groupby("SKU_1").head(3).reset_index(drop=True)

In [196]:
# How many SKU_1 values have 1, 2 or 3 retained partner SKUs.
partners_per_sku = pairs_df_top3.groupby("SKU_1").agg({"SKU_2": "count"}).reset_index()
partners_per_sku.value_counts("SKU_2")

SKU_2
1    71
3    62
2    56
Name: count, dtype: int64

In [197]:
# Share of total revenue generated by SKUs that appear as SKU_1 in a retained
# pair, together with the number of such SKUs.
antecedent_skus = pairs_df_filtred["SKU_1"].unique()
covered_purchases = item_purchases[item_purchases["SKU"].isin(antecedent_skus)]
(
    covered_purchases["SKU_TOTAL_REVENUE"].sum() / item_purchases["SKU_TOTAL_REVENUE"].sum(),
    pairs_df_filtred["SKU_1"].nunique(),
)

(np.float64(0.4392669222525312), 189)

# Halo effect degree n+1