In [57]:
import pandas as pd
import numpy as np
import os
import pyvis
from pyvis.network import Network
import networkx as nx
from matplotlib import pyplot as plt
from itertools import product
import plotly.graph_objects as go
import plotly.express as px
import project_code.utils as ut
from project_code.preprocessing_classes import DataExploration

In [58]:
pd.set_option('display.max_columns', None)

In [59]:
# Run the project's preprocessing pipeline; it returns two objects that the rest
# of the notebook uses as the SKU table and the transaction table.
# NOTE(review): later cells read CLUSTER_NAME, SKU_TOTAL_TRANSACTION,
# SKU_TOTAL_REVENUE and ITEM_DESCRIPTION columns that are never created in this
# notebook, so the pipeline presumably provides them - confirm.
data_exploration = DataExploration()
skus_data, transactions_data = data_exploration.run_data_preprocessing_piepeline()

# Hierarchical categories

In [60]:
# skus_data_plot = skus_data[(skus_data["DEPARTMENT"]=="Services")]
# hierarchical_categories = skus_data_plot[["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"]].drop_duplicates()
# fig = px.treemap(hierarchical_categories, path=["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"])
# fig.write_image(f"{ut.CURRENT_DIRECTORY}/data/services_department.png")
# fig.show()


In [61]:
# skus_data_plot = skus_data[(skus_data["DEPARTMENT"]=="Weighed")]
# hierarchical_categories = skus_data_plot[["DEPARTMENT", "CATEGORY"]].drop_duplicates()
# fig = px.treemap(hierarchical_categories, path=["DEPARTMENT", "CATEGORY"])
# fig.write_image(f"{ut.CURRENT_DIRECTORY}/data/weighed_department.png")
# fig.show()


In [62]:
# skus_data_plot = skus_data[(skus_data["DEPARTMENT"]=="Sm.Animal")]
# hierarchical_categories = skus_data_plot[["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"]].drop_duplicates()
# fig = px.treemap(hierarchical_categories, path=["DEPARTMENT", "CATEGORY", "SUBCATEGORY1"])
# fig.write_image(f"{ut.CURRENT_DIRECTORY}/data/Sm.Animal_department.png")
# fig.show()

# Creation of a department ID

In [63]:
# def create_cluster_id(self, row : pd.Series) -> str:
#     """ 
#     Create a cluster name for every item given Department, Category, Subcategory1, etc.
#     """
#     department_name = row["DEPARTMENT"]
#     category_name = row["CATEGORY"]
#     subcategory_1_name = row["SUBCATEGORY1"]

#     cluster_name = department_name
#     if department_name == "Weighed":
#         cluster_name = category_name
#     if department_name == "Services":
#         cluster_name = subcategory_1_name
#     if cluster_name == "Bird" or cluster_name == "Wildbird" or cluster_name == "Dom.Bird":
#         cluster_name = "Bird"
#     return cluster_name

# skus_data["CLUSTER_NAME"] = ""
# skus_data["CLUSTER_NAME"] = skus_data.apply(lambda x : create_cluster_id(x), axis = 1)
# skus_data["CLUSTER_NAME"].unique()

# Merge SKUs and transactions

In [64]:
# data_exploration = DataExploration()
# transactions_data = data_exploration.merge_skus_transactions(skus_data, transactions_data)

# Data cleaning
### Remove Misc. department

In [65]:
def remove_department(df_skus : pd.DataFrame, df_transactions : pd.DataFrame, department_name : str = "Misc.") -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Remove one department from both datasets.

    Before filtering, print the share of transaction rows and the share of
    revenue the department accounts for. Both figures are plain ratios
    (not multiplied by 100).

    Input:
        - df_skus: SKU table with a "DEPARTMENT" column.
        - df_transactions: transaction table with "DEPARTMENT" and "REVENUE" columns.
        - department_name: department to drop (default "Misc.").
    Output:
        - (df_skus, df_transactions) without the rows of that department.
          The frames passed in are left untouched.
    """
    # Build the department mask once instead of re-filtering for every statistic.
    is_department = df_transactions["DEPARTMENT"] == department_name
    n_transactions = len(df_transactions)
    total_revenue = df_transactions["REVENUE"].sum()

    # An empty table (or a zero revenue total) has no meaningful share: report NaN
    # rather than failing on a division by zero.
    transaction_share = int(is_department.sum()) / n_transactions if n_transactions else float("nan")
    revenue_share = (
        df_transactions.loc[is_department, "REVENUE"].sum() / total_revenue
        if total_revenue
        else float("nan")
    )
    print(f"Percentage of transactions department {department_name} represents: {transaction_share}")
    print(f"Percentage of revenue department {department_name} represents: {revenue_share}")

    df_transactions = df_transactions[df_transactions["DEPARTMENT"] != department_name]
    df_skus = df_skus[df_skus["DEPARTMENT"] != department_name]
    return df_skus, df_transactions
skus_data, transactions_data = remove_department(skus_data, transactions_data)

Percentage of transactions department Misc. represents: 0.030647782679359182
Percentage of revenue department Misc. represents: 0.0021068021703166276



### Small Animals


In [66]:
# len(transactions_data[transactions_data["DEPARTMENT"]=="Sm.Animal"])/len(transactions_data), transactions_data[transactions_data["DEPARTMENT"]=="Sm.Animal"]["REVENUE"].sum()/transactions_data["REVENUE"].sum()

# SKU analysis
### Aggregate per sku and count total number of transactions and revenue generated

In [68]:
# def total_transactions_per_sku(df):
#     return df.groupby(["SKU"]).agg(
#         SKU_TOTAL_TRANSACTION = ("TRANSACTION_ID", "count"),
#         SKU_TOTAL_REVENUE = ("REVENUE", "sum")
#     ).reset_index()

# item_purchases = total_transactions_per_sku(transactions_data)

In [69]:
def total_transactions_per_cluster(df : pd.DataFrame) -> pd.DataFrame:
    """
    Summarise the transactions of every cluster.

    Output: one row per CLUSTER_NAME with
        - CLUSTER_TOTAL_TRANSACTION: count of TRANSACTION_ID values in the cluster,
        - CLUSTER_TOTAL_REVENUE: sum of REVENUE in the cluster.
    """
    cluster_aggregations = {
        "CLUSTER_TOTAL_TRANSACTION": pd.NamedAgg(column="TRANSACTION_ID", aggfunc="count"),
        "CLUSTER_TOTAL_REVENUE": pd.NamedAgg(column="REVENUE", aggfunc="sum"),
    }
    per_cluster = df.groupby(["CLUSTER_NAME"]).agg(**cluster_aggregations)
    return per_cluster.reset_index()

cluster_purchases = total_transactions_per_cluster(transactions_data)

In [70]:
def merge_sku_cluster_statistics(df_skus : pd.DataFrame, df_clusters : pd.DataFrame) -> pd.DataFrame:
    """
    Attach the cluster statistics to the SKUs dataset.

    Left join on "CLUSTER_NAME": every SKU row is kept, and SKUs whose cluster
    is absent from df_clusters get missing values in the added columns.
    """
    skus_with_cluster_stats = pd.merge(
        df_skus,
        df_clusters,
        how="left",
        on="CLUSTER_NAME",
    )
    return skus_with_cluster_stats

skus_data = merge_sku_cluster_statistics(skus_data, cluster_purchases)

In [71]:
def calculate_support_sku(df : pd.DataFrame) -> pd.DataFrame:
    """
    Add the Support metric of every item, relative to its own cluster.

    SUPPORT_CLUSTER = SKU_TOTAL_TRANSACTION / CLUSTER_TOTAL_TRANSACTION

    The new column is written onto the frame that is passed in, and that same
    frame is returned.
    """
    sku_transactions = df["SKU_TOTAL_TRANSACTION"]
    cluster_transactions = df["CLUSTER_TOTAL_TRANSACTION"]
    df["SUPPORT_CLUSTER"] = sku_transactions.div(cluster_transactions)
    return df
skus_data = calculate_support_sku(skus_data)

In [72]:
# transactions_data = transactions_data.merge(item_purchases, on = "SKU", how = "left")
# transactions_data = transactions_data.merge(clusters_statistics, on = "CLUSTER_NAME", how = "left")

# Model implementation

### Count number of times a pair of items is bought together

In [73]:
def create_all_possible_pairs_sku(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the table of every ordered pair of distinct SKUs.

    Columns: SKU_1, SKU_2 and TRANSACTION_COUNT (initialised to 0).
    Both (a, b) and (b, a) are present; rows where SKU_1 equals SKU_2 are dropped.
    The index is the row position in the full cartesian product, so it has gaps
    where the identical-SKU rows were removed.
    """
    ordered_skus = df["SKU"].sort_values()

    ## Cartesian product of the sorted SKUs with themselves
    every_combination = list(product(ordered_skus, repeat=2))
    pairs_df = pd.DataFrame(every_combination, columns=["SKU_1", "SKU_2"])
    pairs_df["TRANSACTION_COUNT"] = 0

    ## Keep only the pairs made of two different SKUs
    is_distinct_pair = pairs_df["SKU_1"].ne(pairs_df["SKU_2"])
    return pairs_df[is_distinct_pair]

pairs_df = create_all_possible_pairs_sku(skus_data)

In [74]:
def create_tuple_skus_sold_together(df : pd.DataFrame) -> pd.DataFrame:
    """
    List, for every basket, the pairs of SKUs it contains.

    Output: one row per TRANSACTION_ID with
        - SKU: sorted list of the distinct SKUs of the basket,
        - PAIR_SKUS: every ordered pair built from that list. Both (a, b) and
          (b, a) are present, as well as the self-pairs (a, a).
    """
    ## Collect the distinct SKUs of each basket
    baskets = (
        df.groupby(["TRANSACTION_ID"])
        .agg({"SKU": lambda basket_skus: set(basket_skus)})
        .reset_index()
    )

    ## Turn each set into a sorted list
    baskets["SKU"] = baskets["SKU"].map(sorted)

    ## Cartesian product of each basket with itself
    baskets["PAIR_SKUS"] = baskets["SKU"].map(
        lambda basket_skus: [(first, second) for first in basket_skus for second in basket_skus]
    )
    return baskets

skus_per_transaction = create_tuple_skus_sold_together(transactions_data)

In [75]:
def count_pair_purchase(df_pairs_sku : pd.DataFrame, df_transactions_skus : pd.DataFrame) -> pd.DataFrame:
    """
    Count the number of baskets in which every pair of SKUs was sold together.

    Input:
        - df_pairs_sku: all the possible pairs of SKUs (columns SKU_1, SKU_2 and
          TRANSACTION_COUNT).
        - df_transactions_skus: one row per transaction, with the pairs of SKUs
          of the basket in the column PAIR_SKUS.
    Output:
        - df_pairs_sku, with TRANSACTION_COUNT increased by the number of
          baskets containing the pair. The frame is updated in place and returned.

    Pairs found in a basket but absent from df_pairs_sku (for instance the
    self-pairs (a, a)) are ignored.
    """
    ## Tally every pair once, instead of scanning the whole pairs table for each
    ## pair of each basket.
    pair_counts = {}
    for basket_pairs in df_transactions_skus["PAIR_SKUS"]:
        for _pair in basket_pairs:
            pair_key = (_pair[0], _pair[1])
            pair_counts[pair_key] = pair_counts.get(pair_key, 0) + 1

    ## Look up the tally of every row of the pairs table (0 when never sold together)
    increments = np.asarray(
        [
            pair_counts.get(pair_key, 0)
            for pair_key in zip(df_pairs_sku["SKU_1"], df_pairs_sku["SKU_2"])
        ],
        dtype="int64",
    )

    ## Positional addition: does not depend on the index labels of the pairs table
    df_pairs_sku["TRANSACTION_COUNT"] = df_pairs_sku["TRANSACTION_COUNT"].to_numpy() + increments
    return df_pairs_sku

pairs_df = count_pair_purchase(pairs_df, skus_per_transaction)

In [78]:
def add_skus_data(df_pairs : pd.DataFrame, df_skus : pd.DataFrame) -> pd.DataFrame:
    """
    Enrich every pair with the SKU-level columns of both of its members.

    Every column of df_skus is added twice: once with the suffix "_1" (joined
    on SKU_1) and once with the suffix "_2" (joined on SKU_2). Both joins are
    left joins, so every pair is kept.
    """
    for suffix in ("_1", "_2"):
        ## Suffix every column, so that "SKU" becomes the join key "SKU_1" / "SKU_2"
        skus_suffixed = df_skus.add_suffix(suffix)
        df_pairs = df_pairs.merge(skus_suffixed, on = "SKU" + suffix, how = "left")

    return df_pairs

pairs_df = add_skus_data(pairs_df, skus_data)

In [23]:
# pairs_df["RATIO_1"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.SKU_TOTAL_TRANSACTION_1 if x.SKU_TOTAL_TRANSACTION_1 != 0 else 0, axis = 1)
# pairs_df["RATIO_2"] = pairs_df.apply(lambda x : x.TRANSACTION_COUNT / x.SKU_TOTAL_TRANSACTION_2 if x.SKU_TOTAL_TRANSACTION_2 != 0 else 0, axis = 1)
# pairs_df["MAX_RATIO"] = pairs_df.apply(lambda x : max(x.RATIO_1, x.RATIO_2), axis = 1)

# Manual thresholds

In [25]:
# pairs_df_filtred = filter_nbre_transactions(pairs_df, 5)
# tt = pairs_df_filtred[(pairs_df_filtred["MAX_RATIO"]>=0.05)]
# tt = tt[(tt["TRANSACTION_COUNT"]>=2)]

# tt["SKU_1"].nunique()

# ll = tt.groupby(["SKU_1"]).agg(
#     nbre_halo_effect = ("SKU_2", "count")
# ).reset_index()

# ll["nbre_halo_effect"].value_counts().sort_index()

### Same cluster?

In [81]:
def flag_same_cluster(df : pd.DataFrame) -> pd.DataFrame:
    """
    Flag whether the two SKUs of a pair belong to the same cluster.

    Adds the column SAME_CLUSTER: 1 when CLUSTER_NAME_1 equals CLUSTER_NAME_2,
    0 otherwise (a missing cluster name never counts as a match).
    The column is written onto the frame that is passed in, and that same
    frame is returned.
    """
    ## Column-wise comparison: avoids one Python call per row of the pairs table
    ## and also works on an empty frame.
    same_cluster = df["CLUSTER_NAME_1"] == df["CLUSTER_NAME_2"]

    ## fillna only matters for nullable dtypes, where comparing missing values yields <NA>
    df["SAME_CLUSTER"] = same_cluster.fillna(False).astype("int64")
    return df

pairs_df = flag_same_cluster(pairs_df)

# Apriori algorithm implementation

In [85]:
def generate_features(df : pd.DataFrame) -> pd.DataFrame:
    """
    Compute the association metrics of every pair (SKU_1 -> SKU_2):

    - CONFIDENCE = TRANSACTION_COUNT / SKU_TOTAL_TRANSACTION_1
    - LIFT       = CONFIDENCE / SUPPORT_CLUSTER_2
    - CONVICTION = (1 - SUPPORT_CLUSTER_2) / (1 - CONFIDENCE)

    The three columns are written onto the frame that is passed in, and that
    same frame is returned.
    """
    pair_count = df["TRANSACTION_COUNT"]
    df["CONFIDENCE"] = pair_count.div(df["SKU_TOTAL_TRANSACTION_1"])

    support_sku_2 = df["SUPPORT_CLUSTER_2"]
    df["LIFT"] = df["CONFIDENCE"].div(support_sku_2)

    ## rsub(1) is "1 - value"
    df["CONVICTION"] = support_sku_2.rsub(1).div(df["CONFIDENCE"].rsub(1))
    return df

pairs_df = generate_features(pairs_df)

In [83]:
def filter_nbre_transactions(df : pd.DataFrame, threshold_transactions : float) -> pd.DataFrame:
    """
    Keep the pairs whose two SKUs both reach a minimum number of transactions.

    Input:
        - df: pairs table with SKU_TOTAL_TRANSACTION_1 and SKU_TOTAL_TRANSACTION_2.
        - threshold_transactions: minimum number of transactions (a number, not
          a DataFrame) required for each SKU over the period covered by the data
          (the original note mentions the past 3 months).
    Output:
        - The rows of df where both totals are >= threshold_transactions.
          Rows with a missing total are dropped.
    """
    enough_sku_1 = df["SKU_TOTAL_TRANSACTION_1"] >= threshold_transactions
    enough_sku_2 = df["SKU_TOTAL_TRANSACTION_2"] >= threshold_transactions
    return df[enough_sku_1 & enough_sku_2]

In [144]:
# Minimum values a pair of SKUs must reach to be kept by correlation_skus_detection.
correlation_thresholds = {
    # Minimum SKU_TOTAL_TRANSACTION required for each of the two SKUs of the pair
    "threshold_number_transactions" : 5,
    # Minimum CONFIDENCE of the pair
    "confidence_threshold" : 0.1,
    # Minimum LIFT of the pair
    "lift_threshold" : 1.1,
    # Minimum TRANSACTION_COUNT of the pair (an absolute count, not a ratio)
    "support_pair_threshold" : 2,
    # Minimum CONVICTION of the pair
    "conviction_threshold" : 1.10
    }

def correlation_skus_detection(df : pd.DataFrame, correlation_thresholds : dict) -> pd.DataFrame:
    """
    Keep the pairs of items whose sales are positively correlated (i.e.: a halo
    effect is measured between the two).

    A pair is kept when, in this order of filtering:
        - both SKUs reach "threshold_number_transactions" transactions,
        - LIFT >= "lift_threshold",
        - CONFIDENCE >= "confidence_threshold",
        - TRANSACTION_COUNT >= "support_pair_threshold",
        - CONVICTION >= "conviction_threshold",
        - both SKUs belong to the same cluster (SAME_CLUSTER == 1).
    """
    ## (column, key of its minimum value in correlation_thresholds)
    minimum_per_metric = (
        ("LIFT", "lift_threshold"),
        ("CONFIDENCE", "confidence_threshold"),
        ("TRANSACTION_COUNT", "support_pair_threshold"),
        ("CONVICTION", "conviction_threshold"),
    )

    df_filtered = filter_nbre_transactions(df, correlation_thresholds["threshold_number_transactions"])
    for metric_column, threshold_key in minimum_per_metric:
        reaches_minimum = df_filtered[metric_column] >= correlation_thresholds[threshold_key]
        df_filtered = df_filtered[reaches_minimum]

    ## Only keep pairs from the same cluster
    return df_filtered[df_filtered["SAME_CLUSTER"] == 1]
pairs_df_filtred = correlation_skus_detection(pairs_df, correlation_thresholds)

In [145]:
# pairs_df_filtred["SAME_CLUSTER"].value_counts()
skus_data[skus_data["SKU"].isin(pairs_df_filtred["SKU_1"].unique())]["SKU_TOTAL_REVENUE"].sum()/skus_data["SKU_TOTAL_REVENUE"].sum(), pairs_df_filtred["SKU_1"].nunique()

(np.float64(0.40018135524222903), 170)

In [146]:
def find_top_3_upsells(df : pd.DataFrame, top_n : int = 3) -> pd.DataFrame:
    """
    For every item, find the most correlated other items.

    Input:
        - df: pairs table with the columns SKU_1 and LIFT.
        - top_n: number of pairs kept per SKU_1 (default 3).
    Output:
        - At most top_n rows per SKU_1, the ones with the highest LIFT, ordered
          by SKU_1 then by decreasing LIFT, with a fresh 0..n-1 index.
    Raises:
        - ValueError if top_n is negative (groupby().head() would otherwise
          silently switch to "all rows but the last ones").
    """
    if top_n < 0:
        raise ValueError(f"top_n must be >= 0, got {top_n}")

    ranked_pairs = df.sort_values(by=["SKU_1", "LIFT"], ascending=[True, False])
    return ranked_pairs.groupby("SKU_1").head(top_n).reset_index(drop=True)

pairs_df_top3 = find_top_3_upsells(pairs_df_filtred)   
pairs_df_top3.groupby("SKU_1").agg(
    {"SKU_2" : "count"}
).reset_index().value_counts("SKU_2")

SKU_2
1    71
3    53
2    46
Name: count, dtype: int64

# Halo effect degree n+1

In [147]:
def find_nearest_neighbors_node(G) -> dict:
    """
    Map every node of the graph to its direct neighbors (degree 1) and to the
    weight of the edge shared with each of them.

    Output: {node: {neighbor: weight of the edge node-neighbor}}.
    A node without any neighbor is mapped to an empty dictionary.
    Node IDs are cast with int() when the weights are stored, so nodes are
    expected to be integer IDs.
    """
    ## One (initially empty) entry per node of the graph
    weights_by_node = dict((vertex, {}) for vertex in G.nodes())

    for vertex in G.nodes():
        for adjacent in G.neighbors(vertex):
            ## Weight of the edge between the node and this neighbor
            weight = G.get_edge_data(vertex, adjacent)["weight"]
            weights_by_node[int(vertex)][int(adjacent)] = weight
    return weights_by_node

def order_neighbors_per_weight(node_neighbors_dict : dict) -> dict:
    """
    Sort the neighbors of every node by decreasing edge weight.

    Input: {node: {neighbor: weight}}.
    Output: the same dictionary object, where every inner dictionary has been
    replaced by a copy ordered from the heaviest to the lightest edge
    (neighbors with equal weights keep their relative order).
    """
    for _node in node_neighbors_dict:
        heaviest_first = sorted(
            node_neighbors_dict[_node].items(),
            key=lambda neighbor_weight: neighbor_weight[1],
            reverse=True,
        )
        node_neighbors_dict[_node] = dict(heaviest_first)
    return node_neighbors_dict

def find_n_plus_1_neighbors_node(G, node_neighbors_dict : dict) -> dict:
    """
    Find, for every node, its neighbors with a degree of 2.

    Input:
        - G: graph exposing nodes(), neighbors(node) and get_edge_data(u, v),
          the latter returning a mapping with a "weight" entry.
        - node_neighbors_dict: {node: {neighbor: weight}}, the neighbors with a
          degree of 1 of every node, already ordered by importance.
    Output:
        - {node: [neighbor with a degree of 2, ...]} for every node of G.
          The list goes through the neighbors with a degree of 1 in the order
          given and, for each of them, appends its own neighbors ordered by
          decreasing edge weight. The list may contain the node itself and
          duplicates; IDs are cast with int().
    """
    ## One (initially empty) list per node of the graph
    node_neighbor_weights_n_1 = {_node: [] for _node in G.nodes()}

    ## The ranked neighbors of a node do not depend on where we come from:
    ## compute them once per node and reuse them.
    ranked_neighbors_cache = {}

    for _node_0, _neighbors_1 in node_neighbors_dict.items():
        for _neighbor_1 in _neighbors_1:
            if _neighbor_1 not in ranked_neighbors_cache:
                ## Weight of the edge between the neighbor (degree 1) and each of its own neighbors
                weights = {}
                for _neighbor_2 in G.neighbors(_neighbor_1):
                    edge_neighbor = G.get_edge_data(_neighbor_1, _neighbor_2)
                    weights[int(_neighbor_2)] = edge_neighbor["weight"]

                ## Heaviest edge first; equal weights keep the order given by the graph
                heaviest_first = sorted(weights.items(), key=lambda item: item[1], reverse=True)
                ranked_neighbors_cache[_neighbor_1] = [_neighbor for _neighbor, _ in heaviest_first]

            node_neighbor_weights_n_1[_node_0] += ranked_neighbors_cache[_neighbor_1]
    return node_neighbor_weights_n_1

In [159]:
# Build the undirected graph of the strongest pairs: one node per SKU (labelled
# with its item description) and one edge per pair, weighted by its conviction.
# The graph is undirected: when both (a, b) and (b, a) are present in the table,
# the edge keeps the weight of the row processed last.
G = nx.Graph()

for _row_position, row in pairs_df_top3.iterrows():
    sku_1, sku_2 = int(row["SKU_1"]), int(row["SKU_2"])

    G.add_node(sku_1, label=row["ITEM_DESCRIPTION_1"])
    G.add_node(sku_2, label=row["ITEM_DESCRIPTION_2"])
    G.add_edge(sku_1, sku_2, weight=row["CONVICTION"])

def remove_node_neighbors(node_neighbors_dict : dict) -> dict:
    """
    Make sure no node is listed among its own neighbors with a degree of 2.

    Input: {node: [neighbor, ...]}.
    Output: the same dictionary object, where every list has been replaced by
    a copy without the occurrences of the node itself (order preserved).
    """
    for _node in node_neighbors_dict:
        others_only = filter(lambda candidate: candidate != _node, node_neighbors_dict[_node])
        node_neighbors_dict[_node] = list(others_only)
    return node_neighbors_dict

In [311]:
node_neighbor_weights = find_nearest_neighbors_node(G)

node_neighbor_weights_order = order_neighbors_per_weight(node_neighbor_weights)

node_neighbor_n_plus_one = find_n_plus_1_neighbors_node(G, node_neighbor_weights_order)

node_neighbor_n_plus_one = remove_node_neighbors(node_neighbor_n_plus_one)

In [312]:
def all_possible_neighbors() -> pd.DataFrame:
    """
    Create a sparse dataset with every possible pair of SKUs.

    The pairs are built from the notebook-level `skus_data` table.
    NEIGHBOR_RANKING is the score of SKU 2 as a neighbor of SKU 1 with a degree
    of 2: 1, 2 or 3, with 1 being the best. It is initialised to 0 for every
    pair (i.e.: not ranked yet).
    """
    candidate_pairs = create_all_possible_pairs_sku(skus_data)
    candidate_pairs["NEIGHBOR_RANKING"] = 0
    return candidate_pairs

neighbors_degree_2 = all_possible_neighbors()

In [313]:
def remove_duplicated_neighbors(node_neighbors_dict : dict) -> dict:
    """
    Remove the duplicated neighbors of every SKU.

    Input:
        - node_neighbors_dict: Key: SKU, value: list of every neighbor with a degree of 2.
    Output:
        - The same dictionary object, where every list only keeps the first
          occurrence of each neighbor (order preserved).
    """
    for _node, _neighbors in node_neighbors_dict.items():
        already_seen = set()
        first_occurrences = []
        for _neighbor in _neighbors:
            if _neighbor not in already_seen:
                already_seen.add(_neighbor)
                first_occurrences.append(_neighbor)
        node_neighbors_dict[_node] = first_occurrences
    return node_neighbors_dict

node_neighbor_n_plus_one = remove_duplicated_neighbors(node_neighbor_n_plus_one)

In [314]:
def find_best_neighbors_n_2(node_neighbors_dict : dict, neighbors_degree_2 : pd.DataFrame, top_n : int = 3) -> pd.DataFrame:
    """
    For every SKU, rank its best neighbors with a degree of 2.

    Input:
        - node_neighbors_dict: Key: SKU, value: list of its neighbors with a
          degree of 2, best first.
        - neighbors_degree_2: table of pairs with the columns SKU_1, SKU_2,
          TRANSACTION_COUNT and NEIGHBOR_RANKING.
        - top_n: number of neighbors ranked per SKU (default 3).
    Output:
        - neighbors_degree_2, updated in place and returned: for the first
          top_n neighbors of every SKU, the row (SKU_1 = SKU, SKU_2 = neighbor)
          gets NEIGHBOR_RANKING = 1..top_n and TRANSACTION_COUNT = 1 (a marker
          telling ranked pairs apart from the others).

    A neighbor that has no row in the table still uses up its rank.
    """
    for _node, _neighbors in node_neighbors_dict.items():
        ## Rows of the node, computed once for all of its neighbors
        is_node = neighbors_degree_2["SKU_1"] == _node

        for rank, node_neighbor in enumerate(_neighbors, start=1):
            ## Nothing to do past the best top_n neighbors
            if rank > top_n:
                break
            is_pair = is_node & (neighbors_degree_2["SKU_2"] == node_neighbor)
            neighbors_degree_2.loc[is_pair, "TRANSACTION_COUNT"] = 1
            neighbors_degree_2.loc[is_pair, "NEIGHBOR_RANKING"] = rank
    return neighbors_degree_2

neighbors_degree_2 = find_best_neighbors_n_2(node_neighbor_n_plus_one, neighbors_degree_2)
        

In [315]:
neighbors_degree_2 = neighbors_degree_2[neighbors_degree_2["TRANSACTION_COUNT"]>0]
neighbors_degree_2[(neighbors_degree_2["SKU_1"]==9518) & (neighbors_degree_2["SKU_2"].isin(node_neighbor_n_plus_one[9518]))]

Unnamed: 0,SKU_1,SKU_2,TRANSACTION_COUNT,NEIGHBOR_RANKING
2464,9518,9517,1,1
2614,9518,45806,1,3
2823,9518,56439,1,2


In [None]:
# Enrich the degree-2 pairs with the same columns as the direct pairs.
# NOTE(review): at this point TRANSACTION_COUNT is the placeholder value 1 set by
# find_best_neighbors_n_2, not an observed count, so the CONFIDENCE / LIFT /
# CONVICTION computed here are presumably not meaningful for these rows - confirm.
neighbors_degree_2 = add_skus_data(neighbors_degree_2, skus_data)
neighbors_degree_2 = generate_features(neighbors_degree_2)
neighbors_degree_2 = flag_same_cluster(neighbors_degree_2)

# Direct pairs get the ranking 0, so that they sort before the degree-2 pairs (ranked 1 to 3)
pairs_df_top3["NEIGHBOR_RANKING"] = 0

# Stack both sets and keep at most 3 pairs per SKU_1, lowest NEIGHBOR_RANKING first
pairs_df_top_neighbors = pd.concat([pairs_df_top3, neighbors_degree_2])
pairs_df_top_neighbors = pairs_df_top_neighbors.sort_values(by=["SKU_1", "NEIGHBOR_RANKING"], ascending=[True, True]).groupby("SKU_1").head(3).reset_index(drop=True)


In [323]:
pairs_df_top_neighbors.groupby("SKU_1").agg(
    {"SKU_2" : "count"}
).reset_index().value_counts("SKU_2")

SKU_2
3    123
1     29
2     26
Name: count, dtype: int64

In [None]:
from pathlib import Path

# Get the current working directory
cwd = Path.cwd()
cwd

PosixPath('/home/guillaume/jollyes')

In [38]:
# Render the SKU graph as an interactive pyvis network written to "teams.html".
net = Network(notebook=True, filter_menu=True, cdn_resources='remote')
# Copy the weight of every edge into a string "label" attribute
# (presumably so that pyvis displays it on the edge - confirm).
for edge, data in G.edges.items():
    data["label"] = str(data["weight"])
    
net.from_nx(G)
# NOTE(review): show_buttons is called twice, first restricted to the physics
# controls and then without any filter; the second call presumably supersedes
# the first - confirm which set of controls is wanted and drop the other call.
net.show_buttons(filter_=['physics'])
net.show_buttons()
net.show("teams.html")

teams.html
