In [4]:
import pandas as pd
import numpy as np

In [7]:
hierarchy = pd.read_csv("/home/arsun/projects/YZV311_2425_150210324_150210329/data/raw/product_category_map.csv")
hierarchy

Unnamed: 0,category_id,parent_category_id
0,0,75
1,1,1499
2,2,1082
3,3,3498
4,4,1623
...,...,...
4327,4295,3898
4328,4296,3898
4329,4297,3898
4330,4298,3898


In [61]:
product_catalog = pd.read_csv("/home/arsun/projects/YZV311_2425_150210324_150210329/data/raw/product_catalog.csv")

product_catalog["categories"] = product_catalog["categories"].apply(lambda x: eval(x) if pd.notna(x) else [])

product_catalog 

Unnamed: 0,product_id,manufacturer_id,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]"
1,28640,1366,10,1,537,0,101,[]
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3..."
3,21399,1090,10,1,511,0,0,[3270]
4,8504,768,4,1,484,0,66,[2470]
...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,..."
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]"


### Feature Engineering

#### Converting dates to weeks, which will be used as labels for the model.

In [62]:
def convert_to_week(transactions):
    transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])
    transactions['week'] = transactions['purchase_date'].dt.to_period('W')

    transactions.loc[:,("week")] = transactions.loc[:,("week")].dt.week

    transactions = transactions[transactions["week"] != 53]
    transactions["week"] = np.where(transactions["purchase_date"].dt.year == 2021,  transactions["week"] + 52, transactions["week"])
    
    transactions.loc[:,("week")] = transactions.loc[:,("week")].astype(int)
    # transactions = transactions.sort_values(by=['customer_id', 'product_id', 'purchase_date'])
    # transactions["weeks_for_next_purchase"] = transactions.groupby(['customer_id', 'product_id'])['week'].shift(-1).diff().fillna(0).astype(int)

    return transactions

transactions = convert_to_week(transactions)
transactions.sort_values(by=['purchase_date'])

  transactions.loc[:,("week")] = transactions.loc[:,("week")].dt.week
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions["week"] = np.where(transactions["purchase_date"].dt.year == 2021,  transactions["week"] + 52, transactions["week"])


Unnamed: 0,customer_id,product_id,purchase_date,quantity,week
0,38769,3477,2020-06-01,1,23
3399,21825,8019,2020-06-01,1,23
3398,14663,7428,2020-06-01,1,23
3397,29055,15709,2020-06-01,3,23
3396,29055,30511,2020-06-01,2,23
...,...,...,...,...,...
1066651,9942,31384,2021-01-31,2,56
1066650,44584,5221,2021-01-31,1,56
1066649,3990,24254,2021-01-31,1,56
1066647,32302,17861,2021-01-31,1,56


In [8]:
import networkx as nx

def topological_sort_hierarchy(hierarchy):
    graph = nx.DiGraph()
    for _, row in hierarchy.iterrows():
        graph.add_edge(row['parent_category_id'], row['category_id'])

    # Remove self-loops
    graph.remove_edges_from(nx.selfloop_edges(graph))

    # Perform topological sort
    sorted_categories = list(nx.topological_sort(graph))

    # Rearrange the DataFrame based on the topological sort
    sorted_df = pd.DataFrame(columns=['category_id', 'parent_category_id'])
    for category in sorted_categories:
        # Find rows corresponding to the sorted category order
        rows = hierarchy[hierarchy['category_id'] == category]
        sorted_df = pd.concat([sorted_df, rows], ignore_index=True)

    sorted_df = sorted_df.drop(sorted_df.index[0])
    sorted_df = sorted_df.astype(int)
    
    return sorted_df

sorted_df = topological_sort_hierarchy(hierarchy)
print(sorted_df["category_id"].values)
sorted_df

[  34  182  252 ... 2831  911 3489]


Unnamed: 0,category_id,parent_category_id
1,34,3898
2,182,3898
3,252,3898
4,275,3898
5,298,3898
...,...,...
4327,3113,2191
4328,1551,25
4329,2831,25
4330,911,172


In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

hierarchy = pd.read_csv("/home/arsun/projects/YZV311_2425_150210324_150210329/data/raw/product_category_map.csv")

def create_and_visualize_hierarchy(hierarchy):
    # Create directed graph
    graph = nx.DiGraph()
    
    # Add edges from parent to child
    for _, row in hierarchy.iterrows():
        graph.add_edge(row['parent_category_id'], row['category_id'])
    
    # Remove self-loops
    graph.remove_edges_from(nx.selfloop_edges(graph))
    
    # Set up the plot
    plt.figure(figsize=(20, 20))
    
    # Create layout (you can experiment with different layouts)
    layout = nx.spring_layout(graph, k=1, iterations=50)
    
    # Draw the graph
    nx.draw(graph, 
            layout, 
            with_labels=True,
            node_color='lightblue',
            node_size=1000,
            arrowsize=20,
            font_size=8)
    
    plt.title("Category Hierarchy")
    plt.show()
    
    return graph

# Create and visualize the graph
graph = create_and_visualize_hierarchy(hierarchy)

In [64]:
from tqdm import tqdm
def find_parent_categories(topological_sort, product_catalog):
    """
    Finds the parent categories for each category in the product catalog
    based on the topological sort of categories, excluding the root node (e.g., 3898).

    Args:
        topological_sort (DataFrame): DataFrame with 'category_id' and 'parent_category_id'.
        product_catalog (DataFrame): DataFrame containing product information.

    Returns:
        DataFrame: Updated product catalog with parent categories.
    """
    # Drop rows with missing categories
    product_catalog = product_catalog.dropna(subset=['categories']).copy()

    # Create a dictionary for fast lookups of parent categories
    parent_map = dict(zip(topological_sort['category_id'], topological_sort['parent_category_id']))

    # Function to find all parents for a given category
    def get_parents(category):
        parents = set()
        while category in parent_map and parent_map[category] != 3898:
            category = parent_map[category]
            parents.add(category)
        return parents

    # Process each product's categories
    parent_categories = []
    for categories in tqdm(product_catalog['categories'], total=len(product_catalog)):
        if not categories:
            parent_categories.append([])
        else:
            # Collect all parent categories for all categories of this product
            all_parents = set()
            for category in categories:
                all_parents.update(get_parents(category))
            parent_categories.append(list(all_parents))

    # Add the result to the product catalog
    product_catalog['parent_categories'] = parent_categories
    return product_catalog


product_catalog = find_parent_categories(hierarchy, product_catalog)
product_catalog

100%|██████████| 32776/32776 [00:00<00:00, 809152.06it/s]


Unnamed: 0,product_id,manufacturer_id,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,categories,parent_categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 2012, 2838]"
1,28640,1366,10,1,537,0,101,[],[]
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[3860, 2364, 600, 3241, 1420]"
3,21399,1090,10,1,511,0,0,[3270],"[1420, 2364]"
4,8504,768,4,1,484,0,66,[2470],"[1072, 2566]"
...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[322, 3860, 600, 3241, 458, 2364]"
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[1072, 2928, 2566, 2920, 2475, 3565]"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[725, 2917, 1735]"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[1072, 2928, 2566, 2920, 3565]"


In [65]:
# from pandarallel import pandarallel

# # Initialize pandarallel
# pandarallel.initialize(progress_bar=True)  # Enable progress bar

# def weeks_until_next_purchase(row):
#     customer_id = row['customer_id']
#     product_id = row['product_id']
#     week = row['week']
    
#     # Get the parent categories of the current product
#     parent_categories = product_catalog.loc[product_catalog['product_id'] == product_id, 'parent_categories'].values[0]
    
#     # Filter future transactions for the same customer
#     future_purchases = transactions[(transactions['customer_id'] == customer_id) & 
#                                      (transactions['week'] > week)]
    
#     # Check for mutual parent categories (non-parallel here)
#     future_purchases['mutual_parent'] = future_purchases['product_id'].apply(
#         lambda pid: bool(set(product_catalog.loc[product_catalog['product_id'] == pid, 'parent_categories'].values[0]) & set(parent_categories))
#     )
    
#     # Filter only transactions with mutual parent categories
#     future_purchases_with_mutual_parent = future_purchases[future_purchases['mutual_parent']]
    
#     if not future_purchases_with_mutual_parent.empty:
#         # Calculate the weeks until the next purchase
#         weeks_until_next = future_purchases_with_mutual_parent['week'].min() - week
#         return weeks_until_next
#     else:
#         # No purchase with mutual parent category in the future
#         return 0  # Use -1 to indicate no purchase in the next 4 weeks

# # Apply the function in parallel (only top-level parallelism)
# transactions['weeks_until_next_purchase'] = transactions.parallel_apply(weeks_until_next_purchase, axis=1)

# # Replace values in the 'weeks_until_next_purchase' column
# transactions["weeks_until_next_purchase"] = transactions["weeks_until_next_purchase"].replace({-1: 0})

# # Display the updated DataFrame
# transactions["weeks_until_next_purchase"] = transactions["weeks_until_next_purchase"].apply(lambda x: 0 if x > 4 else x)

# # Display the updated DataFrame
# transactions.drop(columns=["weeks_for_next_purchase"], inplace=True)

# # Display the updated DataFrame
# transactions
# def weeks_past_from_before_purchase(row):
#     customer_id = row['customer_id']
#     product_id = row['product_id']
#     week = row['week']
    
#     # Get the parent categories of the current product
#     parent_categories = product_catalog.loc[product_catalog['product_id'] == product_id, 'parent_categories'].values[0]
    
#     # Filter past transactions for the same customer
#     past_purchases = transactions[(transactions['customer_id'] == customer_id) & 
#                                   (transactions['week'] < week)]
    
#     # Check for mutual parent categories
#     past_purchases['mutual_parent'] = past_purchases['product_id'].apply(
#         lambda pid: bool(set(product_catalog.loc[product_catalog['product_id'] == pid, 'parent_categories'].values[0]) & set(parent_categories))
#     )
    
#     # Filter only transactions with mutual parent categories
#     past_purchases_with_mutual_parent = past_purchases[past_purchases['mutual_parent']]
    
#     if not past_purchases_with_mutual_parent.empty:
#         # Calculate the weeks since the last purchase
#         weeks_since_last = week - past_purchases_with_mutual_parent['week'].max()
#         return weeks_since_last
#     else:
#         # No purchase with mutual parent category in the past
#         return 0

# # Apply the function to the transactions DataFrame
# transactions['weeks_past_from_before_purchase'] = transactions.parallel_apply(weeks_past_from_before_purchase, axis=1)

# # Display the updated DataFrame
# transactions

In [67]:
import pandas as pd
from tqdm import tqdm
from itertools import combinations
from collections import defaultdict

# Define Jaccard similarity function
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Pre-compute sets for parent categories
product_catalog['parent_categories_set'] = product_catalog['parent_categories'].apply(set)

# Create a dictionary to store Jaccard similarities
jaccard_similarities = defaultdict(list)

# Iterate over unique pairs of products
product_ids = product_catalog['product_id'].values
parent_categories = product_catalog['parent_categories_set'].values

for (i, j) in tqdm(combinations(range(len(product_ids)), 2), total=len(product_ids) * (len(product_ids) - 1) // 2):
    product_id1, product_id2 = product_ids[i], product_ids[j]
    set1, set2 = parent_categories[i], parent_categories[j]
    
    similarity = jaccard_similarity(set1, set2)
    if similarity > 0:  # Only store non-zero similarities
        jaccard_similarities[product_id1].append((product_id2, similarity))
        jaccard_similarities[product_id2].append((product_id1, similarity))

# Sort similarities for each product
for product_id in jaccard_similarities:
    jaccard_similarities[product_id] = sorted(jaccard_similarities[product_id], key=lambda x: x[1], reverse=True)

# Display the Jaccard similarities for the first few products
for product_id, similarities in list(jaccard_similarities.items())[:5]:
    print(f"Product ID: {product_id}")
    print("Similar Products (Product ID, Jaccard Similarity):")
    for similar_product_id, similarity in similarities[:5]:  # Display top 5 similar products
        print(f"  {similar_product_id}: {similarity:.2f}")
    print()


100%|██████████| 537116700/537116700 [06:22<00:00, 1405215.74it/s]


Product ID: 22665
Similar Products (Product ID, Jaccard Similarity):
  22534: 1.00
  16644: 1.00
  6837: 1.00
  20740: 1.00
  16260: 1.00

Product ID: 3914
Similar Products (Product ID, Jaccard Similarity):
  24190: 1.00
  27505: 1.00
  10145: 1.00
  25406: 1.00
  10136: 1.00

Product ID: 24190
Similar Products (Product ID, Jaccard Similarity):
  3914: 1.00
  27505: 1.00
  10145: 1.00
  25406: 1.00
  10136: 1.00

Product ID: 10963
Similar Products (Product ID, Jaccard Similarity):
  7257: 1.00
  23945: 1.00
  1259: 1.00
  27298: 1.00
  30618: 1.00

Product ID: 13653
Similar Products (Product ID, Jaccard Similarity):
  30824: 1.00
  17002: 1.00
  18698: 0.80
  18484: 0.80
  31112: 0.67



In [None]:
jaccard_similarities = {k: [item for item in v if item[1] >= 0.6] for k, v in jaccard_similarities.items()}


[(22534, 1.0),
 (16644, 1.0),
 (6837, 1.0),
 (20740, 1.0),
 (16260, 1.0),
 (28263, 1.0),
 (30284, 1.0),
 (32694, 1.0),
 (22266, 0.8),
 (14455, 0.8),
 (1234, 0.8),
 (820, 0.8),
 (8062, 0.8),
 (9918, 0.75),
 (12404, 0.75),
 (13761, 0.75),
 (8899, 0.75),
 (32186, 0.75),
 (1606, 0.6666666666666666),
 (7388, 0.6666666666666666),
 (20273, 0.6666666666666666),
 (12129, 0.6666666666666666),
 (31027, 0.6666666666666666),
 (27586, 0.6),
 (14848, 0.6)]

: 

In [51]:
transactions

Unnamed: 0,customer_id,product_id,purchase_date,quantity,week,parent_categories,weeks_until_next_purchase,weeks_past_from_before_purchase
0,0,20664,2020-06-05,1,23,"[2346, 2054, 1303]",0,0
1,0,12468,2020-08-03,1,32,"[2385, 1841, 2723, 519, 3707]",0,0
2,0,15083,2020-08-03,1,32,"[1148, 1666, 1062, 2762, 2364, 621, 1374, 3439]",2,0
3,0,12505,2020-08-18,1,34,"[3616, 2762, 298, 1932, 366, 686, 2771, 2231, ...",2,2
4,0,1505,2020-09-01,1,36,"[1148, 1666, 1062, 2762, 2364, 1374, 3439]",0,2
...,...,...,...,...,...,...,...,...
1041472,46137,2550,2021-01-18,1,55,"[2012, 621]",0,0
1041473,46137,14695,2021-01-18,1,55,"[1932, 686]",0,0
1041474,46137,22403,2021-01-18,1,55,"[2497, 2369, 2793, 2346, 1739, 621, 2542, 1678...",1,0
1041475,46137,11338,2021-01-31,1,56,"[1932, 686]",0,1


In [52]:
transactions

Unnamed: 0,customer_id,product_id,purchase_date,quantity,week,parent_categories,weeks_until_next_purchase,weeks_past_from_before_purchase
0,0,20664,2020-06-05,1,23,"[2346, 2054, 1303]",0,0
1,0,12468,2020-08-03,1,32,"[2385, 1841, 2723, 519, 3707]",0,0
2,0,15083,2020-08-03,1,32,"[1148, 1666, 1062, 2762, 2364, 621, 1374, 3439]",2,0
3,0,12505,2020-08-18,1,34,"[3616, 2762, 298, 1932, 366, 686, 2771, 2231, ...",2,2
4,0,1505,2020-09-01,1,36,"[1148, 1666, 1062, 2762, 2364, 1374, 3439]",0,2
...,...,...,...,...,...,...,...,...
1041472,46137,2550,2021-01-18,1,55,"[2012, 621]",0,0
1041473,46137,14695,2021-01-18,1,55,"[1932, 686]",0,0
1041474,46137,22403,2021-01-18,1,55,"[2497, 2369, 2793, 2346, 1739, 621, 2542, 1678...",1,0
1041475,46137,11338,2021-01-31,1,56,"[1932, 686]",0,1


### Making Predictions

In [54]:
test = pd.read_csv("/home/arsun/projects/YZV311_2425_150210324_150210329/data/raw/test.csv")
test.drop(columns=["id", "prediction"], inplace=True)


test = test.merge(product_catalog, on=["product_id"], how="left")


test

In [55]:


from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

X = test.drop(columns=["week", "parent_categories"])
# Multiclass Example
clf = RandomForestClassifier()
clf.fit(X_transformed, y_multiclass)

# Multilabel Example
multi_clf = MultiOutputClassifier(RandomForestClassifier())
multi_clf.fit(X_transformed, y_multilabel)


Unnamed: 0,customer_id,product_id,manufacturer_id,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,categories,parent_categories
0,0,20664,408,4,0,284,0,66,"[236, 356]","[2346, 2054, 1303]"
1,0,28231,193,4,3,468,3,108,"[3024, 3955, 3956]",[1682]
2,13,2690,406,4,3,491,0,66,[2995],[3189]
3,15,1299,1056,4,0,474,-1,108,"[3900, 3901, 3902, 3903, 949, 2563, 2424]","[3761, 2723, 1499, 2364]"
4,15,20968,1315,4,0,444,0,144,"[2629, 3228, 3915, 3914]","[2920, 1603, 2928, 686]"
...,...,...,...,...,...,...,...,...,...,...
9995,46118,20106,1111,4,0,491,0,66,"[1920, 3983]",[1682]
9996,46124,19677,1006,4,0,491,3,154,"[1616, 1705, 71, 237, 1675]","[3137, 2346, 3178, 1840, 370, 3027, 2364, 1693]"
9997,46125,12878,1111,4,0,491,0,66,"[699, 785, 1920, 4047]","[1682, 2364, 3703]"
9998,46127,7963,1111,4,0,485,3,154,[975],"[1072, 2475, 2566]"


In [56]:
y_pred = model.predict(test)

NameError: name 'model' is not defined

In [41]:
y_pred = pd.Series(y_pred)
y_pred.value_counts()



0    10000
Name: count, dtype: int64

In [3]:
import pandas as pd

df = pd.read_csv("/home/arsun/projects/YZV311_2425_150210324_150210329/data/raw/test.csv")


df["prediction"] = 0 

df.to_csv("/home/arsun/projects/YZV311_2425_150210324_150210329/data/raw/zero_sub.csv", index=False)