In [30]:
# Import required module
import os
import subprocess


# Import every third-party dependency used by the notebook. If any import fails, install the
# packages as a best effort and ask the user to re-run the cell.
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from PIL import Image
    import glob
    from tqdm import tqdm
    import seaborn as sns
    import torch
    import torchvision.models as models
    import torchvision.transforms as transforms
    from sklearn.metrics.pairwise import cosine_similarity
    import torch.nn as nn
    import torch.nn.functional as F
    from torch_geometric.nn import GCNConv
    from scipy.stats import rankdata
    from torch.utils.data import Dataset
    from sklearn.feature_extraction.text import TfidfVectorizer
    from collections import defaultdict
    import random
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    from torch_geometric.utils import add_self_loops
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import precision_score, recall_score, f1_score, ndcg_score
    from itertools import chain
    from sklearn.manifold import TSNE
    import networkx as nx
    from torchvision import models, transforms
    import torch
    import torch_xla
    import torch_xla.core.xla_model as xm
    import gc
    import glob
    import pickle
    import zipfile
except ImportError as import_error:
    print("Installing dependencies...")

    # Install system-level dependencies (best effort: return codes are deliberately not checked)
    subprocess.run(["apt-get", "update"])
    subprocess.run(["apt-get", "install", "-y", "libcairo2", "libcairo2-dev"])

    # Install Python packages.
    # Each package must be its own list element: a single "torch torchvision torchaudio" string
    # is handed to pip as one (invalid) requirement.
    subprocess.run(["pip", "install", "--quiet", "pycairo"])
    subprocess.run(["pip", "install", "--quiet", "torch", "torchvision", "torchaudio"])
    subprocess.run(["pip", "install", "--quiet", "torch-geometric", "torch-scatter", "torch-sparse", "torch-cluster", "torch-spline-conv"])
    subprocess.run(["pip", "install", "--quiet", "pandas", "numpy", "matplotlib", "pillow", "tqdm", "seaborn", "scipy"])
    subprocess.run(["pip", "install", "--quiet", "scikit-learn"])
    subprocess.run(["pip", "install", "--quiet", "networkx"])
    # NOTE(review): torch_xla is imported above but is not part of this install list; if it is
    # missing, this branch runs again on every execution — confirm how it should be provisioned.

    print("All dependencies installed.")
    # The import that failed (and every import after it) was never executed, so the names are
    # still unbound in this kernel.
    print(f"⚠️ Import failed with: {import_error}. Re-run this cell to finish importing.")
else:
    print("✅ All required dependencies are installed!")


✅ All required dependencies are installed!


In [31]:
# Check if TPU is available
# NOTE(review): xm.xla_device() is called unconditionally, so this is not a guarded
# availability check — confirm how it behaves on a runtime without a TPU.
device = xm.xla_device()
print(f"✅ Using device: {device}")
# NOTE(review): `device` is rebound on the next line, so the XLA device printed above is
# discarded and every later `.to(device)` targets CUDA when available, otherwise the CPU.
# Confirm that this override is intentional.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Default tensor precision: float16 when CUDA is available, bfloat16 otherwise.
dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16


✅ Using device: xla:0


In [32]:
# Adjust article ID format
def adjust_id(x):
    """Return ``x`` as a string, prefixed with "0" when that string is exactly nine characters long."""
    text = str(x)
    if len(text) != 9:
        return text
    return "0" + text

In [35]:
# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
# Directory that is searched for the cached preprocessing artifacts
# (articles_with_images.pkl, transactions_filtered.pkl, customers_processed.pkl).
PREPROCESSED_DIR = "/kaggle/input/preprocessed-data"

# # ✅ Define file paths for saving preprocessed data
# PREPROCESSED_DIR = "/kaggle/working/preprocessed-data"
# ZIP_FILE = "/kaggle/working/preprocessed-data.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(PREPROCESSED_DIR, exist_ok=True)


In [37]:
# ✅ Load the cached preprocessing artifacts when they exist; otherwise rebuild them from the raw CSVs.
# NOTE: pickle.load can execute arbitrary code — only load artifacts that you produced yourself.
if os.path.exists(os.path.join(PREPROCESSED_DIR, "articles_with_images.pkl")):
    print("✅ Preprocessed data already exists. Loading...")

    with open(os.path.join(PREPROCESSED_DIR, "articles_with_images.pkl"), "rb") as f:
        articles_with_images = pickle.load(f)

    with open(os.path.join(PREPROCESSED_DIR, "transactions_filtered.pkl"), "rb") as f:
        transactions_filtered = pickle.load(f)

    with open(os.path.join(PREPROCESSED_DIR, "customers_processed.pkl"), "rb") as f:
        customers = pickle.load(f)

    print("✅ Preprocessed data loaded successfully!")

else:
    print("⚡ Running preprocessing for the first time...")

    # The cache was not found under the Kaggle input dataset, which cannot be written to, and
    # ZIP_FILE is otherwise only defined in commented-out configuration. Switch to the writable
    # working directory so the artifacts produced below can actually be saved.
    PREPROCESSED_DIR = "/kaggle/working/preprocessed-data"
    ZIP_FILE = "/kaggle/working/preprocessed-data.zip"  # Final zipped archive
    os.makedirs(PREPROCESSED_DIR, exist_ok=True)

    articles = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv')
    customers = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv')
    transactions = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')

    # Fail early with a clear message: every customer step below needs the `age` column.
    if 'age' not in customers.columns:
        raise ValueError("Error: `age` column is missing in customers!")

    # Get all paths from the image folder
    all_image_paths = glob.glob("/kaggle/input/h-and-m-personalized-fashion-recommendations/images/*/*")

    # Store article IDs as strings, restoring the leading "0" on 9-character IDs
    articles["article_id"] = articles["article_id"].apply(adjust_id)
    # NOTE(review): this sets product_code to the first three characters of the article ID (the
    # same prefix that is used as the image sub-folder below), replacing any product_code column
    # already present in the CSV — confirm this is intended.
    articles["product_code"] = articles["article_id"].apply(lambda x: x[:3])

    # Collect the ID (file name without extension) of every image on disk
    all_image_ids = set()
    for path in tqdm(all_image_paths, desc="Processing Images"):
        all_image_ids.add(os.path.basename(path).split('.')[0])

    # Construct full image paths; articles without an image get None
    images_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
    articles["path"] = articles["article_id"].apply(
        lambda x: images_path + x[:3] + "/" + x + ".jpg" if x in all_image_ids else None
    )

    # ✅ Keep only articles with valid images
    articles_with_images = articles.dropna(subset=["path"]).reset_index(drop=True)

    # Store the identifier columns as categoricals to save memory
    articles_with_images["article_id"] = articles_with_images["article_id"].astype("category")
    articles_with_images["product_code"] = articles_with_images["product_code"].astype("category")

    # Fill missing values
    customers = customers.fillna({
        "FN": 0,
        "Active": 0,
        "club_member_status": "UNKNOWN",
        "fashion_news_frequency": "UNKNOWN",
        "age": customers["age"].median()
    })

    customers["customer_id"] = customers["customer_id"].astype("category")
    customers["Active"] = customers["Active"].astype(np.int8)
    customers["FN"] = customers["FN"].astype(np.int8)
    customers["age"] = customers["age"].astype(np.float16)

    # Scale age into [0, 1]
    scaler_age = MinMaxScaler()
    customers['normalized_age'] = scaler_age.fit_transform(customers[['age']])

    # Adjust article_id (as done for the articles dataframe)
    transactions["article_id"] = transactions["article_id"].apply(adjust_id)

    # Filter the transactions dataset to keep only valid article IDs
    transactions_filtered = transactions[transactions["article_id"].isin(set(articles_with_images["article_id"]))].reset_index(drop=True)

    transactions_filtered["article_id"] = transactions_filtered["article_id"].astype("category")
    transactions_filtered["price"] = transactions_filtered["price"].astype(np.float16)

    # ✅ Save preprocessed data locally
    with open(os.path.join(PREPROCESSED_DIR, "articles_with_images.pkl"), "wb") as f:
        pickle.dump(articles_with_images, f)

    with open(os.path.join(PREPROCESSED_DIR, "transactions_filtered.pkl"), "wb") as f:
        pickle.dump(transactions_filtered, f)

    with open(os.path.join(PREPROCESSED_DIR, "customers_processed.pkl"), "wb") as f:
        pickle.dump(customers, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(ZIP_FILE, 'w') as zipf:
        for file in os.listdir(PREPROCESSED_DIR):
            zipf.write(os.path.join(PREPROCESSED_DIR, file), arcname=file)

    print(f"✅ Preprocessing completed! Saved as {ZIP_FILE}")


✅ Preprocessed data already exists. Loading...
✅ Preprocessed data loaded successfully!


In [42]:
# Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()  # If using GPU

In [43]:
# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)
# PyTorch keeps its own generator: without this, the torch.randn / torch.multinomial calls and
# the nn.Linear weight initialisation used later in the notebook differ from run to run.
torch.manual_seed(42)

creating embeddings

In [44]:
# Preview the first rows of the filtered transactions.
transactions_filtered.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050842,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030487,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015236,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016937,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016937,2


In [47]:
# # ✅ Define file paths for saving preprocessed data
# IMAGE_EMBEDDINGS_DIR = "/kaggle/working/image_embeddings"
# IMAGE_EMBEDDINGS_ZIP_FILE = "/kaggle/working/image_embeddings.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(IMAGE_EMBEDDINGS_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/image-embeddings/`)
# Directory that is searched for the cached image_embeddings.pkl.
IMAGE_EMBEDDINGS_DIR = "/kaggle/input/image-embeddings"

In [48]:
# ✅ Load the cached image embeddings when they exist; otherwise extract them with ResNet50.
# NOTE: pickle.load can execute arbitrary code — only load artifacts that you produced yourself.
if os.path.exists(os.path.join(IMAGE_EMBEDDINGS_DIR, "image_embeddings.pkl")):
    print("✅ image_embeddings data already exists. Loading...")

    with open(os.path.join(IMAGE_EMBEDDINGS_DIR, "image_embeddings.pkl"), "rb") as f:
        image_embeddings = pickle.load(f)

    print("✅ image embeddings data loaded successfully!")

else:
    print("⚡ Extracting image embeddings for the first time...")

    # The cache was not found under the Kaggle input dataset, which cannot be written to, and
    # IMAGE_EMBEDDINGS_ZIP_FILE is otherwise only defined in commented-out configuration.
    # Switch to the writable working directory so the results below can actually be saved.
    IMAGE_EMBEDDINGS_DIR = "/kaggle/working/image_embeddings"
    IMAGE_EMBEDDINGS_ZIP_FILE = "/kaggle/working/image_embeddings.zip"  # Final zipped archive
    os.makedirs(IMAGE_EMBEDDINGS_DIR, exist_ok=True)

    # ✅ Load pre-trained ResNet50 model (Feature Extraction)
    resnet_model = models.resnet50(pretrained=True)
    resnet_model = torch.nn.Sequential(*list(resnet_model.children())[:-1])  # Remove the last FC layer
    resnet_model = resnet_model.to(device).eval()  # Move to device & set to eval mode

    # ✅ Define image transformations
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    def get_image_embeddings_batch(image_paths_batch):
        """Return the ResNet features of the images at ``image_paths_batch`` as a NumPy array.

        One row per image path, in the order given.
        """
        image_tensors = []
        for path in image_paths_batch:
            # The context manager closes the file handle as soon as the image is converted.
            with Image.open(path) as image:
                image_tensors.append(transform(image.convert('RGB')))
        images_tensor = torch.stack(image_tensors).to(device)

        with torch.no_grad():
            # squeeze(-1) twice drops the two trailing size-1 dims but keeps the batch dim,
            # even when the batch holds a single image.
            features = resnet_model(images_tensor).squeeze(-1).squeeze(-1)

        return features.cpu().numpy()

    # ✅ Process images in batches to avoid memory issues
    batch_size = 544  # Adjust batch size based on available memory
    embedding_batches = []

    for i in tqdm(range(0, len(articles_with_images), batch_size), desc="Extracting Features"):
        image_paths_batch = articles_with_images['path'].iloc[i:i + batch_size]
        embedding_batches.append(get_image_embeddings_batch(image_paths_batch))

    # ✅ Convert to efficient tensor format.
    # Concatenating the batches first avoids building a tensor from a Python list of row arrays,
    # which is very slow.
    image_embeddings = torch.tensor(np.concatenate(embedding_batches, axis=0), dtype=dtype).to(device)

    # ✅ Save preprocessed data locally
    with open(os.path.join(IMAGE_EMBEDDINGS_DIR, "image_embeddings.pkl"), "wb") as f:
        pickle.dump(image_embeddings, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(IMAGE_EMBEDDINGS_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(IMAGE_EMBEDDINGS_DIR):
            zipf.write(os.path.join(IMAGE_EMBEDDINGS_DIR, file), arcname=file)

    print(f"✅ image_embeddings completed! Saved as {IMAGE_EMBEDDINGS_ZIP_FILE}")

✅ image_embeddings data already exists. Loading...
✅ image embeddings data loaded successfully!


In [51]:
# # ✅ Define file paths for saving preprocessed data
# TEXT_EMBEDDINGS_DIR = "/kaggle/working/text_embeddings"
# TEXT_EMBEDDINGS_ZIP_FILE = "/kaggle/working/text_embeddings.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(TEXT_EMBEDDINGS_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/text-embeddings/`)
# Directory that is searched for the cached text_embeddings.pkl.
TEXT_EMBEDDINGS_DIR = "/kaggle/input/text-embeddings"

In [53]:
# ✅ Load the cached text embeddings when they exist; otherwise build them from GloVe vectors.
# NOTE: pickle.load can execute arbitrary code — only load artifacts that you produced yourself.
if os.path.exists(os.path.join(TEXT_EMBEDDINGS_DIR, "text_embeddings.pkl")):
    print("✅ text_embeddings data already exists. Loading...")

    with open(os.path.join(TEXT_EMBEDDINGS_DIR, "text_embeddings.pkl"), "rb") as f:
        text_embeddings = pickle.load(f)

    print("✅ text embeddings data loaded successfull")

else:
    # The cache was not found under the Kaggle input dataset, which cannot be written to, and
    # TEXT_EMBEDDINGS_ZIP_FILE is otherwise only defined in commented-out configuration.
    # Switch to the writable working directory so the results below can actually be saved.
    TEXT_EMBEDDINGS_DIR = "/kaggle/working/text_embeddings"
    TEXT_EMBEDDINGS_ZIP_FILE = "/kaggle/working/text_embeddings.zip"  # Final zipped archive
    os.makedirs(TEXT_EMBEDDINGS_DIR, exist_ok=True)

    # Text columns that are joined (in this order, separated by spaces) into one string per article
    text_columns = ['detail_desc', 'prod_name', 'product_type_name', 'product_group_name',
                    'graphical_appearance_name', 'colour_group_name', 'index_name',
                    'index_group_name', 'section_name', 'garment_group_name']

    if not all(col in articles_with_images.columns for col in text_columns):
        raise ValueError("Error: One or more textual columns are missing!")

    text_data = articles_with_images[text_columns[0]].fillna('')
    for col in text_columns[1:]:
        text_data = text_data + ' ' + articles_with_images[col].fillna('')
    articles_with_images['text_data'] = text_data

    def load_glove_embeddings(glove_file_path):
        """Parse a GloVe text file into a ``{word: float32 vector}`` dictionary."""
        embeddings_index = {}
        with open(glove_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = vector
        return embeddings_index

    # Pre-trained GloVe 100D embeddings (change the path to your GloVe file)
    glove_file_path = '/kaggle/input/test-feature-1/glove.6B.100d.txt'
    embeddings_index = load_glove_embeddings(glove_file_path)

    def text_to_glove_embeddings(text_data, embeddings_index, embedding_dim=100):
        """Embed each text as the mean GloVe vector of its known words.

        Texts without any known word map to the zero vector. Returns an array with one row of
        ``embedding_dim`` values per text.
        """
        embeddings = []
        for text in text_data:
            # The glove.6B vocabulary is uncased (all lower-case), so lower-case the text first;
            # otherwise every capitalised word is silently dropped.
            words = text.lower().split()
            word_embeddings = np.zeros(embedding_dim)
            valid_word_count = 0

            # For each word, add its GloVe embedding (if it exists)
            for word in words:
                if word in embeddings_index:
                    word_embeddings += embeddings_index[word]
                    valid_word_count += 1

            # Average the embeddings of the words in the text
            if valid_word_count > 0:
                word_embeddings /= valid_word_count
            embeddings.append(word_embeddings)

        return np.array(embeddings)

    # Convert the product descriptions to GloVe embeddings
    text_embeddings = text_to_glove_embeddings(articles_with_images['text_data'], embeddings_index)

    # ✅ Save preprocessed data locally
    with open(os.path.join(TEXT_EMBEDDINGS_DIR, "text_embeddings.pkl"), "wb") as f:
        pickle.dump(text_embeddings, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(TEXT_EMBEDDINGS_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(TEXT_EMBEDDINGS_DIR):
            zipf.write(os.path.join(TEXT_EMBEDDINGS_DIR, file), arcname=file)

    print(f"✅ text_embeddings completed! Saved as {TEXT_EMBEDDINGS_ZIP_FILE}")


✅ text_embeddings data already exists. Loading...
✅ text embeddings data loaded successfull


In [56]:
# # ✅ Define file paths for saving preprocessed data
# CUSTOMER_EMBEDDINGS_DIR = "/kaggle/working/customer_embeddings"
# CUSTOMER_EMBEDDINGS_ZIP_FILE = "/kaggle/working/customer_embeddings.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(CUSTOMER_EMBEDDINGS_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/customer-embeddings/`)
# Directory that is searched for the cached customer_embeddings.pkl.
CUSTOMER_EMBEDDINGS_DIR = "/kaggle/input/customer-embeddings"

In [57]:
# ✅ Load the cached customer embeddings when they exist; otherwise compute them.
# NOTE: pickle.load can execute arbitrary code — only load artifacts that you produced yourself.
if os.path.exists(os.path.join(CUSTOMER_EMBEDDINGS_DIR, "customer_embeddings.pkl")):
    print("✅ customer_embeddings data already exists. Loading...")

    with open(os.path.join(CUSTOMER_EMBEDDINGS_DIR, "customer_embeddings.pkl"), "rb") as f:
        customer_embeddings = pickle.load(f)

    print("✅ customer_embeddings data loaded successfull")

else:
    # The cache was not found under the Kaggle input dataset, which cannot be written to, and
    # CUSTOMER_EMBEDDINGS_ZIP_FILE is otherwise only defined in commented-out configuration.
    # Switch to the writable working directory so the results below can actually be saved.
    CUSTOMER_EMBEDDINGS_DIR = "/kaggle/working/customer_embeddings"
    CUSTOMER_EMBEDDINGS_ZIP_FILE = "/kaggle/working/customer_embeddings.zip"  # Final zipped archive
    os.makedirs(CUSTOMER_EMBEDDINGS_DIR, exist_ok=True)

    # Step 2: One-Hot Encode Categorical Features and Convert to float32
    categorical_features_encoded = pd.get_dummies(customers[['Active', 'club_member_status']]).astype(np.float32)

    # Step 3: Combine Both Categorical and Numerical Features (Ensure float32)
    customer_features = np.hstack([
        customers['normalized_age'].values.reshape(-1, 1).astype(np.float32),
        categorical_features_encoded.astype(np.float32)
    ])

    # Step 4: Use a Neural Network to Create the Customer Embedding
    class CustomerEmbedding(nn.Module):
        """Two-layer perceptron mapping customer features to an embedding vector."""

        def __init__(self, input_dim, embedding_dim):
            super(CustomerEmbedding, self).__init__()
            self.fc1 = nn.Linear(input_dim, 128)  # Hidden layer
            self.fc2 = nn.Linear(128, embedding_dim)  # Output layer: Customer embedding

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = self.fc2(x)
            return x

    # Define the model
    # NOTE(review): no training step is visible in this cell, so the embeddings come from the
    # randomly initialised weights — confirm this is intended.
    embedding_dim = 64  # Size of the customer embedding
    model = CustomerEmbedding(input_dim=customer_features.shape[1], embedding_dim=embedding_dim).to(device)

    # Convert the customer features into a tensor on the target device (done once)
    customer_features_tensor = torch.tensor(customer_features, dtype=torch.float32).to(device)

    # Get customer embeddings by passing through the model. Gradients are never used, so skip
    # building the autograd graph for the whole customer table.
    with torch.no_grad():
        customer_embeddings = model(customer_features_tensor)

    # Store the embeddings as a bfloat16 tensor on the target device
    customer_embeddings = customer_embeddings.cpu().detach().numpy()
    customer_embeddings = torch.tensor(customer_embeddings, dtype=torch.bfloat16).to(device)

    # ✅ Save preprocessed data locally
    with open(os.path.join(CUSTOMER_EMBEDDINGS_DIR, "customer_embeddings.pkl"), "wb") as f:
        pickle.dump(customer_embeddings, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(CUSTOMER_EMBEDDINGS_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(CUSTOMER_EMBEDDINGS_DIR):
            zipf.write(os.path.join(CUSTOMER_EMBEDDINGS_DIR, file), arcname=file)

    print(f"✅ customer_embeddings completed! Saved as {CUSTOMER_EMBEDDINGS_ZIP_FILE}")


✅ customer_embeddings data already exists. Loading...
✅ customer_embeddings data loaded successfull


In [58]:
# Names of the three relation kinds (product–product, customer–product, customer–customer).
# NOTE(review): presumably the edge types of the graph built later in the notebook — confirm.
edge_types = ["product_to_product", "customer_to_product", "customer_to_customer"]

In [59]:
def sliced_wasserstein_distance(P, Q, num_projections=50):
    """
    Sliced Wasserstein Distance (SWD) between the row samples of ``P`` and ``Q``.

    Both inputs are projected onto ``num_projections`` random unit directions; each projected
    column is sorted and the mean absolute difference of the sorted values is returned.
    """
    # Random directions with one column per projection, created on P's device
    directions = torch.randn((P.shape[1], num_projections), device=P.device)
    directions = directions / directions.norm(dim=0, keepdim=True)  # unit-length columns

    # Sort every 1-D projection independently (along the sample axis)
    sorted_p = torch.sort(P @ directions, dim=0).values
    sorted_q = torch.sort(Q @ directions, dim=0).values

    return (sorted_p - sorted_q).abs().mean()

data splitting

In [60]:
### Step 6: Geometric Distributed Sampling (GDS)
def geometric_distributed_sampling(ranks, rho=0.5, max_samples=300000):
    """
    Geometric Distributed Sampling (GDS) over a 1-D tensor of ranks.

    Keeps at most 16,000,000 of the highest-ranked entries, min-max normalises their ranks,
    weights each entry by ``exp(-rho * normalised_rank)`` and draws up to ``max_samples``
    distinct entries. Returns the sampled positions (indices into ``ranks``) as a CPU tensor.
    """
    # NOTE(review): the cap is presumably there to respect torch.multinomial's category limit — confirm.
    category_cap = min(len(ranks), 16_000_000)
    candidate_idx = torch.argsort(ranks, descending=True)[:category_cap]

    # Min-max normalise the candidate ranks (epsilon guards against a zero range)
    candidate_ranks = ranks[candidate_idx]
    lowest = candidate_ranks.min()
    scaled_ranks = (candidate_ranks - lowest) / (candidate_ranks.max() - lowest + 1e-8)

    # Exponentially decaying weights, normalised to sum to 1
    weights = torch.exp(-rho * scaled_ranks)
    weights /= weights.sum()

    # Draw without replacement, never asking for more samples than candidates
    draw_count = min(max_samples, len(candidate_idx))
    picks = torch.multinomial(weights, num_samples=draw_count, replacement=False)

    # Translate positions within the candidate list back to positions in `ranks`
    return candidate_idx[picks].cpu()

In [61]:
# # ✅ Define file paths for saving preprocessed data
# DATA_SPLITTING_DIR = "/kaggle/working/data_splitting"
# DATA_SPLITTING_ZIP_FILE = "/kaggle/working/data_splitting.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(DATA_SPLITTING_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/data-splitting/`)
# Directory that is searched for the cached train/val/test split pickles.
DATA_SPLITTING_DIR = "/kaggle/input/data-splitting"

In [62]:
# ✅ Load the cached train/val/test split when it exists; otherwise build it from the transactions.
# NOTE: pickle.load can execute arbitrary code — only load split files that you produced yourself.

# File stems of the nine split artifacts, in the order they are loaded and saved
split_names = [
    "train_data", "val_data", "test_data",
    "train_customers", "val_customers", "test_customers",
    "train_articles", "val_articles", "test_articles",
]

if os.path.exists(os.path.join(DATA_SPLITTING_DIR, "train_data.pkl")):
    print("✅ train_data data already exists. Loading...")

    loaded_splits = []
    for split_name in split_names:
        with open(os.path.join(DATA_SPLITTING_DIR, split_name + ".pkl"), "rb") as f:
            loaded_splits.append(pickle.load(f))

    (train_data, val_data, test_data,
     train_customers, val_customers, test_customers,
     train_articles, val_articles, test_articles) = loaded_splits

    print("✅ SPLIT data loaded successfully!")

else:
    print("⚡ Extracting SPLIT for the first time...")

    # The cache was not found under the Kaggle input dataset, which cannot be written to, and
    # DATA_SPLITTING_ZIP_FILE is otherwise only defined in commented-out configuration.
    # Switch to the writable working directory so the results below can actually be saved.
    DATA_SPLITTING_DIR = "/kaggle/working/data_splitting"
    DATA_SPLITTING_ZIP_FILE = "/kaggle/working/data_splitting.zip"  # Final zipped archive
    os.makedirs(DATA_SPLITTING_DIR, exist_ok=True)

    ### Step 3: Convert transaction date to datetime
    transactions_filtered['t_dat'] = pd.to_datetime(transactions_filtered['t_dat'])

    # Sort transactions by date
    transactions_filtered = transactions_filtered.sort_values('t_dat', ascending=True)

    ### Step 4: Compute train, validation, and test cutoffs
    cutoffs = transactions_filtered['t_dat'].quantile([0.75, 0.875, 1.0]).values
    train_cutoff, val_cutoff, test_cutoff = cutoffs  # 75% Train, 12.5% Val, 12.5% Test

    ### Step 5: Assign Ranks Efficiently for GDS
    transactions_filtered["rank"] = transactions_filtered['t_dat'].rank(method="first", ascending=True)

    # Convert ranks to a tensor
    ranks_tensor = torch.tensor(transactions_filtered["rank"].to_numpy(dtype=np.float32), device=device)

    # Get sampled transactions. The sampled values are positions in ranks_tensor, i.e. positions
    # in the date-sorted frame, hence .iloc.
    sampled_indices = geometric_distributed_sampling(ranks_tensor, rho=0.5, max_samples=300000)
    transactions_filtered_sampled = transactions_filtered.iloc[sampled_indices.numpy()].reset_index(drop=True)

    ### Step 7: Train/Validation/Test Splitting
    train_mask = transactions_filtered_sampled['t_dat'] <= train_cutoff
    val_mask = (transactions_filtered_sampled['t_dat'] > train_cutoff) & (transactions_filtered_sampled['t_dat'] <= val_cutoff)
    test_mask = (transactions_filtered_sampled['t_dat'] > val_cutoff) & (transactions_filtered_sampled['t_dat'] <= test_cutoff)

    train_data = transactions_filtered_sampled[train_mask]
    val_data = transactions_filtered_sampled[val_mask]
    test_data = transactions_filtered_sampled[test_mask]

    ### Step 8: Cold Start Customer Handling
    transaction_customers = set(transactions_filtered['customer_id'])
    cold_start_customers = customers[~customers['customer_id'].isin(transaction_customers)]

    # Split cold start customers into train, validation, and test
    train_cold_start, temp_cold_start = train_test_split(cold_start_customers, test_size=0.25, random_state=42)
    val_cold_start, test_cold_start = train_test_split(temp_cold_start, test_size=0.5, random_state=42)

    # Merge cold-start data with train/val/test datasets
    # NOTE(review): this appends customer rows to transaction rows, so the appended rows have no
    # article_id / price / t_dat values — confirm downstream code expects that.
    train_data = pd.concat([train_data, train_cold_start])
    val_data = pd.concat([val_data, val_cold_start])
    test_data = pd.concat([test_data, test_cold_start])

    # Filter customers to only include those appearing in train/val/test sets
    train_customers = customers[customers['customer_id'].isin(train_data['customer_id'])]
    val_customers = customers[customers['customer_id'].isin(val_data['customer_id'])]
    test_customers = customers[customers['customer_id'].isin(test_data['customer_id'])]

    # Filter articles to only include those appearing in train/val/test sets
    train_articles = articles_with_images[articles_with_images['article_id'].isin(train_data['article_id'])]
    val_articles = articles_with_images[articles_with_images['article_id'].isin(val_data['article_id'])]
    test_articles = articles_with_images[articles_with_images['article_id'].isin(test_data['article_id'])]

    # ✅ Save preprocessed data locally (same order as split_names)
    split_objects = [
        train_data, val_data, test_data,
        train_customers, val_customers, test_customers,
        train_articles, val_articles, test_articles,
    ]
    for split_name, split_object in zip(split_names, split_objects):
        with open(os.path.join(DATA_SPLITTING_DIR, split_name + ".pkl"), "wb") as f:
            pickle.dump(split_object, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(DATA_SPLITTING_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(DATA_SPLITTING_DIR):
            zipf.write(os.path.join(DATA_SPLITTING_DIR, file), arcname=file)

    print(f"✅ data splitting completed! Saved as {DATA_SPLITTING_ZIP_FILE}")

✅ data splitting completed! Saved as /kaggle/working/data_splitting.zip


In [63]:
### Step 9: Create the final feature matrices

def filter_and_align_embeddings(data, articles, customers):
    """Stack price, text, image and customer embeddings of one split into a feature matrix.

    Rows of the module-level ``text_embeddings`` / ``image_embeddings`` are selected by the
    index labels of ``articles``, and rows of ``customer_embeddings`` by the index labels of
    ``customers``. Both frames must therefore still carry the row positions of the full tables
    the embeddings were computed from (i.e. they were filtered without ``reset_index``).

    Args:
        data: Transactions of the split; needs ``article_id``, ``customer_id`` and ``price``.
        articles: Article rows of the split, indexed as described above.
        customers: Customer rows of the split, indexed as described above.

    Returns:
        A float32 array with columns ``[price | text | image | customer]``, truncated to the
        shortest of the four parts.
        NOTE(review): article row i and customer row i are placed side by side purely by
        position — confirm that this pairing is what downstream code expects.
    """
    # Keep only the articles / customers that occur in `data`. The index is intentionally NOT
    # reset: resetting it would make the lookups below return the first n embedding rows
    # instead of the rows that belong to these articles / customers.
    valid_articles = articles[articles['article_id'].isin(data['article_id'])]
    valid_customers = customers[customers['customer_id'].isin(data['customer_id'])]

    # Plain integer lists work as row selectors for both NumPy arrays and torch tensors
    article_rows = valid_articles.index.tolist()
    customer_rows = valid_customers.index.tolist()

    # One price per valid article: the first price seen for that article in `data`
    price_values = (
        data.drop_duplicates(subset=["article_id"])  # Deduplicate articles
        .set_index("article_id")
        .loc[valid_articles["article_id"], "price"]
        .values.reshape(-1, 1).astype(np.float32)
    )

    # .tolist() normalises ndarray and tensor inputs (any dtype) into float32 arrays
    text_embeds = np.array(text_embeddings[article_rows].tolist(), dtype=np.float32)
    image_embeds = np.array(image_embeddings[article_rows].tolist(), dtype=np.float32)
    customer_embeds = np.array(customer_embeddings[customer_rows].tolist(), dtype=np.float32)

    # All parts must have the same row count before they can be stacked column-wise
    min_rows = min(len(price_values), len(text_embeds), len(image_embeds), len(customer_embeds))
    return np.hstack([
        price_values[:min_rows],
        text_embeds[:min_rows],
        image_embeds[:min_rows],
        customer_embeds[:min_rows]
    ])

In [64]:
# # ✅ Define file paths for saving preprocessed data
# FEATURES_DIR = "/kaggle/working/features"
# FEATURES_ZIP_FILE = "/kaggle/working/features.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(FEATURES_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/features/`)
# Directory that is searched for the cached train/val/test feature pickles.
FEATURES_DIR = "/kaggle/input/features"

In [65]:
# ✅ Load the cached feature matrices when they exist; otherwise build them from the splits.
# NOTE: pickle.load can execute arbitrary code — only load artifacts that you produced yourself.
if os.path.exists(os.path.join(FEATURES_DIR, "train_features.pkl")):
    print("✅ features are already exists. Loading...")

    with open(os.path.join(FEATURES_DIR, "train_features.pkl"), "rb") as f:
        train_features = pickle.load(f)

    with open(os.path.join(FEATURES_DIR, "val_features.pkl"), "rb") as f:
        val_features = pickle.load(f)

    with open(os.path.join(FEATURES_DIR, "test_features.pkl"), "rb") as f:
        test_features = pickle.load(f)

    print("✅ features loaded successfully!")

else:
    print("⚡ Extracting features for the first time...")

    # The cache was not found under the Kaggle input dataset, which cannot be written to, and
    # FEATURES_ZIP_FILE is otherwise only defined in commented-out configuration. Switch to the
    # writable working directory so the results below can actually be saved.
    FEATURES_DIR = "/kaggle/working/features"
    FEATURES_ZIP_FILE = "/kaggle/working/features.zip"  # Final zipped archive
    os.makedirs(FEATURES_DIR, exist_ok=True)

    # ✅ Align and stack features properly
    train_features = filter_and_align_embeddings(train_data, train_articles, train_customers)
    val_features = filter_and_align_embeddings(val_data, val_articles, val_customers)
    test_features = filter_and_align_embeddings(test_data, test_articles, test_customers)

    # ✅ Save preprocessed data locally
    with open(os.path.join(FEATURES_DIR, "train_features.pkl"), "wb") as f:
        pickle.dump(train_features, f)

    with open(os.path.join(FEATURES_DIR, "val_features.pkl"), "wb") as f:
        pickle.dump(val_features, f)

    with open(os.path.join(FEATURES_DIR, "test_features.pkl"), "wb") as f:
        pickle.dump(test_features, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(FEATURES_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(FEATURES_DIR):
            zipf.write(os.path.join(FEATURES_DIR, file), arcname=file)

    print(f"✅ features completed! Saved as {FEATURES_ZIP_FILE}")


⚡ Extracting features for the first time...
✅ features completed! Saved as /kaggle/working/features.zip


In [None]:
# ✅ Extract ZIP only if the folder is empty
if not os.path.exists(os.path.join(DATA_SPLITTING_DIR, "train_data.pkl")):
    if os.path.exists(DATA_SPLITTING_ZIP_FILE):
        print("✅ Extracting SPLIT data from ZIP...")
        with zipfile.ZipFile(DATA_SPLITTING_ZIP_FILE, 'r') as zip_ref:
            zip_ref.extractall(DATA_SPLITTING_DIR)
        print("✅ SPLIT data loaded successfully!")

# ✅ Load existing preprocessed files
def load_pkl(filepath):
    """Read a single pickled object from `filepath` and return it.

    Note: unpickling runs code embedded in the file, so this should only be
    pointed at files produced by this notebook.
    """
    with open(filepath, "rb") as stream:
        payload = pickle.load(stream)
    return payload

# Load the three training pickles, in the same order as before:
# transactions, customers, articles.
train_data, train_customers, train_articles = [
    load_pkl(os.path.join(DATA_SPLITTING_DIR, pkl_name))
    for pkl_name in ("train_data.pkl", "train_customers.pkl", "train_articles.pkl")
]

print("✅ Splitted data loaded. Updating positive indices...")

In [None]:
# True only when every positive-index column is already present on its frame;
# the next cell uses this flag to decide whether to recompute them.
positive_indices_exist = all(
    column in frame.columns
    for frame, column in (
        (train_articles, "positive_indices"),
        (train_customers, "positive_indices"),
        (train_customers, "customer_to_product_positive_indices"),
    )
)



In [None]:
if positive_indices_exist:
    print("✅ Positive indices already exist. Skipping computation!")
else:
    print("⚡ Positive indices not found! Generating...")

    # Article features are the text and image embeddings stacked column-wise;
    # both embedding matrices are placed on `device`.
    article_embedding_matrix = torch.tensor(np.hstack([text_embeddings, image_embeddings]), dtype=torch.float32).to(device)
    customer_embedding_matrix = torch.tensor(customer_embeddings, dtype=torch.float32).to(device)

    # id -> row position in the corresponding training frame
    article_to_index = {article_id: idx for idx, article_id in enumerate(train_articles['article_id'])}
    customer_to_index = {customer_id: idx for idx, customer_id in enumerate(train_customers['customer_id'])}

    # Build every lookup once. Filtering the whole frame inside the per-row
    # function made the cell O(rows x transactions); grouping keeps the same
    # results (first-appearance order is preserved within each group).
    articles_by_customer = train_data.groupby('customer_id')['article_id'].unique().to_dict()
    customers_by_article = train_data.groupby('article_id')['customer_id'].unique().to_dict()
    articles_by_group = train_articles.groupby(['product_group_name', 'colour_group_code']).groups

    def get_positive_indices(idx, product_group_name, colour_group_code, entity_id, embedding_matrix, entity_type):
        """Return at most 10 positive indices for one entity.

        `idx` and `embedding_matrix` are accepted for call compatibility but
        are not used.

        entity_type:
            "customer"            -> `article_to_index` positions of the articles
                                     bought by customer `entity_id`.
            "product"             -> index labels of `train_articles` rows sharing
                                     the given product group and colour group.
            "customer_to_product" -> `customer_to_index` positions of the customers
                                     who bought article `entity_id`.
        Any other value yields an empty list.
        """
        if entity_type == "customer":
            purchased_articles = articles_by_customer.get(entity_id, ())
            return [article_to_index[aid] for aid in purchased_articles if aid in article_to_index][:10]

        if entity_type == "product":
            # NOTE(review): these are index *labels* of train_articles, which equal
            # the positions used by `article_to_index` only if the frame has a
            # default RangeIndex — confirm.
            group_labels = articles_by_group.get((product_group_name, colour_group_code))
            return [] if group_labels is None else group_labels.tolist()[:10]

        if entity_type == "customer_to_product":
            purchasing_customers = customers_by_article.get(entity_id, ())
            return [customer_to_index[cid] for cid in purchasing_customers if cid in customer_to_index][:10]

        return []

    # (A stray "/" line that made this cell a SyntaxError was removed here.)

    # Positives for every article: same product group and colour group.
    articles_product_ids = train_articles['article_id'].values
    product_positive_indices = [
        get_positive_indices(idx, row['product_group_name'], row['colour_group_code'], row['article_id'], article_embedding_matrix, "product")
        for idx, row in train_articles.iterrows()
    ]
    train_articles['positive_indices'] = product_positive_indices

    # Positives for every customer: the articles that customer purchased.
    customer_positive_indices = [
        get_positive_indices(idx, None, None, row['customer_id'], customer_embedding_matrix, "customer")
        for idx, row in train_customers.iterrows()
    ]
    train_customers['positive_indices'] = customer_positive_indices

    # NOTE(review): the "customer_to_product" branch looks `entity_id` up as an
    # article id, but a customer id is passed here, so these lists come back
    # empty unless the two id spaces overlap. Decide which id should be passed.
    customer_to_product_positive_indices = [
        get_positive_indices(idx, None, None, row['customer_id'], customer_embedding_matrix, "customer_to_product")
        for idx, row in train_customers.iterrows()
    ]
    train_customers['customer_to_product_positive_indices'] = customer_to_product_positive_indices

    print("✅ Positive indices generation complete and moved to GPU where possible!")

    def save_pkl(data, filepath):
        """Pickle `data` to `filepath`, overwriting any existing file."""
        with open(filepath, "wb") as f:
            pickle.dump(data, f)

    # Write the frames (now carrying the new columns) back over the split pickles.
    save_pkl(train_data, os.path.join(DATA_SPLITTING_DIR, "train_data.pkl"))
    save_pkl(train_customers, os.path.join(DATA_SPLITTING_DIR, "train_customers.pkl"))
    save_pkl(train_articles, os.path.join(DATA_SPLITTING_DIR, "train_articles.pkl"))

    # Append mode keeps the other archive members. Members that already exist
    # under the same name are added again rather than replaced.
    with zipfile.ZipFile(DATA_SPLITTING_ZIP_FILE, 'a') as zipf:
        zipf.write(os.path.join(DATA_SPLITTING_DIR, "train_data.pkl"), arcname="train_data.pkl")
        zipf.write(os.path.join(DATA_SPLITTING_DIR, "train_customers.pkl"), arcname="train_customers.pkl")
        zipf.write(os.path.join(DATA_SPLITTING_DIR, "train_articles.pkl"), arcname="train_articles.pkl")

    print(f"✅ Updated data saved and zipped at {DATA_SPLITTING_ZIP_FILE}")


In [None]:
# Check if CUDA is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Row counts of the training frames, plus a 0..n-1 index tensor for each on `device`.
total_products, total_customers = len(train_articles), len(train_customers)

product_indices = torch.arange(0, total_products, device=device)
customer_indices = torch.arange(0, total_customers, device=device)

# Negative sampling: first-available indices that are not positives
def generate_hard_negatives(total_items, positive_sets, entity_type, num_samples=10):
    """Pick, for every item, the lowest-numbered indices that are not positives.

    For item ``i`` the candidates are ``0 .. total_items - 1`` in ascending
    order; every index contained in ``positive_sets[i]`` is skipped and the
    first ``num_samples`` survivors are kept. The selection is deterministic;
    no similarity information is used.

    The candidate pool is ``range(total_items)``. At every call site in this
    notebook that equals the pool previously read from the module-level
    ``product_indices`` / ``customer_indices`` tensors, so results are
    unchanged, but the function no longer depends on those globals or on
    ``device``. Each item now costs O(num_samples + len(positives)) instead
    of a mask over the whole pool.

    Args:
        total_items: number of items to produce negatives for (also the size
            of the candidate pool).
        positive_sets: pandas Series (read with ``.iloc``) or plain sequence
            of iterables of positive indices. Items at positions beyond its
            length are treated as having no positives.
        entity_type: "product", "customer" or "customer_to_product".
        num_samples: number of negatives per item.

    Returns:
        ``numpy.ndarray`` of shape ``(total_items, num_samples)``, dtype
        int64; unfilled slots hold ``-1``.

    Raises:
        ValueError: if ``entity_type`` is not one of the supported values.
    """
    if entity_type not in ("product", "customer", "customer_to_product"):
        raise ValueError(f"Unsupported entity_type: {entity_type!r}")

    negative_indices = np.full((total_items, num_samples), -1, dtype=np.int64)
    reads_by_position = hasattr(positive_sets, "iloc")
    known_items = len(positive_sets)

    for i in range(total_items):
        if i < known_items:
            excluded = set(positive_sets.iloc[i] if reads_by_position else positive_sets[i])
        else:
            excluded = set()

        filled = 0
        for candidate in range(total_items):
            if filled == num_samples:
                break  # row is complete; no need to scan the rest of the pool
            if candidate in excluded:
                continue
            negative_indices[i, filled] = candidate
            filled += 1

    return negative_indices

# Negatives for articles: exclude each article's own positive set.
positive_sets_products = train_articles['positive_indices'].apply(set)
product_negative_rows = generate_hard_negatives(
    total_products, positive_sets_products, entity_type="product"
)
train_articles['negative_indices'] = list(product_negative_rows)

# Negatives for customers: exclude each customer's positive set.
positive_sets_customers = train_customers['positive_indices'].apply(set)
customer_negative_rows = generate_hard_negatives(
    total_customers, positive_sets_customers, entity_type="customer"
)
train_customers['negative_indices'] = list(customer_negative_rows)

# Negatives for the customer-to-product relation stored on the customer frame.
positive_sets_customer_to_product = train_customers['customer_to_product_positive_indices'].apply(set)
customer_to_product_negative_rows = generate_hard_negatives(
    total_customers, positive_sets_customer_to_product, entity_type="customer_to_product"
)
train_customers['negative_customer_to_product_indices'] = list(customer_to_product_negative_rows)

print("✅ Hard negative sampling moved to GPU for **faster execution** 🚀")


In [None]:
class GCNLayer(nn.Module):
    """One graph-convolution step: normalised neighbour sum, linear map, ReLU."""

    def __init__(self, input_dim, output_dim):
        """Create the layer's single linear projection ``input_dim -> output_dim``."""
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, features, edge_index, edge_type=None):
        """Aggregate neighbour features along ``edge_index`` and project them.

        Args:
            features: node feature matrix; row ``n`` belongs to node ``n``.
            edge_index: two rows ``(row, col)``; for each edge the features of
                ``col`` are added into ``row``.
            edge_type: accepted for call compatibility; the aggregation is the
                same for every edge type, so the value is not used.

        Returns:
            ``relu(linear(aggregated))`` with one row per node.
        """
        row, col = edge_index

        # Degree = number of edges whose first endpoint is the node. Nodes that
        # never appear in `row` are clamped to 1 so they act as neutral in the
        # normalisation; the previous "+ 1e-6" gave them a degree of 1e-6, which
        # scaled every message they sent by roughly 1e3.
        degree = torch.bincount(row, minlength=features.size(0)).float().clamp(min=1.0)
        norm = 1.0 / torch.sqrt(degree[row] * degree[col])

        # Sum the normalised neighbour features into each receiving node.
        agg_features = torch.zeros_like(features)
        agg_features.index_add_(0, row, features[col] * norm.view(-1, 1))

        return F.relu(self.linear(agg_features))

In [None]:
class C_STAR(nn.Module):
    """Linear embedding layer followed by a stack of ``GCNLayer`` modules."""

    def __init__(self, input_dim, embedding_dim, edge_index, edge_types, num_nodes, num_layers, dropout):
        """Build the embedding layer and ``num_layers`` graph layers.

        Args:
            input_dim: width of the raw node features.
            embedding_dim: width of the embeddings produced by every layer.
            edge_index: stored on ``self.edges``; ``forward`` uses the edge
                index passed to it, not this one.
            edge_types: sequence of edge-type labels, paired one-to-one with
                the graph layers in ``forward``.
            num_nodes: stored on ``self.num_nodes``.
            num_layers: number of ``GCNLayer`` modules to create.
            dropout: dropout probability applied after the embedding layer and
                after every graph layer.
        """
        super(C_STAR, self).__init__()

        self.embedding_layer = nn.Sequential(
            nn.Linear(input_dim, embedding_dim),
            nn.ReLU()
        )

        self.gcn_layers = nn.ModuleList([GCNLayer(embedding_dim, embedding_dim) for _ in range(num_layers)])
        self.dropout = dropout
        self.num_nodes = num_nodes
        self.edges = edge_index
        self.edge_types = edge_types  # one label per graph layer, consumed in forward()
        self.reference_embedding = nn.Parameter(torch.randn(embedding_dim))  # randomly initialised, trainable

    def forward(self, features, edge_index, edge_type=None):
        """Embed ``features`` and propagate them over ``edge_index``.

        Args:
            features: raw node features, one row per node.
            edge_index: edges used by every graph layer in this pass.
            edge_type: optional and unused. Each layer receives its label from
                ``self.edge_types`` (previously the loop variable overwrote
                this argument, so its value never had any effect). It is
                optional so that ``model(features, edge_index)`` is valid.

        Returns:
            Node embeddings after the last graph layer that was applied.
        """
        embeddings = self.embedding_layer(features)
        embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)

        # zip() stops at the shorter sequence: layers beyond len(self.edge_types)
        # are not applied.
        for gcn_layer, layer_edge_type in zip(self.gcn_layers, self.edge_types):
            embeddings = gcn_layer(embeddings, edge_index, layer_edge_type)
            embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)

        return embeddings

In [None]:
def inter_trajectory_loss(embeddings, reference_embedding):
    """Inter-trajectory loss between `embeddings` and `reference_embedding`.

    Thin wrapper: the value is whatever `sliced_wasserstein_distance` returns
    for the two arguments, passed in this order.
    """
    distance = sliced_wasserstein_distance(embeddings, reference_embedding)
    return distance



def intra_trajectory_loss(embeddings, positive_indices, negative_indices, batch_mapping):
    """Margin loss that pushes positive rows' squared norms above negative rows'.

    Args:
        embeddings: 2-D tensor of embeddings, one row per entity.
        positive_indices: list or tensor of positive indices.
        negative_indices: list or tensor of negative indices (``-1`` padding
            is tolerated and dropped).
        batch_mapping: dict translating the given indices into row positions
            of ``embeddings``; indices missing from it are dropped. An empty
            mapping (or ``None``) means the indices already are row positions.
            Previously an empty mapping dropped every index, which made the
            loss a constant 0 for the caller in this notebook.

    Returns:
        Scalar tensor: mean over every (positive, negative) pair of
        ``relu(neg_score + margin - pos_score)``, where a score is the row's
        dot product with itself and ``margin = 1 + 0.1 * std(embeddings)``.
        Returns a zero scalar when either side has no usable index.

    NOTE(review): the scores are self dot products (squared norms), not
    similarities to an anchor row; no anchor is available through this
    signature. Confirm this is the intended objective.
    """
    num_rows = embeddings.size(0)

    def _row_positions(indices):
        # as_tensor accepts lists and existing tensors alike (no copy warning);
        # flattening lets nested index lists through as well.
        flat = torch.as_tensor(indices, dtype=torch.long).reshape(-1).tolist()
        if batch_mapping:
            flat = [batch_mapping.get(idx, -1) for idx in flat]
        # Keep only positions that address a real row of `embeddings`.
        usable = [idx for idx in flat if 0 <= idx < num_rows]
        return torch.tensor(usable, dtype=torch.long, device=embeddings.device)

    positive_rows = _row_positions(positive_indices)
    negative_rows = _row_positions(negative_indices)

    # Nothing to compare: return a zero that can still take part in backward().
    if positive_rows.numel() == 0 or negative_rows.numel() == 0:
        return torch.tensor(0.0, requires_grad=True, device=embeddings.device)

    # Row-wise dot product of each selected embedding with itself.
    positive_scores = torch.einsum('ij,ij->i', embeddings[positive_rows], embeddings[positive_rows])
    negative_scores = torch.einsum('ij,ij->i', embeddings[negative_rows], embeddings[negative_rows])

    # Margin grows with the spread of the embedding values.
    margin = 1.0 + 0.1 * torch.std(embeddings)

    # Compare every positive with every negative (P x N grid) so that differing
    # counts on the two sides no longer raise a shape error.
    pairwise = negative_scores.unsqueeze(0) + margin - positive_scores.unsqueeze(1)
    loss = F.relu(pairwise).mean()

    return loss



In [None]:
def compute_loss(embeddings, article_idx, pos_indices, neg_indices, model, edge_type):
    """Sum of the inter- and intra-trajectory losses for one anchor row.

    The inter term compares row `article_idx` of `embeddings` (kept 2-D via
    unsqueeze) with `model.reference_embedding`. The intra term is computed
    over `pos_indices` / `neg_indices` with an empty dict as batch mapping.
    `edge_type` is accepted but not used.
    """
    anchor_row = embeddings[article_idx].unsqueeze(0)
    loss_terms = (
        inter_trajectory_loss(anchor_row, model.reference_embedding),
        intra_trajectory_loss(embeddings, pos_indices, neg_indices, {}),
    )
    return loss_terms[0] + loss_terms[1]

In [None]:
# Cache locations for the graph artifacts: the ZIP archive and the working
# folder it is extracted into.
GRAPH_ZIP_FILE = "/kaggle/working/graph_data.zip"
GRAPH_DIR = "/kaggle/working/graph_data"

# Make sure the working folder is there; an existing folder is left untouched.
os.makedirs(name=GRAPH_DIR, exist_ok=True)


In [None]:
# Reuse the cached graph artifacts when the archive exists; otherwise rebuild
# the PR-Graph and its edge-index tensors and cache them.
if os.path.exists(GRAPH_ZIP_FILE):
    print("✅ Extracting PR-Graph and Edge Index data from ZIP...")

    # Extract the ZIP file
    with zipfile.ZipFile(GRAPH_ZIP_FILE, 'r') as zip_ref:
        zip_ref.extractall(GRAPH_DIR)

    # NOTE: unpickling runs code embedded in the file; only load archives that
    # this notebook produced.
    with open(os.path.join(GRAPH_DIR, "pr_graph.pkl"), "rb") as f:
        PR_graph = pickle.load(f)

    # Load the edge-index tensors
    edge_index = torch.load(os.path.join(GRAPH_DIR, "edge_index.pt"))
    product_to_product_edge_index = torch.load(os.path.join(GRAPH_DIR, "product_to_product_edge_index.pt"))
    customer_to_customer_edge_index = torch.load(os.path.join(GRAPH_DIR, "customer_to_customer_edge_index.pt"))
    customer_to_product_edge_index = torch.load(os.path.join(GRAPH_DIR, "customer_to_product_edge_index.pt"))

    print("✅ PR-Graph and Edge Index loaded successfully!")

else:
    print("⚡ Recomputing PR-Graph and Edge Index...")

    PR_graph = nx.Graph()

    # Customer ids in the order used to label `customer_embeddings` rows.
    customer_ids = list(customers['customer_id'])

    # Step 1: customer nodes, each carrying its embedding.
    # The id for row i used to be read from train_data (one row per purchase),
    # which disagreed with the `customers`-based labelling used for the same
    # embeddings in Step 4. Both steps now use `customer_ids`.
    # NOTE(review): assumes customer_embeddings rows follow `customers` order — confirm.
    for customer_id, customer_embedding in zip(customer_ids, customer_embeddings):
        PR_graph.add_node(customer_id, embedding=customer_embedding)

    # Step 2: product nodes (no embedding attribute).
    for article_id in articles_with_images['article_id']:
        PR_graph.add_node(article_id)

    # Step 3: customer-to-product edges from the purchase history.
    customer_article_pairs = train_data[['customer_id', 'article_id']].values
    PR_graph.add_edges_from(customer_article_pairs)

    # Step 4: customer-to-customer edges for pairs whose distance is below the threshold.
    SWD_EDGE_THRESHOLD = 0.5  # adjust as needed
    customer_embeddings_tensor = torch.tensor(customer_embeddings, dtype=torch.float32)
    for i in range(len(customer_embeddings)):
        for j in range(i + 1, len(customer_embeddings)):
            swd_dist = sliced_wasserstein_distance(
                customer_embeddings_tensor[i], customer_embeddings_tensor[j]
            )
            if swd_dist.item() < SWD_EDGE_THRESHOLD:
                PR_graph.add_edge(customer_ids[i], customer_ids[j])

    # Step 5: product-to-product edges between articles bought by the same customer.
    co_purchase_pairs = train_data.groupby('customer_id')['article_id'].apply(list)
    for articles_list in co_purchase_pairs:
        for i in range(len(articles_list)):
            for j in range(i + 1, len(articles_list)):
                PR_graph.add_edge(articles_list[i], articles_list[j])

    print(f"📌 PR-Graph Constructed: {PR_graph.number_of_nodes()} nodes, {PR_graph.number_of_edges()} edges")

    # Step 6: index mappings.
    # `article_to_index` enumerates *every* graph node (customers included), so
    # it is the node numbering used for all edge tensors below.
    # `customer_to_index` is kept as before and is used to tell customers apart.
    article_to_index = {article_id: idx for idx, article_id in enumerate(PR_graph.nodes)}
    customer_to_index = {customer_id: idx for idx, customer_id in enumerate(customers['customer_id'])}

    # Step 7: split the edges by endpoint type.
    # Testing membership in `article_to_index` first matched every edge (it
    # holds all nodes), leaving the two customer lists empty. Endpoints are now
    # classified by whether they are customers. All pairs use graph-node
    # indices, so the combined `edge_index` is the same as before.
    edges_indices = []
    product_to_product_edges = []
    customer_to_product_edges = []
    customer_to_customer_edges = []

    for u, v in PR_graph.edges():
        pair = (article_to_index[u], article_to_index[v])
        edges_indices.append(pair)

        u_is_customer = u in customer_to_index
        v_is_customer = v in customer_to_index
        if u_is_customer and v_is_customer:
            customer_to_customer_edges.append(pair)
        elif u_is_customer:
            customer_to_product_edges.append(pair)
        elif v_is_customer:
            # Store the customer endpoint first regardless of edge orientation.
            customer_to_product_edges.append((pair[1], pair[0]))
        else:
            product_to_product_edges.append(pair)

    def _as_edge_index(pairs):
        """Convert a list of (source, target) pairs to a (2, E) long tensor; E may be 0."""
        if not pairs:
            return torch.empty((2, 0), dtype=torch.long)
        return torch.tensor(pairs, dtype=torch.long).t().contiguous()

    # Combined edge index over all edges
    edge_index = _as_edge_index(edges_indices)

    # Step 8: add self-loops over all graph nodes
    edge_index, _ = add_self_loops(edge_index, num_nodes=len(PR_graph.nodes))

    # Ensure edge_index dimensions are correct
    assert edge_index.size(0) == 2, "Edge index should have two rows: source and target nodes."
    assert edge_index.size(1) > 0, "Edge index should have at least one edge."

    print("PR Graph successfully constructed!")

    # Per-type edge indices (no self-loops added to these)
    product_to_product_edge_index = _as_edge_index(product_to_product_edges)
    customer_to_product_edge_index = _as_edge_index(customer_to_product_edges)
    customer_to_customer_edge_index = _as_edge_index(customer_to_customer_edges)

    print(edge_index.shape)

    # Save the graph and the edge-index tensors locally
    with open(os.path.join(GRAPH_DIR, "pr_graph.pkl"), "wb") as f:
        pickle.dump(PR_graph, f)

    torch.save(edge_index, os.path.join(GRAPH_DIR, "edge_index.pt"))
    torch.save(product_to_product_edge_index, os.path.join(GRAPH_DIR, "product_to_product_edge_index.pt"))
    torch.save(customer_to_product_edge_index, os.path.join(GRAPH_DIR, "customer_to_product_edge_index.pt"))
    torch.save(customer_to_customer_edge_index, os.path.join(GRAPH_DIR, "customer_to_customer_edge_index.pt"))

    # Archive every file currently in GRAPH_DIR for later runs
    with zipfile.ZipFile(GRAPH_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(GRAPH_DIR):
            zipf.write(os.path.join(GRAPH_DIR, file), arcname=file)

    print(f"✅ PR-Graph and Edge Index processing completed! Saved as {GRAPH_ZIP_FILE}")

In [None]:
# Default location of the rolling training checkpoint
CHECKPOINT_FILE = "/kaggle/working/checkpoint.pth"


def save_checkpoint(model, optimizer, epoch, filename=CHECKPOINT_FILE):
    """Write `epoch` plus the model and optimizer state dicts to `filename`."""
    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
        ),
        filename,
    )
    print(f"✅ Model checkpoint saved at epoch {epoch}")

# ✅ Load model checkpoint
def load_checkpoint(filename, model, optimizer=None):
    """Restore training state from `filename` and return the epoch to resume at.

    Returns 0 when the file does not exist. Otherwise the model state (and the
    optimizer state, when an optimizer is supplied) is loaded with tensors
    mapped onto `device`, and the stored epoch plus one is returned.
    """
    if not os.path.exists(filename):
        print("⚡ No checkpoint found. Starting from scratch.")
        return 0  # nothing saved yet: begin at epoch 0

    print("✅ Loading checkpoint...")
    checkpoint = torch.load(filename, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"✅ Resuming training from epoch {checkpoint['epoch'] + 1}")
    # The stored epoch has finished, so training continues with the next one.
    return checkpoint['epoch'] + 1

In [None]:
# Use a smaller dataset for quick testing. The sample size is capped at the
# number of rows available, because DataFrame.sample raises when asked for more
# rows than the frame holds (without replacement).
train_data = train_data.sample(n=min(10000, len(train_data)), random_state=42)
val_data = val_data.sample(n=min(2000, len(val_data)), random_state=42)

In [None]:
# ✅ Training Loop with Early Stopping + GPU Optimization
def _row_index_tensor(row, column):
    """Return the index list stored in ``row[column]`` as a long tensor on ``device``."""
    return torch.tensor(row[column], dtype=torch.long, device=device)


def _sum_interaction_losses(data, emb_product_to_product, emb_customer_to_product, emb_customer_to_customer, model):
    """Sum ``compute_loss`` over the usable rows of ``data`` for all three edge types.

    Rows whose article or customer id is missing from ``article_to_index`` /
    ``customer_to_index`` are skipped.

    Returns:
        The summed loss tensor, or ``None`` when every row was skipped.
    """
    total = None
    for _, row in data.iterrows():
        article_idx = article_to_index.get(row["article_id"], -1)
        customer_idx = customer_to_index.get(row["customer_id"], -1)

        if article_idx == -1 or customer_idx == -1:
            continue  # Skip invalid entries

        row_loss = (
            compute_loss(
                emb_product_to_product, article_idx,
                _row_index_tensor(row, "positive_indices_product"),
                _row_index_tensor(row, "negative_indices_product"),
                model, "product_to_product",
            )
            + compute_loss(
                emb_customer_to_product, customer_idx,
                _row_index_tensor(row, "positive_indices_customer_product"),
                _row_index_tensor(row, "negative_indices_customer_product"),
                model, "customer_to_product",
            )
            + compute_loss(
                emb_customer_to_customer, customer_idx,
                _row_index_tensor(row, "positive_indices_customer"),
                _row_index_tensor(row, "negative_indices_customer"),
                model, "customer_to_customer",
            )
        )
        # Out-of-place accumulation keeps the autograd graph intact.
        total = row_loss if total is None else total + row_loss
    return total


def train_full_batch_with_early_stopping(model, optimizer, num_epochs, patience, start_epoch):
    """Full-batch training with per-epoch validation, checkpoints and early stopping.

    Reads the notebook-level ``train_features``, ``val_features``,
    ``train_data``, ``val_data``, the three per-type edge-index tensors,
    ``device`` and ``dtype``.

    Args:
        model: network to train; moved to ``device``.
        optimizer: optimizer stepped through ``xm.optimizer_step``.
        num_epochs: exclusive upper bound of the epoch counter.
        patience: number of consecutive epochs without a validation
            improvement after which training stops and the best weights seen
            are restored.
        start_epoch: first epoch index to run (e.g. from ``load_checkpoint``).

    Returns:
        The model: with the best weights if early stopping triggered,
        otherwise with the weights of the last epoch run.
    """
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    model = model.to(device)

    train_features_tensor = torch.tensor(train_features, dtype=dtype).to(device)
    val_features_tensor = torch.tensor(val_features, dtype=dtype).to(device)

    product_to_product_edge_index_tensor = product_to_product_edge_index.to(device)
    customer_to_product_edge_index_tensor = customer_to_product_edge_index.to(device)
    customer_to_customer_edge_index_tensor = customer_to_customer_edge_index.to(device)

    # Report the distance to the reference embedding before any update.
    with torch.no_grad():
        initial_embeddings = model(train_features_tensor, product_to_product_edge_index_tensor)
        initial_swd = sliced_wasserstein_distance(initial_embeddings, model.reference_embedding).item()
        print(f"✅ Initial SWD: {initial_swd:.4f}")

    for epoch in range(start_epoch, num_epochs):
        model.train()
        optimizer.zero_grad()

        # One forward pass per edge type
        all_embeddings_product_to_product = model(train_features_tensor, product_to_product_edge_index_tensor)
        all_embeddings_customer_to_product = model(train_features_tensor, customer_to_product_edge_index_tensor)
        all_embeddings_customer_to_customer = model(train_features_tensor, customer_to_customer_edge_index_tensor)

        # The previous accumulator was a leaf tensor with requires_grad=True that
        # was updated with "+=", which autograd rejects; the helper sums
        # out-of-place instead.
        train_loss_acc = _sum_interaction_losses(
            train_data,
            all_embeddings_product_to_product,
            all_embeddings_customer_to_product,
            all_embeddings_customer_to_customer,
            model,
        )

        if train_loss_acc is None:
            # No usable row: nothing to backpropagate this epoch.
            avg_train_loss = 0.0
        else:
            train_loss_acc.backward()
            # Step through the XLA helper, then flush the pending graph.
            xm.optimizer_step(optimizer)
            xm.mark_step()
            avg_train_loss = train_loss_acc.item() / max(len(train_data), 1)

        # Validation phase
        model.eval()
        with torch.no_grad():
            val_embeddings_product_to_product = model(val_features_tensor, product_to_product_edge_index_tensor)
            val_embeddings_customer_to_product = model(val_features_tensor, customer_to_product_edge_index_tensor)
            val_embeddings_customer_to_customer = model(val_features_tensor, customer_to_customer_edge_index_tensor)

            total_val_loss = _sum_interaction_losses(
                val_data,
                val_embeddings_product_to_product,
                val_embeddings_customer_to_product,
                val_embeddings_customer_to_customer,
                model,
            )

        # Plain float, so comparisons and formatting below do not hold tensors.
        total_val_value = 0.0 if total_val_loss is None else total_val_loss.item()
        avg_val_loss = total_val_value / max(len(val_data), 1)

        print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Rolling checkpoint for resuming
        save_checkpoint(model, optimizer, epoch, filename=CHECKPOINT_FILE)

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # state_dict() hands back the live parameter tensors; clone them so
            # later updates cannot overwrite the snapshot.
            best_model_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            save_checkpoint(model, optimizer, epoch, filename='best_checkpoint.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("🚀 Early stopping triggered! Restoring best model...")
                # No snapshot exists if the validation loss never improved (e.g. NaN).
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                break

    return model


In [None]:
# Compute input dimensions
input_dim_text = text_embeddings.shape[1]  # Text feature dimension
# 'price' is a single numeric column, so it contributes one feature. The previous
# len(train_data['price']) returned the number of rows, not a feature width.
# NOTE(review): confirm the total below equals the column count of train_features.
input_dim_numeric = 1
input_dim_image = image_embeddings.shape[1]  # Image feature dimension
input_dim_customer = customer_embeddings.shape[1]  # Customer feature dimension

# Total input dimension
input_dim = input_dim_text + input_dim_image + input_dim_numeric + input_dim_customer

# Hyperparameters (small values for a quick smoke run)
num_projections = 50  # Number of projections for Sliced Wasserstein Distance (SWD)
weight_decay = 1e-5
learning_rate = 5e-3
num_epochs = 1        # was assigned twice (2, then 1); 1 is the value that took effect
patience = 10
embedding_dim = 32
num_layers = 2
dropout = 0.2

# `dropout` is a required constructor argument and was missing from this call.
model = C_STAR(input_dim=input_dim, embedding_dim=embedding_dim, edge_index=edge_index,
               edge_types=edge_types, num_nodes=len(PR_graph.nodes), num_layers=num_layers,
               dropout=dropout)
model = model.to(device)

# NOTE(review): `optim_xla` is not among the imports at the top of the notebook —
# confirm where it is defined.
optimizer = optim_xla.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
start_epoch = load_checkpoint(CHECKPOINT_FILE, model, optimizer)

# Train the model with early stopping
trained_model = train_full_batch_with_early_stopping(
    model=model,
    optimizer=optimizer,
    num_epochs=num_epochs,
    patience=patience,
    start_epoch=start_epoch
)

In [None]:
# Metrics Evaluation Functions
def recall_at_k(recommended_articles, relevant_articles, k=10):
    """Fraction of the distinct relevant articles found in the first `k` recommendations.

    Returns 0.0 when there are no relevant articles.
    """
    top_k = set(recommended_articles[:k])
    relevant = set(relevant_articles)
    if len(relevant) == 0:
        return 0.0
    hits = top_k & relevant
    return len(hits) / len(relevant)

def ndcg_at_k(recommended_articles, relevant_articles, k=10):
    """Binary-relevance NDCG over the first `k` recommendations.

    DCG adds ``1 / log2(rank + 2)`` for every zero-based rank below `k` whose
    recommended article is relevant. The ideal DCG assumes the first
    ``min(k, len(relevant_articles))`` ranks are all hits.

    Returns 0.0 when the ideal DCG is zero (no relevant articles, or k <= 0).
    """
    # Set membership is O(1) per rank; the unused `recommended_set` local was dropped.
    relevant_lookup = set(relevant_articles)

    dcg = 0.0
    for rank in range(min(k, len(recommended_articles))):
        if recommended_articles[rank] in relevant_lookup:
            dcg += 1 / np.log2(rank + 2)

    idcg = 0.0
    for rank in range(min(k, len(relevant_articles))):
        idcg += 1 / np.log2(rank + 2)

    return dcg / idcg if idcg > 0 else 0.0

In [None]:
# Full-batch evaluation with Recall@K and NDCG@K metrics using Sliced Wasserstein Distance (SWD)
def test_model_full_batch_with_metrics(model, k=10):
    """
    Evaluate a trained model on `test_data` and report loss, Recall@K and NDCG@K.

    Inputs other than `model` and `k` are read from notebook-level globals:
    `test_features`, `dtype`, `device`, `product_to_product_edge_index`,
    `customer_to_product_edge_index`, `customer_to_customer_edge_index`,
    `test_data`, `transactions_filtered`, `article_to_index`,
    `customer_to_index`, `articles_with_images` and `customers`, plus the
    helpers `sliced_wasserstein_distance`, `inter_trajectory_loss` and
    `intra_trajectory_loss`.

    For each test row whose customer and article both have an index, every row
    of the product-to-product embedding matrix is ranked by Sliced Wasserstein
    Distance (SWD) to the customer's customer-to-customer embedding (lower is
    better). The top-k are scored against all articles that customer has in
    `transactions_filtered`. Rows with an unknown customer or article are skipped.

    Args:
        model: Trained model, called as ``model(features, edge_index)``.
        k: Cut-off for the ranking metrics.

    Returns:
        Tuple ``(avg_test_loss, avg_recall_at_k, avg_ndcg_at_k)``, each averaged
        over the rows that were actually evaluated (0.0 if none were).
    """
    model.eval()
    total_test_loss = 0.0
    all_recall_at_k = []
    all_ndcg_at_k = []

    test_features_tensor = torch.tensor(test_features, dtype=dtype).to(device)

    # Position -> article id lookup, built once instead of once per recommended item.
    index_to_article = list(article_to_index.keys())

    with torch.no_grad():
        # Compute embeddings for each edge type separately
        test_embeddings_product_to_product = model(test_features_tensor, product_to_product_edge_index)
        test_embeddings_customer_to_product = model(test_features_tensor, customer_to_product_edge_index)
        test_embeddings_customer_to_customer = model(test_features_tensor, customer_to_customer_edge_index)

        num_candidates = test_embeddings_product_to_product.shape[0]

        for _, row in test_data.iterrows():
            customer_id = row["customer_id"]
            article_id = row["article_id"]

            # Map article and customer IDs to indices
            article_idx = article_to_index.get(article_id, -1)
            customer_idx = customer_to_index.get(customer_id, -1)

            if article_idx == -1 or customer_idx == -1:
                continue  # Skip invalid entries (before doing any per-row work)

            # Ground truth: every article this customer has in the filtered transactions
            customer_transactions = transactions_filtered[transactions_filtered['customer_id'] == customer_id]
            relevant_articles = customer_transactions['article_id'].unique()

            # Rank all candidates by SWD to the customer embedding (lower is better).
            # Only this customer-to-product ranking feeds the metrics, so the
            # product-to-product and customer-to-customer rankings are not computed.
            customer_embedding = test_embeddings_customer_to_customer[customer_idx].unsqueeze(0)
            swd_distances_customer_to_product = []
            for i in range(num_candidates):
                dist = sliced_wasserstein_distance(customer_embedding, test_embeddings_product_to_product[i].unsqueeze(0))
                swd_distances_customer_to_product.append((i, dist.item()))
            swd_distances_customer_to_product.sort(key=lambda x: x[1])

            top_indices_customer_to_product = [item[0] for item in swd_distances_customer_to_product[:k]]
            recommended_articles = [index_to_article[idx] for idx in top_indices_customer_to_product]

            # Compute Recall@K and NDCG@K
            all_recall_at_k.append(recall_at_k(recommended_articles, relevant_articles, k))
            all_ndcg_at_k.append(ndcg_at_k(recommended_articles, relevant_articles, k))

            # Retrieve the stored positive / negative indices for this article and customer
            article_rows = articles_with_images.loc[articles_with_images["article_id"] == article_id]
            customer_rows = customers.loc[customers["customer_id"] == customer_id]

            pos_indices_product = article_rows["positive_indices"].values[0]
            neg_indices_product = article_rows["negative_indices"].values[0]

            pos_indices_customer = customer_rows["positive_indices"].values[0]
            neg_indices_customer = customer_rows["negative_indices"].values[0]

            pos_indices_customer_product = customer_rows["customer_to_product_positive_indices"].values[0]
            # NOTE(review): negatives for the customer-to-product loss come from the
            # "negative_customers" column — confirm this is the intended column.
            neg_indices_customer_product = customer_rows["negative_customers"].values[0]

            # Compute Inter-Trajectory and Intra-Trajectory Loss for each relation
            inter_loss_product = inter_trajectory_loss(test_embeddings_product_to_product[article_idx].unsqueeze(0), model.reference_embedding)
            intra_loss_product = intra_trajectory_loss(test_embeddings_product_to_product, pos_indices_product, neg_indices_product, {})

            inter_loss_customer = inter_trajectory_loss(test_embeddings_customer_to_customer[customer_idx].unsqueeze(0), model.reference_embedding)
            intra_loss_customer = intra_trajectory_loss(test_embeddings_customer_to_customer, pos_indices_customer, neg_indices_customer, {})

            inter_loss_customer_product = inter_trajectory_loss(test_embeddings_customer_to_product[customer_idx].unsqueeze(0), model.reference_embedding)
            intra_loss_customer_product = intra_trajectory_loss(test_embeddings_customer_to_product, pos_indices_customer_product, neg_indices_customer_product, {})

            # Total Loss
            total_test_loss += (inter_loss_product + intra_loss_product +
                                inter_loss_customer + intra_loss_customer +
                                inter_loss_customer_product + intra_loss_customer_product).item()

    # Average over the rows that were evaluated: skipped rows add no loss, so
    # dividing by len(test_data) would understate it. Empty results give 0.0
    # rather than the NaN (plus RuntimeWarning) np.mean returns for an empty list.
    evaluated_rows = len(all_recall_at_k)
    avg_test_loss = total_test_loss / max(evaluated_rows, 1)
    avg_recall_at_k = np.mean(all_recall_at_k) if evaluated_rows else 0.0
    avg_ndcg_at_k = np.mean(all_ndcg_at_k) if evaluated_rows else 0.0

    # Print Metrics
    print(f"✅ Test Loss: {avg_test_loss:.4f}")
    print(f"✅ Recall@{k}: {avg_recall_at_k:.4f}")
    print(f"✅ NDCG@{k}: {avg_ndcg_at_k:.4f}")

    return avg_test_loss, avg_recall_at_k, avg_ndcg_at_k


In [None]:
# Example usage of testing
# The evaluation function accepts only `model` and `k`; passing `edge_index`
# raised "TypeError: unexpected keyword argument", so it is not passed here.
test_model_full_batch_with_metrics(
    model=trained_model,
    k=10
)

In [None]:
def find_similar_embeddings(target_embedding, reference_embeddings, top_n=10):
    """
    Rank reference embeddings by Sliced Wasserstein Distance (SWD) to a target embedding.

    A leading dimension is added to the target and to each reference row
    (via ``unsqueeze(0)``) before they are handed to ``sliced_wasserstein_distance``.

    Args:
        target_embedding: Query embedding.
        reference_embeddings: Indexable collection of embeddings; ``shape[0]`` is the row count.
        top_n: How many of the closest rows to return.

    Returns:
        List of row indices into ``reference_embeddings``, closest (smallest SWD) first.
    """
    scored_rows = [
        (
            row_idx,
            sliced_wasserstein_distance(
                target_embedding.unsqueeze(0), reference_embeddings[row_idx].unsqueeze(0)
            ).item(),
        )
        for row_idx in range(reference_embeddings.shape[0])
    ]

    # ✅ Lower SWD means more similar; sorted() is stable, so ties keep row order
    ranked_rows = sorted(scored_rows, key=lambda pair: pair[1])

    return [row_idx for row_idx, _ in ranked_rows[:top_n]]


In [None]:
def _show_recommendations(header, recommendations):
    """Print `header`, then each recommended product's name, description and image."""
    print(header)
    for _, row in recommendations.iterrows():
        print(f"🛍️ Product: {row['prod_name']}\n📜 Description: {row['text_data']}\n")
        display(Image.open(row['path']))
        print("\n")


def recommend_articles_any_input(
    model, edge_index, top_n=10, image_path=None, text_input=None, numeric_input=None, 
    age=None, active=None, club_status=None):
    """
    Multi-modal product recommendation function.

    Users can input any combination of image, text, and numeric data. If any of
    `age`, `active` or `club_status` is given, the request is treated as a
    (partial) cold-start customer instead: the most similar known customers are
    found and the products they bought most often are recommended.

    Relies on notebook-level globals: `transform`, `resnet_model`,
    `embeddings_index`, `scaler`, `scaler_age`, `categorical_features_encoded`,
    `customer_embeddings`, `customers`, `transactions_filtered`,
    `articles_with_images`, `test_features`, `dtype`, `device`, and the helper
    `find_similar_embeddings`.

    Args:
        model: Trained model used to produce embeddings.
        edge_index: Graph edges passed to the model when embedding all nodes.
        top_n: Number of recommendations (and of similar customers) to use.
        image_path: Optional path to a product image.
        text_input: Optional free-text description; whitespace-split words are
            looked up in `embeddings_index` and averaged.
        numeric_input: Optional sequence of numeric features, scaled with `scaler`.
        age, active, club_status: Optional customer attributes (cold-start path).

    Returns:
        DataFrame with the `article_id` and `prod_name` of the recommendations.

    Raises:
        ValueError: If no image, text, numeric or customer input is provided.
    """
    input_features = []

    # ✅ 1. Process Image Input (if provided)
    if image_path:
        # convert() returns a loaded copy, so the file handle can be closed right away
        with Image.open(image_path) as image_file:
            image = image_file.convert('RGB')
        image = transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = resnet_model(image).squeeze(0).cpu().numpy()
        input_features.append(image_embedding)

    # ✅ 2. Process Text Input (if provided): average the vectors of the known words
    if text_input:
        word_embeddings = np.zeros(100)
        valid_word_count = 0

        for word in text_input.split():
            if word in embeddings_index:
                word_embeddings += embeddings_index[word]
                valid_word_count += 1

        if valid_word_count > 0:
            word_embeddings /= valid_word_count
        # If no word is known, the all-zero vector is still appended.
        input_features.append(word_embeddings)

    # ✅ 3. Process Numeric Input (if provided)
    # Explicit None/size check so numpy arrays work too (their truth value is ambiguous).
    if numeric_input is not None and np.size(numeric_input) > 0:
        numeric_row = np.array(numeric_input).reshape(1, -1)
        scaled_numeric_input = scaler.transform(numeric_row)
        input_features.append(scaled_numeric_input.flatten())

    # ✅ 4. Handle Partial Cold Start (One or More Customer Attributes Given)
    if any(value is not None for value in (age, active, club_status)):
        print("🚀 Partial Cold Start Detected: Generating Customer Embedding...")

        # ✅ One-hot encode the categorical attributes, then align to the training
        # columns: categories missing here become 0, columns unseen in training are dropped.
        cold_start_input = pd.DataFrame({"Active": [active], "club_member_status": [club_status]})
        dummy_encoded = (
            pd.get_dummies(cold_start_input)
            .reindex(columns=categorical_features_encoded.columns, fill_value=0)
            .astype(np.float32)
            .values
        )

        # ✅ Normalize age if given (0 otherwise)
        normalized_age = np.zeros(1) if age is None else scaler_age.transform([[age]]).flatten()

        # ✅ Construct customer input feature: [age | one-hot categoricals]
        new_customer_features = np.hstack([normalized_age.reshape(-1, 1), dummy_encoded]).astype(np.float32)
        new_customer_features_tensor = torch.tensor(new_customer_features, dtype=torch.float32).to(device)

        # ✅ Generate embedding for the new customer
        # NOTE(review): the model is called without an edge_index here, unlike the
        # full-graph call further down — confirm the forward pass supports this.
        with torch.no_grad():
            new_customer_embedding = model(new_customer_features_tensor).squeeze(0).to(device)

        # ✅ Find top-N similar customers
        top_similar_customers = find_similar_embeddings(new_customer_embedding, customer_embeddings, top_n)
        similar_customer_ids = customers["customer_id"].iloc[top_similar_customers]

        # ✅ Retrieve the products most purchased by these similar customers
        similar_purchases = transactions_filtered[
            transactions_filtered["customer_id"].isin(similar_customer_ids)
        ]
        purchase_counts = similar_purchases.groupby("article_id").size().reset_index(name="purchase_count")
        top_purchased = purchase_counts.sort_values("purchase_count", ascending=False).head(top_n)
        recommended_articles = articles_with_images[articles_with_images["article_id"].isin(top_purchased["article_id"])]

        _show_recommendations("🔹 **Final Recommendations for Partial Cold-Start User** 🔹\n", recommended_articles)

        return recommended_articles[['article_id', 'prod_name']]

    # ✅ 5. If NOT a cold-start user, proceed with normal recommendations
    if not input_features:
        raise ValueError(
            "Provide at least one of image_path, text_input, numeric_input, "
            "age, active or club_status."
        )

    # Kept 1-D: find_similar_embeddings adds the batch dimension itself.
    # NOTE(review): the raw concatenated features are compared directly with model
    # embeddings — confirm their dimensionality matches the embedding space.
    input_tensor = torch.tensor(np.hstack(input_features), dtype=torch.float32).to(device)

    # ✅ Compute the full embedding space using the trained model
    test_features_tensor = torch.tensor(test_features, dtype=dtype).to(device)
    with torch.no_grad():
        all_embeddings = model(test_features_tensor, edge_index)

    # ✅ Find top-N similar product indices
    top_indices = find_similar_embeddings(input_tensor, all_embeddings, top_n)

    # ✅ Retrieve recommended products
    recommended_articles = articles_with_images.iloc[top_indices][['article_id', 'prod_name', 'text_data', 'path']]

    _show_recommendations("🔹 **Final Recommendations Based on Multi-Modal Input** 🔹\n", recommended_articles)

    return recommended_articles[['article_id', 'prod_name']]


In [None]:
# Image Input Recommendation
# Top-5 recommendations from a single product image; no text, numeric or customer
# attributes are supplied, so the multi-modal (non cold-start) path is taken.
# NOTE(review): the image path is an absolute Kaggle input path.
recommend_articles_any_input(
    model=trained_model,
    edge_index=edge_index,
    top_n=5,
    image_path="/kaggle/input/h-and-m-personalized-fashion-recommendations/images/079/0797892010.jpg"
)

In [None]:
# Text Input Recommendation
# Top-5 recommendations from a free-text description only; the function splits the
# text on whitespace and looks each word up in `embeddings_index`.
recommend_articles_any_input(
    model=trained_model,
    edge_index=edge_index,
    top_n=5,
    text_input="Floral summer dress with short sleeves"
)

In [None]:
# Numeric Input Recommendation
# Top-5 recommendations from numeric attributes only; the values are scaled with the
# notebook's fitted `scaler` inside the function.
# NOTE(review): this example passes 6 values while the combined example below passes 3
# through the same `scaler` — confirm how many features the scaler was fitted on.
numeric_input_example = [22, 5, 14, 8, 1, 3]  # Example input for product_type_no, colour_group_code, etc.
recommend_articles_any_input(
    model=trained_model,
    edge_index=edge_index,
    top_n=5,
    numeric_input=numeric_input_example
)

In [None]:
# Combined Multi-modal Recommendation
# Image, text and numeric inputs together; the function concatenates the three
# feature vectors (in that order) into a single query.
# NOTE(review): `numeric_input` has 3 values here but 6 in the numeric-only example;
# both go through the same `scaler` — confirm the expected feature count.
recommend_articles_any_input(
    model=trained_model,
    edge_index=edge_index,
    top_n=5,
    image_path="/kaggle/input/h-and-m-personalized-fashion-recommendations/images/072/0720572001.jpg",
    text_input="Black mini dress",
    numeric_input=[30,40,50]
)


In [None]:
# Partial cold-start recommendation: only `age` is given, so the function takes its
# customer-attribute branch (similar customers -> their most purchased products)
# and leaves `active` / `club_status` as None.
recommend_articles_any_input(
    model=trained_model,
    edge_index=edge_index,
    top_n=5,
    age=30
)
