In [47]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import cudf  # RAPIDS cuDF (GPU-accelerated Pandas)
import cupy as cp  # GPU-accelerated NumPy
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
import torch.nn as nn
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn.functional as F
import faiss
from torch_geometric.data import HeteroData
from torch_geometric.utils import add_self_loops
import pickle
import zipfile


In [None]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Adjust article ID format
def adjust_id(x):
    x = str(x)
    return "0" + x if len(x) == 9 else x

In [None]:
# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
PREPROCESSED_DIR = "/kaggle/input/pre-processed-data-3"

# # ✅ Define file paths for saving preprocessed data
# PREPROCESSED_DIR = "/kaggle/working/preprocessed-data"
# ZIP_FILE = "/kaggle/working/preprocessed-data.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(PREPROCESSED_DIR, exist_ok=True)


In [None]:
# ✅ Extract ZIP only if the preprocessed files don't already exist
if os.path.exists(os.path.join(PREPROCESSED_DIR, "articles_with_images.pkl")):
    print("✅ Preprocessed data already exists. Loading...")

    with open(os.path.join(PREPROCESSED_DIR, "articles_with_images.pkl"), "rb") as f:
        articles_with_images = pickle.load(f)

    with open(os.path.join(PREPROCESSED_DIR, "transactions_filtered.pkl"), "rb") as f:
        transactions_filtered = pickle.load(f)

    with open(os.path.join(PREPROCESSED_DIR, "customers_processed.pkl"), "rb") as f:
        customers = pickle.load(f)

    print("✅ Preprocessed data loaded successfully!")

else :
    print("✅ Preprocessed data not found")

    articles = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv')
    customers = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv')
    transactions = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
    
    # Get all paths from the image folder
    all_image_paths = glob.glob("/kaggle/input/h-and-m-personalized-fashion-recommendations/images/*/*")
    
    # Adjust the article ID and product code to be string & add "0"
    articles["article_id"] = articles["article_id"].apply(lambda x: adjust_id(x))
    articles["product_code"] = articles["article_id"].apply(lambda x: x[:3])
    
    # Get all valid article IDs and create a set to store image IDs
    all_image_ids = set()
    
    for path in tqdm(all_image_paths, desc="Processing Images"):
        article_id = os.path.basename(path).split('.')[0]  # Extract image ID from filename
        all_image_ids.add(article_id)
    
    
    # Construct full image paths and filter invalid ones
    images_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
    articles["path"] = articles["article_id"].apply(
        lambda x: images_path + x[:3] + "/" + x + ".jpg" if x in all_image_ids else None
    )
    
    # ✅ Keep only articles with valid images
    articles_with_images = articles.dropna(subset=["path"]).reset_index(drop=True)
    
    # Adjust the article ID and product code to be string & add "0"
    
    articles_with_images["article_id"] = articles_with_images["article_id"].astype("category")
    articles_with_images["product_code"] = articles_with_images["product_code"].astype("category")
    
    # Fill missing values
    customers.fillna({
        "FN": 0,
        "Active": 0,
        "club_member_status": "UNKNOWN",
        "fashion_news_frequency": "UNKNOWN",
        "age": customers["age"].median()
    }, inplace=True)
    
    customers["customer_id"] = customers["customer_id"].astype("category")
    customers["Active"] = customers["Active"].astype(np.int8)
    customers["FN"] = customers["FN"].astype(np.int8)
    customers["age"] = customers["age"].astype(np.float16)
    
    if 'age' in customers.columns:
        scaler_age = MinMaxScaler()
        customers['normalized_age'] = scaler_age.fit_transform(customers[['age']])
    else:
        raise ValueError("Error: `age` column is missing in customers!")
    
    # Adjust article_id (as did for articles dataframe)
    transactions["article_id"] = transactions["article_id"].apply(lambda x: adjust_id(x))
    
    # Filter the transactions dataset to keep only valid article IDs
    transactions_filtered = transactions[transactions["article_id"].isin(set(articles_with_images["article_id"]))].reset_index(drop=True)
    
    # Optionally save the filtered transactions dataset
    # transactions_filtered.to_csv("transactions_filtered.csv", index=False)
    transactions_filtered["article_id"] = transactions_filtered["article_id"].astype("category")
    transactions_filtered["price"] = transactions_filtered["price"].astype(np.float16)

    # ✅ Save preprocessed data locally
    with open(os.path.join(PREPROCESSED_DIR, "articles_with_images.pkl"), "wb") as f:
        pickle.dump(articles_with_images, f)

    with open(os.path.join(PREPROCESSED_DIR, "transactions_filtered.pkl"), "wb") as f:
        pickle.dump(transactions_filtered, f)

    with open(os.path.join(PREPROCESSED_DIR, "customers_processed.pkl"), "wb") as f:
        pickle.dump(customers, f)

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(ZIP_FILE, 'w') as zipf:
        for file in os.listdir(PREPROCESSED_DIR):
            zipf.write(os.path.join(PREPROCESSED_DIR, file), arcname=file)

    print(f"✅ Preprocessing completed! Saved as {ZIP_FILE}")


In [49]:
# # ✅ Define file paths for saving preprocessed data
# IMAGE_EMBEDDINGS_DIR = "/kaggle/working/image_embeddings"
# IMAGE_EMBEDDINGS_ZIP_FILE = "/kaggle/working/image_embeddings.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(IMAGE_EMBEDDINGS_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
IMAGE_EMBEDDINGS_DIR = "/kaggle/input/image-embeddings-2"

In [50]:
# ✅ Extract ZIP only if the preprocessed files don't already exist
if os.path.exists(os.path.join(IMAGE_EMBEDDINGS_DIR, "image_embeddings.pkl")):
    print("✅ image_embeddings data already exists. Loading...")

    with open(os.path.join(IMAGE_EMBEDDINGS_DIR, "image_embeddings.pkl"), "rb") as f:
        image_embeddings = pickle.load(f)

    print("✅ image embeddings data loaded successfully!")

else:

    print("✅ image embeddings not found!")
    torch.backends.cudnn.benchmark = True  # Optimizes GPU computation
    
    # ✅ Load pre-trained ResNet50 model (Feature Extraction)
    resnet_model = models.resnet50(pretrained=True)
    resnet_model = torch.nn.Sequential(*list(resnet_model.children())[:-1])  # Remove the last FC layer
    resnet_model = resnet_model.to(device).eval()  # Move to GPU & set eval mode
    
    # ✅ Define image transformations
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  
        transforms.ToTensor(),          
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
    ])
    
    # ✅ Function to extract image embeddings (Batch Processing)
    def get_image_embeddings_batch(image_paths_batch):
        images = [transform(Image.open(path).convert('RGB')).unsqueeze(0) for path in image_paths_batch]
        images_tensor = torch.cat(images, dim=0).to(device)  # Stack tensors & move to GPU
        
        with torch.no_grad():
            features = resnet_model(images_tensor).squeeze(-1).squeeze(-1)  # Remove extra dimensions
    
        return features  # Return GPU tensor directly (no need for .cpu().numpy())
    
    # ✅ Process images in batches to avoid memory issues
    batch_size = 512  # Adjust batch size based on available memory
    image_embeddings = []
    
    for i in tqdm(range(0, len(articles_with_images), batch_size), desc="Extracting Features"):
        image_paths_batch = articles_with_images['path'][i:i + batch_size].tolist()
        image_embeddings.append(get_image_embeddings_batch(image_paths_batch))
    
    # ✅ Convert list of tensors to a single tensor
    image_embeddings = torch.cat(image_embeddings, dim=0)
    
    print(f"✅ Image feature extraction complete. Shape: {image_embeddings.shape}")

    # ✅ Save preprocessed data locally
    with open(os.path.join(IMAGE_EMBEDDINGS_DIR, "image_embeddings.pkl"), "wb") as f:
        pickle.dump(image_embeddings, f)


    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(IMAGE_EMBEDDINGS_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(IMAGE_EMBEDDINGS_DIR):
            zipf.write(os.path.join(IMAGE_EMBEDDINGS_DIR, file), arcname=file)

    print(f"✅ image_embeddings completed! Saved as {IMAGE_EMBEDDINGS_ZIP_FILE}")

✅ image_embeddings data already exists. Loading...
✅ image embeddings data loaded successfully!


In [51]:
# ✅ Define file paths for saving preprocessed data
# TEXT_EMBEDDINGS_DIR = "/kaggle/working/text_embeddings"
# TEXT_EMBEDDINGS_ZIP_FILE = "/kaggle/working/text_embeddings.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(TEXT_EMBEDDINGS_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
TEXT_EMBEDDINGS_DIR = "/kaggle/input/text-embeddings-2"

In [52]:
# ✅ Extract ZIP only if the preprocessed files don't already exist
if os.path.exists(os.path.join(TEXT_EMBEDDINGS_DIR, "text_embeddings.pkl")):
    print("✅ text_embeddings data already exists. Loading...")

    with open(os.path.join(TEXT_EMBEDDINGS_DIR, "text_embeddings.pkl"), "rb") as f:
        text_embeddings = pickle.load(f)

    print("✅ text embeddings data loaded successfull")

else :

    print("✅ text embeddings data not found")
    # ✅ Ensure Required Columns Exist
    required_cols = [
        'detail_desc', 'prod_name', 'product_type_name', 'product_group_name',
        'graphical_appearance_name', 'colour_group_name', 'index_name',
        'index_group_name', 'section_name', 'garment_group_name'
    ]
    
    missing_cols = [col for col in required_cols if col not in articles_with_images.columns]
    
    if missing_cols:
        raise ValueError(f"Error: Missing columns: {missing_cols}")
    
    # ✅ Efficiently Combine Text Columns
    articles_with_images['text_data'] = articles_with_images[required_cols].fillna('').agg(' '.join, axis=1)
    
    # ✅ Load GloVe Embeddings Efficiently
    def load_glove_embeddings(glove_file_path):
        embeddings_index = {}
        with open(glove_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = vector
        return embeddings_index
    
    # ✅ Load GloVe 100D embeddings
    glove_file_path = '/kaggle/input/text-glove/glove.6B.100d.txt'
    embeddings_index = load_glove_embeddings(glove_file_path)
    
    # ✅ Convert Text Data to GloVe Embeddings (Optimized)
    def text_to_glove_embeddings(text_data, embeddings_index, embedding_dim=100, device="cuda"):
        text_embeddings = torch.zeros((len(text_data), embedding_dim), dtype=torch.float32, device=device)
    
        for i, text in enumerate(text_data):
            words = text.split()
            word_vectors = [torch.tensor(embeddings_index[word], device=device) for word in words if word in embeddings_index]
            
            if word_vectors:
                text_embeddings[i] = torch.stack(word_vectors).mean(dim=0)  # Compute mean embedding
    
        return text_embeddings
    
    # ✅ Convert Text Data to GloVe Embeddings using GPU
    text_embeddings = text_to_glove_embeddings(articles_with_images['text_data'], embeddings_index)
    
    print(f"✅ Text feature extraction complete. Embeddings Shape: {text_embeddings.shape}")

    # ✅ Save preprocessed data locally
    with open(os.path.join(TEXT_EMBEDDINGS_DIR, "text_embeddings.pkl"), "wb") as f:
        pickle.dump(text_embeddings, f)


    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(TEXT_EMBEDDINGS_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(TEXT_EMBEDDINGS_DIR):
            zipf.write(os.path.join(TEXT_EMBEDDINGS_DIR, file), arcname=file)

    print(f"✅ text_embeddings completed! Saved as {TEXT_EMBEDDINGS_ZIP_FILE}")



✅ text_embeddings data already exists. Loading...
✅ text embeddings data loaded successfull


In [53]:
# ✅ Define file paths for saving preprocessed data
# CUSTOMER_EMBEDDINGS_DIR = "/kaggle/working/customer_embeddings"
# CUSTOMER_EMBEDDINGS_ZIP_FILE = "/kaggle/working/customer_embeddings.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(CUSTOMER_EMBEDDINGS_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
CUSTOMER_EMBEDDINGS_DIR = "/kaggle/input/customer-embeddings-2"

In [54]:
# ✅ Extract ZIP only if the preprocessed files don't already exist
if os.path.exists(os.path.join(CUSTOMER_EMBEDDINGS_DIR, "customer_embeddings.pkl")):
    print("✅ customer_embeddings data already exists. Loading...")

    with open(os.path.join(CUSTOMER_EMBEDDINGS_DIR, "customer_embeddings.pkl"), "rb") as f:
        customer_embeddings = pickle.load(f)

    print("✅ customer_embeddings data loaded successfull")

else :


    # ✅ One-Hot Encode Categorical Features (Efficiently)
    categorical_features = pd.get_dummies(customers[['Active', 'club_member_status']], dtype=np.float32)
    
    # ✅ Stack Numerical & Categorical Features
    customer_features = np.hstack([
        customers['normalized_age'].values.reshape(-1, 1),  # Ensure it's a column vector
        categorical_features.values  # One-hot encoded categorical features
    ])
    
    # ✅ Convert to PyTorch Tensor & Move to GPU
    customer_features_tensor = torch.tensor(customer_features, dtype=torch.float32, device=device)
    
    # ✅ Define Customer Embedding Model
    class CustomerEmbedding(nn.Module):
        def __init__(self, input_dim, embedding_dim=64):
            super(CustomerEmbedding, self).__init__()
            self.model = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, embedding_dim)
            )
    
        def forward(self, x):
            return self.model(x)
    
    # ✅ Initialize Model & Move to GPU
    embedding_dim = 64
    model = CustomerEmbedding(input_dim=customer_features.shape[1], embedding_dim=embedding_dim).to(device)
    
    # ✅ Get Customer Embeddings
    with torch.no_grad():
        customer_embeddings = model(customer_features_tensor)  # Compute embeddings
    
    print(f"✅ Customer embeddings generated. Shape: {customer_embeddings.shape}")

    # ✅ Save preprocessed data locally
    with open(os.path.join(CUSTOMER_EMBEDDINGS_DIR, "customer_embeddings.pkl"), "wb") as f:
        pickle.dump(customer_embeddings, f)


    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(CUSTOMER_EMBEDDINGS_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(CUSTOMER_EMBEDDINGS_DIR):
            zipf.write(os.path.join(CUSTOMER_EMBEDDINGS_DIR, file), arcname=file)

    print(f"✅ customer_embeddings completed! Saved as {CUSTOMER_EMBEDDINGS_ZIP_FILE}")

✅ customer_embeddings data already exists. Loading...
✅ customer_embeddings data loaded successfull


In [55]:
def sliced_wasserstein_distance(P, Q, num_projections=100):
    """
    Computes the Sliced Wasserstein Distance (SWD) between two trajectory distributions.
    Uses GPU-optimized random projections.
    """
    assert P.shape == Q.shape, f"Error: Shape mismatch! P.shape={P.shape}, Q.shape={Q.shape}"

    device = P.device  # Ensure computations are on the same device
    proj_vectors = torch.randn((P.shape[1], num_projections), device=device)
    proj_vectors = proj_vectors / torch.linalg.norm(proj_vectors, dim=0, keepdim=True)  # Normalize projections

    P_proj = P @ proj_vectors
    Q_proj = Q @ proj_vectors

    return torch.mean(torch.abs(P_proj.sort(dim=0)[0] - Q_proj.sort(dim=0)[0]))  # Compute distance

In [56]:
def geometric_distributed_sampling(ranks, rho=0.5, max_samples=300000):
    """
    Optimized Geometric Distributed Sampling (GDS).
    Ensures probabilities remain valid by normalizing ranks.
    """
    assert ranks.numel() > 0, "Error: Input `ranks` tensor is empty!"

    device = ranks.device  # Ensure all operations remain on the same device

    # Sort ranks in descending order and keep top categories
    sorted_indices = torch.argsort(ranks, descending=True)
    max_categories = min(len(ranks), 16_000_000)
    filtered_indices = sorted_indices[:max_categories]  

    # Normalize ranks safely (avoid division by zero)
    ranks_subset = ranks[filtered_indices]
    min_rank, max_rank = ranks_subset.min(), ranks_subset.max()
    range_rank = max_rank - min_rank + 1e-8  # Small epsilon for numerical stability

    normalized_ranks = (ranks_subset - min_rank) / range_rank  # Scale between 0-1
    normalized_ranks = torch.clamp(normalized_ranks, 0, 1)  # Ensure within bounds

    # Compute probabilities
    probabilities = torch.exp(-rho * normalized_ranks)
    probabilities /= probabilities.sum()  # Normalize to sum = 1

    # Sample indices
    num_samples = min(max_samples, len(filtered_indices))
    sampled_relative_indices = torch.multinomial(probabilities, num_samples=num_samples, replacement=False)

    # Map back to original transaction indices
    sampled_indices = filtered_indices[sampled_relative_indices]

    return sampled_indices

In [57]:
# ✅ **Improved GPU-Optimized Hard Negative Sampling**
def generate_hard_negatives(total_items, positive_sets, entity_type, product_indices, customer_indices, num_samples=10):
    """
    Implements HARD NEGATIVE SAMPLING with IMPROVEMENTS:
    - Ensures negatives are not in positive sets
    - Uses "harder" negatives (more similar but not exact matches)
    - Handles cases where there aren’t enough negatives
    - Fully GPU-optimized 🚀
    """

    negative_indices = torch.full((total_items, num_samples), -1, dtype=torch.long, device="cuda")  # Preallocate memory

    # ✅ Convert `positive_sets` to tensors **ONCE** (avoid repeated conversions)
    exclusions = [torch.tensor(positive_sets.get(i, []), device="cuda") if len(positive_sets.get(i, [])) > 0 else torch.tensor([], device="cuda") for i in range(total_items)]

    for i in range(total_items):
        # ✅ Select valid negatives based on entity type
        if entity_type == "product":
            valid_negatives = product_indices[~torch.isin(product_indices, exclusions[i])]
        elif entity_type in ["customer", "customer_to_product"]:
            valid_negatives = customer_indices[~torch.isin(customer_indices, exclusions[i])]

        # ✅ Ensure there are valid negatives available
        if valid_negatives.numel() > 0:
            # ✅ **Select "harder" negatives** (closer in embedding space but not exact matches)
            random_negatives = valid_negatives[torch.randperm(len(valid_negatives))[:num_samples]]
            
            # ✅ Add "harder" negatives if possible
            hard_negatives = valid_negatives[:num_samples]  # Take first `num_samples` as "harder" negatives

            # ✅ Combine both types of negatives
            combined_negatives = torch.cat((random_negatives, hard_negatives), dim=0).unique()[:num_samples]

            # ✅ Assign to the preallocated tensor
            negative_indices[i, :len(combined_negatives)] = combined_negatives

    return negative_indices.cpu().numpy()  # ✅ Move back to CPU after computation



In [58]:
# # ✅ Convert Embeddings to Float32 and Move to GPU
# if isinstance(text_embeddings, torch.Tensor):
#     text_embeddings = text_embeddings.to(torch.float32).cpu().numpy()
# if isinstance(image_embeddings, torch.Tensor):
#     image_embeddings = image_embeddings.to(torch.float32).cpu().numpy()
# if isinstance(customer_embeddings, torch.Tensor):
#     customer_embeddings = customer_embeddings.to(torch.float32).cpu().numpy()

# # ✅ Stack Article Embeddings Efficiently (GPU Optimized)
# article_embedding_matrix = torch.tensor(np.hstack([text_embeddings, image_embeddings]), dtype=torch.float32, device="cuda")
# customer_embedding_matrix = torch.tensor(customer_embeddings, dtype=torch.float32, device="cuda")

In [59]:
# ✅ Define file paths for saving preprocessed data
# DATA_SPLITTING_DIR = "/kaggle/working/data_splitting"
# DATA_SPLITTING_ZIP_FILE = "/kaggle/working/data_splitting.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(DATA_SPLITTING_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
DATA_SPLITTING_DIR = "/kaggle/input/data-splitting-3"

In [60]:
# ✅ Extract ZIP only if the preprocessed files don't already exist
if os.path.exists(os.path.join(DATA_SPLITTING_DIR, "train_data.pkl")):
    print("✅ train_data data already exists. Loading...")

    # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "train_data.pkl"), "rb") as f:
        train_data = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "val_data.pkl"), "rb") as f:
        val_data = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "test_data.pkl"), "rb") as f:
        test_data = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "train_customers.pkl"), "rb") as f:
        train_customers = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "val_customers.pkl"), "rb") as f:
        val_customers = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "test_customers.pkl"), "rb") as f:
        test_customers = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "train_articles.pkl"), "rb") as f:
        train_articles = pickle.load(f)

        # Load extracted data
    with open(os.path.join(DATA_SPLITTING_DIR, "val_articles.pkl"), "rb") as f:
        val_articles = pickle.load(f)

    with open(os.path.join(DATA_SPLITTING_DIR, "test_articles.pkl"), "rb") as f:
        test_articles = pickle.load(f)

    print("✅ SPLIT data loaded successfully!")

else :

    print("⚡ Extracting SPLIT for the first time...")
    
    ### ✅ Step 3: Convert Transaction Date to Datetime (Efficiently)
    transactions_filtered['t_dat'] = pd.to_datetime(transactions_filtered['t_dat'])
    
    # ✅ Sort transactions by date (ascending)
    transactions_filtered.sort_values('t_dat', ascending=True, inplace=True)
    
    ### ✅ Step 4: Compute Train, Validation, and Test Cutoffs
    train_cutoff, val_cutoff, test_cutoff = transactions_filtered['t_dat'].quantile([0.75, 0.875, 1.0]).values
    
    ### ✅ Step 5: Assign Ranks for GDS Efficiently
    transactions_filtered["rank"] = transactions_filtered['t_dat'].rank(method="first", ascending=True)
    
    # ✅ Convert ranks to GPU Tensor
    ranks_tensor = torch.tensor(transactions_filtered["rank"].values, dtype=torch.float32, device="cuda")
    
    # ✅ Sample transactions using Geometric Distributed Sampling (GDS)
    sampled_indices = geometric_distributed_sampling(ranks_tensor, rho=0.5, max_samples=300000)
    
    # ✅ Efficiently extract sampled transactions
    transactions_filtered_sampled = transactions_filtered.loc[sampled_indices.cpu().numpy()].reset_index(drop=True)
    
    ### ✅ Step 7: Train/Validation/Test Splitting (Optimized)
    train_mask = transactions_filtered_sampled['t_dat'] <= train_cutoff
    val_mask = (transactions_filtered_sampled['t_dat'] > train_cutoff) & (transactions_filtered_sampled['t_dat'] <= val_cutoff)
    test_mask = transactions_filtered_sampled['t_dat'] > val_cutoff
    
    train_data, val_data, test_data = (
        transactions_filtered_sampled[train_mask], 
        transactions_filtered_sampled[val_mask], 
        transactions_filtered_sampled[test_mask]
    )
    
    ### ✅ Step 8: Cold Start Customer Handling (Optimized)
    transaction_customers = set(transactions_filtered['customer_id'])
    cold_start_customers = customers[~customers['customer_id'].isin(transaction_customers)]
    
    # ✅ Split cold start customers into train, validation, and test
    train_cold_start, temp_cold_start = train_test_split(
        cold_start_customers, test_size=0.25, random_state=42
    )
    val_cold_start, test_cold_start = train_test_split(
        temp_cold_start, test_size=0.5, random_state=42
    )
    
    # ✅ Merge cold-start customers with train/val/test sets
    train_data = pd.concat([train_data, train_cold_start], ignore_index=True)
    val_data = pd.concat([val_data, val_cold_start], ignore_index=True)
    test_data = pd.concat([test_data, test_cold_start], ignore_index=True)
    
    # ✅ Filter customers appearing in train/val/test sets
    train_customers = customers[customers['customer_id'].isin(train_data['customer_id'])]
    val_customers = customers[customers['customer_id'].isin(val_data['customer_id'])]
    test_customers = customers[customers['customer_id'].isin(test_data['customer_id'])]
    
    # ✅ Filter articles appearing in train/val/test sets
    train_articles = articles_with_images[articles_with_images['article_id'].isin(train_data['article_id'])]
    val_articles = articles_with_images[articles_with_images['article_id'].isin(val_data['article_id'])]
    test_articles = articles_with_images[articles_with_images['article_id'].isin(test_data['article_id'])]
    
    print("✅ Data split completed successfully! 🚀")


    
    # ✅ Convert IDs to Strings for Consistency
    train_customers['customer_id'] = train_customers['customer_id'].astype(str)
    train_articles['article_id'] = train_articles['article_id'].astype(str)
    
    # ✅ Create Fast Index Mappings
    article_to_index = {str(article_id): idx for idx, article_id in enumerate(train_articles['article_id'])}
    customer_to_index = {str(customer_id): idx for idx, customer_id in enumerate(train_customers['customer_id'])}
    
    # ✅ Initialize Positive Index Dictionaries
    customer_positive_indices = {str(cid): [] for cid in train_customers['customer_id']}
    article_positive_indices = {str(aid): [] for aid in train_articles['article_id']}
    customer_to_product_positive_indices = {str(cid): [] for cid in train_customers['customer_id']}  # New
    
    # ✅ Populate Positive Indices Using `train_transactions`
    for _, row in train_data.iterrows():
        cid, aid = str(row['customer_id']), str(row['article_id'])
    
        if cid in customer_to_index and aid in article_to_index:  # Ensure valid entries
            customer_positive_indices[cid].append(article_to_index[aid])  # Customer's purchased articles
            article_positive_indices[aid].append(customer_to_index[cid])  # Articles linked to customers
    
    # ✅ Generate Customer-to-Product Positive Indices
    for aid in train_data['article_id'].astype(str).unique():
        customer_list = train_data.loc[train_data['article_id'] == aid, 'customer_id'].astype(str).unique()
        mapped_customers = [customer_to_index[c] for c in customer_list if c in customer_to_index]
    
        for cid in mapped_customers:
            if cid in customer_to_product_positive_indices:  
                customer_to_product_positive_indices[cid].extend(
                    [c for c in mapped_customers if c != cid][:10]
                )
            else:
                customer_to_product_positive_indices[cid] = []  # Assign empty list
    
    # ✅ Map Positive Indices Efficiently
    train_customers['positive_indices'] = train_customers['customer_id'].map(customer_positive_indices)
    train_articles['positive_indices'] = train_articles['article_id'].map(article_positive_indices)
    train_customers['customer_to_product_positive_indices'] = train_customers['customer_id'].map(customer_to_product_positive_indices)
    
    print("✅ Positive indices generated successfully!")

    # ✅ **Define Product and Customer Indices**
    product_indices = torch.arange(len(train_articles), device="cuda")
    customer_indices = torch.arange(len(train_customers), device="cuda")
    
    
    # ✅ **Generate Negative Indices for Products (GPU-Accelerated)**
    print("⚡ Generating Hard Negative Samples for Products...")
    train_articles['negative_indices'] = list(generate_hard_negatives(
        len(train_articles), train_articles['positive_indices'], "product", product_indices, customer_indices
    ))
    print("✅ Done!")
    
    # ✅ **Generate Negative Indices for Customers (GPU-Accelerated)**
    print("⚡ Generating Hard Negative Samples for Customers...")
    train_customers['negative_indices'] = list(generate_hard_negatives(
        len(train_customers), train_customers['positive_indices'], "customer", product_indices, customer_indices
    ))
    print("✅ Done!")
    
    # ✅ **Generate Negative Indices for Customer-to-Product Interactions (GPU-Accelerated)**
    print("⚡ Generating Hard Negative Samples for Customer-to-Product Interactions...")
    train_customers['negative_customer_to_product_indices'] = list(generate_hard_negatives(
        len(train_customers), train_customers['customer_to_product_positive_indices'], "customer_to_product", product_indices, customer_indices
    ))
    print("✅ Done!")
    
    print("🚀 Hard Negative Sampling Completed with Full GPU Optimization! ✅")


    # ✅ Save preprocessed data locally
    with open(os.path.join(DATA_SPLITTING_DIR, "train_data.pkl"), "wb") as f:
        pickle.dump(train_data, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "val_data.pkl"), "wb") as f:
        pickle.dump(val_data, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "test_data.pkl"), "wb") as f:
        pickle.dump(test_data, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "train_customers.pkl"), "wb") as f:
        pickle.dump(train_customers, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "val_customers.pkl"), "wb") as f:
        pickle.dump(val_customers, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "test_customers.pkl"), "wb") as f:
        pickle.dump(test_customers, f)


    with open(os.path.join(DATA_SPLITTING_DIR, "train_articles.pkl"), "wb") as f:
        pickle.dump(train_articles, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "val_articles.pkl"), "wb") as f:
        pickle.dump(val_articles, f)

    with open(os.path.join(DATA_SPLITTING_DIR, "test_articles.pkl"), "wb") as f:
        pickle.dump(test_articles, f)
    

    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(DATA_SPLITTING_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(DATA_SPLITTING_DIR):
            zipf.write(os.path.join(DATA_SPLITTING_DIR, file), arcname=file)

    print(f"✅ data splitting completed! Saved as {DATA_SPLITTING_ZIP_FILE}")


✅ train_data data already exists. Loading...
✅ SPLIT data loaded successfully!


In [61]:
def filter_and_align_embeddings(data, articles, customers):
    """ Ensures that price, text, image, and customer embeddings are aligned properly. """
    
    # ✅ 1️⃣ Filter valid articles and customers
    valid_articles = articles.loc[articles['article_id'].isin(data['article_id'])].reset_index(drop=True)
    valid_customers = customers.loc[customers['customer_id'].isin(data['customer_id'])].reset_index(drop=True)

    # ✅ 2️⃣ Extract & Align Price Values
    price_values = (
        data.drop_duplicates(subset=["article_id"])
        .set_index("article_id")
        .reindex(valid_articles["article_id"])["price"]
        .fillna(0)  # Handle missing prices
        .values.reshape(-1, 1)
        .astype(np.float32)
    )

    # ✅ 3️⃣ Extract Text, Image, and Customer Embeddings (REMOVE `.cpu()` BECAUSE THEY ARE NUMPY ARRAYS)
    text_embeds = text_embeddings[valid_articles.index]  # ✅ FIXED
    image_embeds = image_embeddings[valid_articles.index]  # ✅ FIXED
    customer_embeds = customer_embeddings[valid_customers.index]  # ✅ FIXED

    # ✅ 4️⃣ Ensure Feature Matrices Have the Same Row Count
    min_rows = min(len(price_values), len(text_embeds), len(image_embeds), len(customer_embeds))
    
    # ✅ 5️⃣ Stack Features Efficiently
    final_features = np.hstack([
        price_values[:min_rows],  
        text_embeds[:min_rows],  
        image_embeds[:min_rows],  
        customer_embeds[:min_rows]
    ])

    print(f"✅ Features Extracted: {final_features.shape}")
    return final_features




In [62]:
# ✅ Define file paths for saving preprocessed data
# FEATURES_DIR = "/kaggle/working/features"
# FEATURES_ZIP_FILE = "/kaggle/working/features.zip"  # Final zipped archive

# # ✅ Create the directory if it doesn't exist
# os.makedirs(FEATURES_DIR, exist_ok=True)

# ✅ Correct path (inside `/kaggle/input/preprocessed-data/`)
FEATURES_DIR = "/kaggle/input/features-2"

In [63]:
# ✅ Extract ZIP only if the preprocessed files don't already exist
if os.path.exists(os.path.join(FEATURES_DIR, "train_features.pkl")):
    print("✅ features are already exists. Loading...")

    # Load extracted data
    with open(os.path.join(FEATURES_DIR, "train_features.pkl"), "rb") as f:
        train_features = pickle.load(f)

    with open(os.path.join(FEATURES_DIR, "val_features.pkl"), "rb") as f:
        val_features = pickle.load(f)

    with open(os.path.join(FEATURES_DIR, "test_features.pkl"), "rb") as f:
        test_features = pickle.load(f)

    print("✅ features loaded successfully!")

else :

    # ✅ Extract & Align Features for Train/Validation/Test Sets
    print("⚡ Extracting features for the first time...")
    
    train_features = filter_and_align_embeddings(train_data, train_articles, train_customers)
    val_features = filter_and_align_embeddings(val_data, val_articles, val_customers)
    test_features = filter_and_align_embeddings(test_data, test_articles, test_customers)
    
    print("✅ Feature Extraction Completed! 🚀")
          # ✅ Save preprocessed data locally
    with open(os.path.join(FEATURES_DIR, "train_features.pkl"), "wb") as f:
        pickle.dump(train_features, f)

    with open(os.path.join(FEATURES_DIR, "val_features.pkl"), "wb") as f:
        pickle.dump(val_features, f)

    with open(os.path.join(FEATURES_DIR, "test_features.pkl"), "wb") as f:
        pickle.dump(test_features, f)


    # ✅ Create a ZIP archive containing all preprocessed files
    with zipfile.ZipFile(FEATURES_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(FEATURES_DIR):
            zipf.write(os.path.join(FEATURES_DIR, file), arcname=file)

    print(f"✅ features completed! Saved as {FEATURES_ZIP_FILE}")

✅ features are already exists. Loading...
✅ features loaded successfully!


In [64]:
# # ✅ Define directory and file paths
# GRAPH_DIR = "/kaggle/working/graph_data"
# GRAPH_ZIP_FILE = "/kaggle/working/graph_data.zip"

# # ✅ Create the directory if it doesn't exist
# os.makedirs(GRAPH_DIR, exist_ok=True)

# ✅ Define Graph Load Path
GRAPH_DIR = "/kaggle/input/graph-data-2"
GRAPH_FILE = os.path.join(GRAPH_DIR, "pr_graph.pt")  # Define the PR-Graph file path

In [65]:
# ✅ Check if precomputed PR Graph exists
if os.path.exists(GRAPH_FILE):
    print("✅ Loading PR Graph from Disk...")

    # ✅ Load PR-Graph and associated embeddings
    graph_data = torch.load(GRAPH_FILE, map_location=device)
    graph = graph_data["graph"]

    print("✅ PR-Graph and embeddings loaded successfully! 🚀")
else:
    print("⚠️ No saved PR Graph found! Please generate it first.")

    if isinstance(customer_embeddings, torch.Tensor):
        customer_embeddings = customer_embeddings.to(torch.float32).cpu().numpy()

    article_to_index = {article_id: idx for idx, article_id in enumerate(train_articles['article_id'].values)}
    customer_to_index = {customer_id: idx for idx, customer_id in enumerate(train_customers['customer_id'].values)}
    
    # ✅ **Construct PR-Graph and Edge Indices Efficiently**
    print("⚡ Constructing PR-Graph and Edge Index...")
    
    train_data.dropna(subset=['customer_id', 'article_id'], inplace=True)
    
    customer_to_product_edges = torch.tensor([
        (customer_to_index[u], article_to_index[v])
        for u, v in train_data[['customer_id', 'article_id']].values
        if u in customer_to_index and v in article_to_index
    ], dtype=torch.long, device=device).T
    
    print("✅ Customer-to-Product Edge Construction Done")
    
    # ✅ **Construct Product-to-Product Co-Purchase Graph**
    product_to_product_edges = []
    co_purchase_pairs = train_data.groupby('customer_id')['article_id'].apply(list)
    for articles_list in co_purchase_pairs:
        product_to_product_edges.extend([(article_to_index[articles_list[i]], article_to_index[articles_list[j]]) 
                                         for i in range(len(articles_list)) for j in range(i + 1, len(articles_list))])
    
    product_to_product_edges = torch.tensor(product_to_product_edges, dtype=torch.long, device=device).T
    print("✅ Product-to-Product Edge Construction Done")
    
    # ✅ **Customer-to-Customer Similarity Graph Using FAISS**
    num_customers = len(train_customers)  # Total customers
    index = faiss.IndexFlatL2(customer_embeddings.shape[1])
    index.add(customer_embeddings)
    k = min(10, num_customers)  # Ensure `k` is within valid range
    
    # ✅ Perform FAISS search
    D, I = index.search(customer_embeddings, k)
    print("✅ FAISS search completed.")
    
    # ✅ Ensure FAISS indices are valid before using them
    customer_to_customer_edges = []
    
    for i in range(num_customers):  # Loop through each customer
        for j in range(1, k):  # Start from 1 to avoid self-loop
            if I[i][j] < 0 or I[i][j] >= num_customers:  # Ensure index is within range
                continue  # Skip invalid indices
    
            src_customer = train_customers.iloc[i]['customer_id']  # Source customer
            tgt_customer = train_customers.iloc[I[i][j]]['customer_id']  # Target customer
    
            # ✅ Ensure both customers exist in the customer_to_index mapping
            if src_customer in customer_to_index and tgt_customer in customer_to_index:
                customer_to_customer_edges.append(
                    (customer_to_index[src_customer], customer_to_index[tgt_customer])
                )
    
    # ✅ Convert edges to PyTorch tensor
    if customer_to_customer_edges:
        customer_to_customer_edges = torch.tensor(customer_to_customer_edges, dtype=torch.long, device=device).T
    else:
        customer_to_customer_edges = torch.empty((2, 0), dtype=torch.long, device=device)  # Handle empty case
    
    print("✅ Customer-to-Customer Similarity Graph Done")
    
    # ✅ **Ensure Edges Are on GPU**
    customer_to_product_edges = customer_to_product_edges.to(device)
    product_to_product_edges = product_to_product_edges.to(device)
    customer_to_customer_edges = customer_to_customer_edges.to(device)
    
    # ✅ **Construct PyTorch Geometric Heterogeneous Graph**
    graph = HeteroData()
    
    # ✅ **Define Graph Structure**
    graph['customer', 'buys', 'product'].edge_index = customer_to_product_edges  # No self-loops needed
    graph['product', 'co_purchased_with', 'product'].edge_index = product_to_product_edges
    graph['customer', 'similar_to', 'customer'].edge_index = customer_to_customer_edges
    
    # ✅ **Add Self-Loops Only Where Needed**
    graph['customer', 'similar_to', 'customer'].edge_index = add_self_loops(graph['customer', 'similar_to', 'customer'].edge_index)[0]
    graph['product', 'co_purchased_with', 'product'].edge_index = add_self_loops(graph['product', 'co_purchased_with', 'product'].edge_index)[0]
    
    # ❌ **No self-loops for customer-to-product (bipartite graph)**
    # graph['customer', 'buys', 'product'].edge_index = add_self_loops(graph['customer', 'buys', 'product'].edge_index)[0]  # ❌ REMOVE this
    
    # ✅ **Move Graph to GPU**
    graph = graph.to(device)
     
    print("✅ PR-Graph Constructed Successfully 🚀")
    
    # ✅ Save PR-Graph as a Dictionary (Recommended by PyG)
    graph_data = {
        "graph": graph,  # Full graph structure
    }
    
    
    # ✅ Save the entire graph in one file
    torch.save(graph_data, GRAPH_FILE)
    print(f"✅ PR-Graph and embeddings saved at {GRAPH_FILE}")
    
    # ✅ Create ZIP Archive for Future Use
    with zipfile.ZipFile(GRAPH_ZIP_FILE, 'w') as zipf:
        for file in os.listdir(GRAPH_DIR):
            zipf.write(os.path.join(GRAPH_DIR, file), arcname=file)
    
    print(f"✅ PR-Graph and Edge Index processing completed! Saved as {GRAPH_ZIP_FILE}")


✅ Loading PR Graph from Disk...
✅ PR-Graph and embeddings loaded successfully! 🚀


  graph_data = torch.load(GRAPH_FILE, map_location=device)


In [78]:
print(train_features)

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [67]:
# ✅ **Define Checkpoint Path**
CHECKPOINT_FILE = "/kaggle/working/checkpoint.pth"

In [68]:
# ✅ **Save Model Checkpoint**
def save_checkpoint(model, optimizer, epoch, filename=CHECKPOINT_FILE):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, filename)
    print(f"✅ Model checkpoint saved at epoch {epoch}")

# ✅ **Load Model Checkpoint**
def load_checkpoint(filename, model, optimizer=None):
    if os.path.exists(filename):
        print("✅ Loading checkpoint...")
        checkpoint = torch.load(filename, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        if optimizer:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"✅ Resuming training from epoch {checkpoint['epoch'] + 1}")
        return checkpoint['epoch'] + 1  # Resume from the next epoch
    else:
        print("⚡ No checkpoint found. Starting from scratch.")
        return 0  # Start from epoch 0 if no checkpoint exists

In [69]:
class MultiRelationGCNLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MultiRelationGCNLayer, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)  # Linear transformation
        self.edge_weight = nn.Parameter(torch.Tensor(3))  # Separate weights for each edge type

        # Initialize weights
        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.constant_(self.edge_weight, 1.0)

    def forward(self, features, customer_to_product_edge_index, product_to_product_edge_index, customer_to_customer_edge_index):
        row_cp, col_cp = customer_to_product_edge_index  # Customer → Product
        row_pp, col_pp = product_to_product_edge_index  # Product ↔ Product
        row_cc, col_cc = customer_to_customer_edge_index  # Customer ↔ Customer
        
        # Compute degree normalization (avoid division by zero)
        degree = torch.zeros(features.size(0), device=features.device)

        if torch.min(col_cc) < 0:
            print("Error: Negative indices found in col_cc!")
        if torch.max(col_cc) >= degree.shape[0]:
            print("Error: col_cc contains out-of-bounds indices!")

        
        degree.index_add_(0, col_cp, torch.ones_like(col_cp, dtype=torch.float))
        degree.index_add_(0, col_pp, torch.ones_like(col_pp, dtype=torch.float))
        degree.index_add_(0, col_cc, torch.ones_like(col_cc, dtype=torch.float))
        degree = degree.clamp(min=1).pow(-0.5)

        

        # Message Passing for Each Relation Type
        agg_cp = torch.zeros_like(features)
        agg_pp = torch.zeros_like(features)
        agg_cc = torch.zeros_like(features)

        agg_cp.index_add_(0, row_cp, features[col_cp] * degree[col_cp].view(-1, 1))
        agg_pp.index_add_(0, row_pp, features[col_pp] * degree[col_pp].view(-1, 1))
        agg_cc.index_add_(0, row_cc, features[col_cc] * degree[col_cc].view(-1, 1))

        # Weighted Aggregation
        aggregated = (
            self.edge_weight[0] * agg_cp +
            self.edge_weight[1] * agg_pp +
            self.edge_weight[2] * agg_cc
        ) / 3  # Normalize across relations

        return F.relu(self.linear(aggregated))  # Apply transformation & ReLU activation


class ImprovedCSTAR(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_layers, dropout):
        super(ImprovedCSTAR, self).__init__()

        self.embedding_layer = nn.Linear(input_dim, embedding_dim)  # Learn feature transformation
        self.gcn_layers = nn.ModuleList([MultiRelationGCNLayer(embedding_dim, embedding_dim) for _ in range(num_layers)])
        self.dropout = dropout
        self.reference_embedding = nn.Parameter(torch.randn(embedding_dim, dtype=torch.float32))  # Global trajectory reference

    def forward(self, features, customer_to_product_edge_index, product_to_product_edge_index, customer_to_customer_edge_index):
        embeddings = self.embedding_layer(features.to(torch.float32))  # Convert to float32 & transform
        embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)

        for gcn_layer in self.gcn_layers:
            embeddings = gcn_layer(embeddings, customer_to_product_edge_index, product_to_product_edge_index, customer_to_customer_edge_index)
            embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)

        return embeddings


def inter_trajectory_loss(embeddings, reference_embedding):
    """
    Computes inter-trajectory loss to ensure different customer trajectories are correctly spaced.
    """
    return sliced_wasserstein_distance(embeddings, reference_embedding)


def intra_trajectory_loss(embeddings, positive_indices, negative_indices):
    """
    Intra-trajectory loss to ensure a trajectory is closer to its positive interactions than negatives.
    """
    pos_indices = torch.as_tensor(positive_indices, dtype=torch.long, device=embeddings.device)
    neg_indices = torch.as_tensor(negative_indices, dtype=torch.long, device=embeddings.device)

    if pos_indices.numel() == 0 or neg_indices.numel() == 0:
        return torch.tensor(0.0, requires_grad=True, device=embeddings.device)

    pos_scores = torch.einsum('ij,ij->i', embeddings[pos_indices], embeddings[pos_indices])
    neg_scores = torch.einsum('ij,ij->i', embeddings[neg_indices], embeddings[neg_indices])

    margin = 1.0 + 0.1 * torch.std(embeddings)
    return F.relu(neg_scores + margin - pos_scores).mean()

def compute_loss(embeddings, model):
    """
    Computes loss for product-to-product, customer-to-customer, and customer-to-product.
    Uses stored positive & negative indices.
    """

    # ✅ **Move Positive & Negative Indices to GPU (Precompute for Faster Batching)**
    train_articles['positive_indices'] = train_articles['positive_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))
    train_articles['negative_indices'] = train_articles['negative_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))

    train_customers['customer_to_product_positive_indices'] = train_customers['customer_to_product_positive_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))
    train_customers['negative_customer_to_product_indices'] = train_customers['negative_customer_to_product_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))

    train_customers['positive_indices'] = train_customers['positive_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))
    train_customers['negative_indices'] = train_customers['negative_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))

    total_loss = 0.0

    # ✅ Compute Product-to-Product Loss
    if 'positive_indices' in train_articles and 'negative_indices' in train_articles:
        total_loss += intra_trajectory_loss(
            embeddings, 
            torch.cat(train_articles['positive_indices'].tolist()), 
            torch.cat(train_articles['negative_indices'].tolist())
        )

    # ✅ Compute Customer-to-Customer Loss
    if 'positive_indices' in train_customers and 'negative_indices' in train_customers:
        total_loss += intra_trajectory_loss(
            embeddings, 
            torch.cat(train_customers['positive_indices'].tolist()), 
            torch.cat(train_customers['negative_indices'].tolist())
        )

    # ✅ Compute Customer-to-Product Loss
    if 'customer_to_product_positive_indices' in train_customers and 'negative_customer_to_product_indices' in train_customers:
        total_loss += intra_trajectory_loss(
            embeddings, 
            torch.cat(train_customers['customer_to_product_positive_indices'].tolist()), 
            torch.cat(train_customers['negative_customer_to_product_indices'].tolist())
        )

    # ✅ Add Inter-Trajectory Loss
    total_loss += inter_trajectory_loss(embeddings, model.reference_embedding)

    return total_loss


In [70]:
# ✅ **Move Edge Indices to GPU**
customer_to_product_edge_index = graph['customer', 'buys', 'product'].edge_index.to(device)
product_to_product_edge_index = graph['product', 'co_purchased_with', 'product'].edge_index.to(device)
customer_to_customer_edge_index = graph['customer', 'similar_to', 'customer'].edge_index.to(device)

# ✅ Ensure valid edges for customer-to-product
num_products = len(train_articles)
num_customers = len(train_customers)


def filter_invalid_edges(edge_index, num_nodes):
    """Removes out-of-bound edges from edge_index"""
    mask = (edge_index[0] < num_nodes) & (edge_index[1] < num_nodes)
    return edge_index[:, mask]

# ✅ Validate Edge Indices Before Using Them
customer_to_product_edge_index = filter_invalid_edges(graph['customer', 'buys', 'product'].edge_index, num_customers)
product_to_product_edge_index = filter_invalid_edges(graph['product', 'co_purchased_with', 'product'].edge_index, num_products)
customer_to_customer_edge_index = filter_invalid_edges(graph['customer', 'similar_to', 'customer'].edge_index, num_customers)

print(f"Filtered Edge Counts: Customer-Product={customer_to_product_edge_index.shape[1]}, "
      f"Product-Product={product_to_product_edge_index.shape[1]}, "
      f"Customer-Customer={customer_to_customer_edge_index.shape[1]}")




Filtered Edge Counts: Customer-Product=170598, Product-Product=78023, Customer-Customer=1407970


In [71]:
# # ✅ Use a smaller dataset for quick testing
# train_data = train_data.sample(n=10000, random_state=42)  
# val_data = val_data.sample(n=2000, random_state=42)
# test_data = train_data.sample(n=2000, random_state=42)

In [72]:
# ✅ **Train Model with Early Stopping & Checkpoints**
def train_cstar_model(model, optimizer, num_epochs, patience, train_features, val_features):
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    start_epoch = load_checkpoint(CHECKPOINT_FILE, model, optimizer)  # ✅ Start from checkpoint

    model.train()
    train_features = train_features.to(torch.float32).to(device)
    val_features = val_features.to(torch.float32).to(device)

    for epoch in range(start_epoch, num_epochs):
        optimizer.zero_grad()

        embeddings = model(train_features, customer_to_product_edge_index, product_to_product_edge_index, customer_to_customer_edge_index)
        loss = compute_loss(embeddings, model)

        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {loss.item():.4f}")

        # Validation Phase
        model.eval()
        with torch.no_grad():
            val_embeddings = model(val_features, customer_to_product_edge_index, product_to_product_edge_index, customer_to_customer_edge_index)
            val_loss = compute_loss(val_embeddings, model)

        print(f"Validation Loss: {val_loss.item():.4f}")


        # ✅ **Early Stopping & Checkpointing**
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict()
            patience_counter = 0
            save_checkpoint(model, optimizer, epoch)  # ✅ Save Best Model
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("✅ Early stopping triggered!")
                break

    # ✅ **Load Best Model Before Returning**
    if best_model_state:
        model.load_state_dict(best_model_state)
    
    return model

In [73]:
num_customers = graph['customer', 'similar_to', 'customer'].edge_index.max().item() + 1 if graph['customer', 'similar_to', 'customer'].edge_index.numel() > 0 else 0
num_products = graph['product', 'co_purchased_with', 'product'].edge_index.max().item() + 1 if graph['product', 'co_purchased_with', 'product'].edge_index.numel() > 0 else 0

graph['customer'].num_nodes = num_customers
graph['product'].num_nodes = num_products

total_nodes = num_customers + num_products

print(f"Total Customers: {num_customers}")
print(f"Total Products: {num_products}")
print(f"Total Nodes in the Graph: {total_nodes}")


Total Customers: 141059
Total Products: 25484
Total Nodes in the Graph: 166543


In [74]:
if torch.max(graph['customer', 'buys', 'product'].edge_index) >= total_nodes:
    print("Error: customer_to_product_edge_index contains out-of-bounds indices")
if torch.max(graph['product', 'co_purchased_with', 'product'].edge_index) >= total_nodes:
    print("Error: product_to_product_edge_index contains out-of-bounds indices")
if torch.max(graph['customer', 'similar_to', 'customer'].edge_index) >= total_nodes:
    print("Error: customer_to_customer_edge_index contains out-of-bounds indices")

In [75]:
customer_to_product_edge_index = customer_to_product_edge_index.to('cuda')
product_to_product_edge_index = product_to_product_edge_index.to('cuda')
customer_to_customer_edge_index = customer_to_customer_edge_index.to('cuda')



In [76]:
# ✅ Force clear CUDA memory before re-running
torch.cuda.empty_cache()

if isinstance(text_embeddings, torch.Tensor):
    text_embeddings = text_embeddings.to(torch.float32).cpu().numpy()
if isinstance(image_embeddings, torch.Tensor):
    image_embeddings = image_embeddings.to(torch.float32).cpu().numpy()
if isinstance(customer_embeddings, torch.Tensor):
    customer_embeddings = customer_embeddings.to(torch.float32).cpu().numpy()


# ✅ Ensure shape consistency
print(f"Text Embedding Shape: {text_embeddings.shape}")
print(f"Image Embedding Shape: {image_embeddings.shape}")
print(f"Customer Embedding Shape: {customer_embeddings.shape}")

# ✅ Convert NumPy arrays to PyTorch tensors first (on CPU)
text_tensor = torch.tensor(text_embeddings, dtype=torch.float32, device="cpu")
image_tensor = torch.tensor(image_embeddings, dtype=torch.float32, device="cpu")

print("Moving text_tensor to CUDA...")
text_tensor = text_tensor.clone().detach().to("cuda")
print("Moving image_tensor to CUDA...")
image_tensor = image_tensor.clone().detach().to("cuda")

# ✅ Now concatenate them on CPU first
article_embedding_matrix = torch.cat([text_tensor, image_tensor], dim=1)

# ✅ Move article_embedding_matrix to CUDA
print("Moving article_embedding_matrix to CUDA...")
article_embedding_matrix = article_embedding_matrix.clone().detach().to("cuda")

# ✅ Move customer embeddings safely
print("Moving customer_embedding_matrix to CUDA...")
customer_embedding_matrix = torch.tensor(customer_embeddings.astype(np.float32), dtype=torch.float32, device="cuda")

print("✅ Embeddings successfully moved to GPU!")

# ✅ **Define Training Parameters**
learning_rate = 5e-3
num_epochs = 2
embedding_dim = 32
num_layers = 2
dropout = 0.2
patience = 10

# ✅ **Initialize & Train Model**
model = ImprovedCSTAR(
    input_dim = article_embedding_matrix.shape[1] + customer_embedding_matrix.shape[1] + 1,
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ✅ Convert features to PyTorch tensors before training
train_features = torch.tensor(train_features, dtype=torch.float32, device=device)
val_features = torch.tensor(val_features, dtype=torch.float32, device=device)

# ✅ **Train Model**
trained_model = train_cstar_model(
    model, optimizer, num_epochs, patience, train_features, val_features
)



Text Embedding Shape: (105100, 100)
Image Embedding Shape: (105100, 2048)
Customer Embedding Shape: (1371980, 64)
Moving text_tensor to CUDA...
Moving image_tensor to CUDA...
Moving article_embedding_matrix to CUDA...
Moving customer_embedding_matrix to CUDA...
✅ Embeddings successfully moved to GPU!
⚡ No checkpoint found. Starting from scratch.
Error: col_cc contains out-of-bounds indices!


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Metrics Evaluation Functions
def recall_at_k(recommended_articles, relevant_articles, k=10):
    recommended_set = set(recommended_articles[:k])
    relevant_set = set(relevant_articles)
    intersection = recommended_set.intersection(relevant_set)
    return len(intersection) / len(relevant_set) if len(relevant_set) > 0 else 0.0

def ndcg_at_k(recommended_articles, relevant_articles, k=10):
    recommended_set = set(recommended_articles[:k])
    dcg = 0.0
    idcg = 0.0

    for i in range(min(k, len(recommended_articles))):
        if recommended_articles[i] in relevant_articles:
            dcg += 1 / np.log2(i + 2)

    for i in range(min(k, len(relevant_articles))):
        idcg += 1 / np.log2(i + 2)

    return dcg / idcg if idcg > 0 else 0.0

In [None]:
# ✅ Full-batch evaluation with Recall@K, NDCG@K, and Sliced Wasserstein Distance (SWD)
def test_model_full_batch_with_metrics(model, k=10):
    model.eval()
    total_test_loss = 0.0
    all_recall_at_k = []
    all_ndcg_at_k = []

    test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to(device)

    with torch.no_grad():
        # ✅ Compute embeddings for each edge type separately
        test_embeddings_product_to_product = model(test_features_tensor, product_to_product_edge_index)
        test_embeddings_customer_to_product = model(test_features_tensor, customer_to_product_edge_index)
        test_embeddings_customer_to_customer = model(test_features_tensor, customer_to_customer_edge_index)

        # ✅ **Precompute Positive & Negative Indices on GPU**
        test_articles['positive_indices'] = test_articles['positive_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))
        test_articles['negative_indices'] = test_articles['negative_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))

        test_customers['customer_to_product_positive_indices'] = test_customers['customer_to_product_positive_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))
        test_customers['negative_customer_to_product_indices'] = test_customers['negative_customer_to_product_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))

        test_customers['positive_indices'] = test_customers['positive_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))
        test_customers['negative_indices'] = test_customers['negative_indices'].apply(lambda x: torch.tensor(x, dtype=torch.long, device=device))

        # ✅ **Batch Process Instead of Iterating Over Rows**
        for customer_idx, customer_id in enumerate(test_customers['customer_id'].values):
            relevant_articles = transactions_filtered[transactions_filtered['customer_id'] == customer_id]['article_id'].unique()

            # **Compute SWD for Customer-to-Product**
            customer_embedding = test_embeddings_customer_to_product[customer_idx].unsqueeze(0)
            swd_distances_customer_to_product = [
                (i, sliced_wasserstein_distance(customer_embedding, test_embeddings_product_to_product[i].unsqueeze(0)).item())
                for i in range(test_embeddings_product_to_product.shape[0])
            ]

            # **Sort by SWD Distance (Lower is Better)**
            swd_distances_customer_to_product.sort(key=lambda x: x[1])

            # **Get Top-K Recommendations**
            top_indices_customer_to_product = [item[0] for item in swd_distances_customer_to_product[:k]]
            recommended_articles = [list(article_to_index.keys())[idx] for idx in top_indices_customer_to_product]

            # **Compute Recall@K and NDCG@K**
            recall = recall_at_k(recommended_articles, relevant_articles, k)
            ndcg = ndcg_at_k(recommended_articles, relevant_articles, k)

            all_recall_at_k.append(recall)
            all_ndcg_at_k.append(ndcg)

            # ✅ **Retrieve Positive & Negative Indices**
            pos_indices_product = test_articles.loc[test_articles["article_id"] == recommended_articles[0], "positive_indices"].values[0]
            neg_indices_product = test_articles.loc[test_articles["article_id"] == recommended_articles[0], "negative_indices"].values[0]

            pos_indices_customer_product = test_customers.loc[test_customers["customer_id"] == customer_id, "customer_to_product_positive_indices"].values[0]
            neg_indices_customer_product = test_customers.loc[test_customers["customer_id"] == customer_id, "negative_customer_to_product_indices"].values[0]

            pos_indices_customer = test_customers.loc[test_customers["customer_id"] == customer_id, "positive_indices"].values[0]
            neg_indices_customer = test_customers.loc[test_customers["customer_id"] == customer_id, "negative_indices"].values[0]

            # ✅ **Compute Inter-Trajectory and Intra-Trajectory Loss**
            inter_loss_product = inter_trajectory_loss(test_embeddings_product_to_product[customer_idx].unsqueeze(0), model.reference_embedding)
            intra_loss_product = intra_trajectory_loss(test_embeddings_product_to_product, pos_indices_product, neg_indices_product, {})

            inter_loss_customer_product = inter_trajectory_loss(test_embeddings_customer_to_product[customer_idx].unsqueeze(0), model.reference_embedding)
            intra_loss_customer_product = intra_trajectory_loss(test_embeddings_customer_to_product, pos_indices_customer_product, neg_indices_customer_product, {})

            inter_loss_customer = inter_trajectory_loss(test_embeddings_customer_to_customer[customer_idx].unsqueeze(0), model.reference_embedding)
            intra_loss_customer = intra_trajectory_loss(test_embeddings_customer_to_customer, pos_indices_customer, neg_indices_customer, {})

            # ✅ **Total Loss**
            total_test_loss += (
                inter_loss_product + intra_loss_product +
                inter_loss_customer_product + intra_loss_customer_product +
                inter_loss_customer + intra_loss_customer
            ).item()

    # ✅ **Normalize Loss**
    avg_test_loss = total_test_loss / max(len(test_customers), 1)
    avg_recall_at_k = np.mean(all_recall_at_k)
    avg_ndcg_at_k = np.mean(all_ndcg_at_k)

    # ✅ **Print Metrics**
    print(f"✅ Test Loss: {avg_test_loss:.4f}")
    print(f"✅ Recall@{k}: {avg_recall_at_k:.4f}")
    print(f"✅ NDCG@{k}: {avg_ndcg_at_k:.4f}")

    return avg_test_loss, avg_recall_at_k, avg_ndcg_at_k




In [None]:
# ✅ Example usage of testing
test_loss, recall, ndcg = test_model_full_batch_with_metrics(
    model=trained_model,
    k=10
)

In [None]:
# ✅ **Find Similar Embeddings Using Sliced Wasserstein Distance (Optimized)**
def find_similar_embeddings(target_embedding, reference_embeddings, top_n=10):
    """
    Compute Sliced Wasserstein Distance (SWD) in batch mode.
    Returns indices of the top-N most similar embeddings.
    """
    # ✅ Compute SWD distances efficiently
    distances = torch.tensor([
        sliced_wasserstein_distance(target_embedding.unsqueeze(0), reference_embeddings[i].unsqueeze(0)).item()
        for i in range(reference_embeddings.shape[0])
    ], device=device)

    # ✅ Get top-N closest indices (lower distance is better)
    return torch.argsort(distances)[:top_n].tolist()


In [None]:
# ✅ **Multi-Modal Recommendation Function**
def recommend_articles_any_input(
    model, top_n=10, image_path=None, text_input=None, numeric_input=None, 
    age=None, active=None, club_status=None
):
    """
    Multi-modal product recommendation function.
    Supports **cold-start users** by recommending based on similar customers.
    Accepts: Image, Text, Numeric Features, and Customer Demographics.
    """
    model.eval()
    input_features = []

    # ✅ **Process Image Input (if provided)**
    if image_path:
        image = Image.open(image_path).convert('RGB')
        image = transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = resnet_model(image).squeeze(0).to(device)
        input_features.append(image_embedding)

    # ✅ **Process Text Input (if provided)**
    if text_input:
        words = text_input.split()  
        word_embeddings = np.zeros(100)  
        valid_word_count = 0

        for word in words:
            if word in embeddings_index:  
                word_embeddings += embeddings_index[word]
                valid_word_count += 1

        if valid_word_count > 0:
            word_embeddings /= valid_word_count  
        input_features.append(torch.tensor(word_embeddings, dtype=torch.float32).to(device))

    # ✅ **Process Numeric Input (if provided)**
    if numeric_input:
        numeric_input = np.array(numeric_input).reshape(1, -1)
        scaled_numeric_input = torch.tensor(scaler.transform(numeric_input), dtype=torch.float32).to(device)
        input_features.append(scaled_numeric_input.flatten())

    # ✅ **Handle Cold Start (Customer Attributes Given)**
    if any([age is not None, active is not None, club_status is not None]):
        print("🚀 Cold-Start User Detected: Generating Customer Embedding...")

        # ✅ Create Customer Feature Tensor
        cold_start_input = pd.DataFrame({"Active": [active], "club_member_status": [club_status]})
        dummy_encoded = pd.get_dummies(cold_start_input)

        # ✅ Handle Missing Columns
        missing_cols = [col for col in categorical_features_encoded.columns if col not in dummy_encoded.columns]
        for col in missing_cols:
            dummy_encoded[col] = 0  

        dummy_encoded = dummy_encoded[categorical_features_encoded.columns].astype(np.float32).values
        normalized_age = np.zeros(1) if age is None else scaler_age.transform([[age]]).flatten()

        new_customer_features = np.hstack([normalized_age.reshape(-1, 1), dummy_encoded]).astype(np.float32)
        new_customer_features_tensor = torch.tensor(new_customer_features, dtype=torch.float32).to(device)

        # ✅ Generate Customer Embedding
        with torch.no_grad():
            new_customer_embedding = model(new_customer_features_tensor).squeeze(0).to(device)

        # ✅ Find Similar Customers
        top_similar_customers = find_similar_embeddings(new_customer_embedding, customer_embeddings, top_n)

        # ✅ Retrieve Products Purchased by Similar Customers
        recommended_articles = transactions_filtered[
            transactions_filtered["customer_id"].isin([customers.iloc[i]["customer_id"] for i in top_similar_customers])
        ]
        recommended_articles = recommended_articles.groupby("article_id").size().reset_index(name="purchase_count")
        recommended_articles = recommended_articles.sort_values("purchase_count", ascending=False).head(top_n)
        recommended_articles = articles_with_images[articles_with_images["article_id"].isin(recommended_articles["article_id"])]

        print("🔹 **Final Recommendations for Cold-Start User** 🔹\n")
        for _, row in recommended_articles.iterrows():
            print(f"🛍️ Product: {row['prod_name']}\n📜 Description: {row['text_data']}\n")
            display(Image.open(row['path']))
            print("\n")

        return recommended_articles[['article_id', 'prod_name']]

    # ✅ **Process Multi-Modal Features for Non-Cold Start Users**
    if len(input_features) == 0:
        print("⚠️ No input features provided!")
        return None

    input_features = torch.cat(input_features, dim=0).unsqueeze(0).to(device)

    # ✅ Construct PR Graph & Edge Indices
    customer_to_product_edge_index = graph['customer', 'buys', 'product'].edge_index.to(device)
    product_to_product_edge_index = graph['product', 'co_purchased_with', 'product'].edge_index.to(device)
    customer_to_customer_edge_index = graph['customer', 'similar_to', 'customer'].edge_index.to(device)


    with torch.no_grad():
        all_embeddings = model(
            test_features_tensor, 
            customer_to_product_edge_index, 
            product_to_product_edge_index, 
            customer_to_customer_edge_index  # ✅ Now all edges are correctly passed
        )


    # ✅ Find Top-N Similar Products
    top_indices = find_similar_embeddings(input_features, all_embeddings, top_n)

    # ✅ Retrieve Recommended Products
    recommended_articles = articles_with_images.iloc[top_indices][['article_id', 'prod_name', 'text_data', 'path']]

    print("🔹 **Final Recommendations Based on Multi-Modal Input** 🔹\n")
    for _, row in recommended_articles.iterrows():
        print(f"🛍️ Product: {row['prod_name']}\n📜 Description: {row['text_data']}\n")
        display(Image.open(row['path']))
        print("\n")

    return recommended_articles[['article_id', 'prod_name']]


In [None]:
# ✅ **Example Recommendations**
# Image Input Recommendation
recommend_articles_any_input(model=trained_model, top_n=5, image_path="/kaggle/input/h-and-m-personalized-fashion-recommendations/images/079/0797892010.jpg")

In [None]:
# Text Input Recommendation
recommend_articles_any_input(model=trained_model, top_n=5, text_input="Floral summer dress with short sleeves")

In [None]:
# Numeric Input Recommendation
recommend_articles_any_input(model=trained_model, top_n=5, numeric_input=[22, 5, 14, 8, 1, 3])

In [None]:
# Combined Multi-modal Recommendation
recommend_articles_any_input(model=trained_model, top_n=5, 
    image_path="/kaggle/input/h-and-m-personalized-fashion-recommendations/images/072/0720572001.jpg",
    text_input="Black mini dress",
    numeric_input=[30, 40, 50]
)


In [None]:
# Cold-Start User Recommendation (Using Age)
recommend_articles_any_input(model=trained_model, top_n=5, age=30)