In [8]:
import os, glob, math, gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, default_collate
from torch_geometric.data import HeteroData
from torch_geometric.utils import sort_edge_index
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import roc_auc_score
from torchvision import transforms, models
from PIL import Image
from transformers import DistilBertTokenizer, DistilBertModel



In [9]:
# --------------------------
# Configuration
# --------------------------
class Config:
    text_max_length = 64
    batch_size = 128
    emb_dim = 256
    num_heads = 4
    dropout = 0.5
    lr = 5e-5           # Lower learning rate as recommended
    weight_decay = 1e-5
    epochs = 2
    patience = 5        # For early stopping
    k_folds = 2

config = Config()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True  # may help on GPU


In [3]:
# --------------------------
# 1. Comprehensive Preprocessing
# --------------------------
def load_and_preprocess():
    articles = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")
    customers = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
    transactions = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
    
    # Adjust IDs
    def adjust_id(x):
        x = str(x)
        return "0" + x if len(x) == 9 else x
    transactions["article_id"] = transactions["article_id"].apply(adjust_id)
    articles["article_id"] = articles["article_id"].apply(adjust_id)
    
    # Filter customers
    customers = customers[['customer_id', 'age']].dropna(subset=['age'])
    valid_article_ids = set(articles['article_id'].unique())
    filtered_transactions = transactions[transactions['article_id'].isin(valid_article_ids)]
    
    # Cold-start handling
    transaction_counts = filtered_transactions['customer_id'].value_counts()
    frequent_customers = transaction_counts[transaction_counts > 8].index.tolist()
    cold_start_few = transaction_counts[transaction_counts <= 2].index.tolist()
    cold_start_no = list(set(customers['customer_id']) - set(filtered_transactions['customer_id']))
    
    # Stratified sampling (50-30-20)
    def sample_customers(group, target_size):
        return np.random.choice(group, size=min(len(group), target_size), replace=False)
    sample_sizes = [50000, 30000, 20000]  # total ~100k
    sampled = [
        sample_customers(frequent_customers, sample_sizes[0]),
        sample_customers(cold_start_few, sample_sizes[1]),
        sample_customers(cold_start_no, sample_sizes[2])
    ]
    sampled_customers = np.concatenate(sampled)
    sampled_customers = customers[customers['customer_id'].isin(sampled_customers)].reset_index(drop=True)
    sampled_customers = sampled_customers[['customer_id', 'age']]
    
    # Filter transactions by sampled customers and valid articles
    filtered_trans = filtered_transactions[
        (filtered_transactions['customer_id'].isin(sampled_customers['customer_id'])) &
        (filtered_transactions['article_id'].isin(valid_article_ids))
    ]
    
    # Get image paths
    all_image_paths = glob.glob("/kaggle/input/h-and-m-personalized-fashion-recommendations/images/*/*.jpg")
    valid_ids = set(os.path.splitext(os.path.basename(p))[0] for p in all_image_paths)
    def get_image_path(aid):
        subfolder = aid[:3]
        path = f"/kaggle/input/h-and-m-personalized-fashion-recommendations/images/{subfolder}/{aid}.jpg"
        return path if aid in valid_ids else None
    articles['image_path'] = articles['article_id'].apply(get_image_path)
    articles = articles.dropna(subset=['image_path']).reset_index(drop=True)
    
    # Merge article prices from transactions
    article_prices = filtered_trans[['article_id', 'price']].drop_duplicates(subset=['article_id'])
    articles = articles.merge(article_prices, on='article_id', how='inner')
    articles['detail_desc'] = articles['detail_desc'].fillna('').astype(str)
    articles = articles[['article_id', 'detail_desc', 'image_path', 'price']]
    
    valid_articles = set(articles['article_id'])
    filtered_trans = filtered_trans[filtered_trans['article_id'].isin(valid_articles)]
    
    # Validate image paths
    articles['image_exists'] = articles['image_path'].apply(os.path.exists)
    missing = articles[~articles['image_exists']]
    if len(missing) > 0:
        print(f"Found {len(missing)} articles with missing images. Samples:")
        print(missing.sample(min(5, len(missing))))
        articles = articles[articles['image_exists']].drop(columns=['image_exists'])
    else:
        print("All images validated successfully!")
    
    # Create mapped IDs
    articles = articles.reset_index(drop=True)
    articles["article_mapped_id"] = articles.index
    sampled_customers = sampled_customers.reset_index(drop=True)
    sampled_customers["customer_mapped_id"] = sampled_customers.index
    filtered_trans = filtered_trans.merge(
        articles[['article_id', 'article_mapped_id']],
        on='article_id',
        how='inner'
    ).merge(
        sampled_customers[['customer_id', 'customer_mapped_id']],
        on='customer_id',
        how='inner'
    )
    
    # --------------------------
    # Precompute raw features for products
    # --------------------------
    # Use a FeatureProcessor instance (defined below) to precompute image and text features.
    fp = FeatureProcessor()
    # Precompute image and text features (store as tensors on CPU)
    img_feats, txt_feats = [], []
    for idx, row in tqdm(articles.iterrows(), total=len(articles), desc="Precomputing product features"):
        try:
            # Image features via a pretrained ResNet18
            img = Image.open(row['image_path']).convert('RGB')
            img_tensor = fp.image_transform(img).unsqueeze(0).to(device)
            img_feat = models.resnet18(pretrained=True).eval().to(device)(img_tensor).squeeze().cpu()
            img_feats.append(img_feat)
            
            # Text features via DistilBERT
            text = row['detail_desc']
            text_enc = fp.tokenizer(text, padding='max_length', truncation=True,
                                    max_length=config.text_max_length, return_tensors='pt').to(device)
            txt_feat = DistilBertModel.from_pretrained('distilbert-base-uncased').eval().to(device)(**text_enc).last_hidden_state[:,0].squeeze().cpu()
            txt_feats.append(txt_feat)
        except Exception as e:
            print(f"Error precomputing features for article {row['article_id']}: {e}")
            img_feats.append(torch.zeros(1000))
            txt_feats.append(torch.zeros(768))
    articles['img_feat'] = img_feats
    articles['txt_feat'] = txt_feats
    
    return sampled_customers, filtered_trans, articles


In [4]:
# --------------------------
# 2. Feature Processing
# --------------------------
class FeatureProcessor:
    def __init__(self):
        self.image_transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def process_text(self, texts):
        return self.tokenizer(texts, padding='max_length', truncation=True,
                              max_length=config.text_max_length, return_tensors='pt')


In [10]:
# --------------------------
# Build a product feature dictionary for fast lookup
# --------------------------
def build_product_feature_dict(articles):
    prod_dict = {}
    for idx, row in articles.iterrows():
        prod_dict[int(row['article_mapped_id'])] = {
            'img_feat': row['img_feat'], 
            'txt_feat': row['txt_feat'],
            'price': torch.tensor(row['price'], dtype=torch.float32)
        }
    return prod_dict

In [11]:
# --------------------------
# 3. Simplified Graph Construction (unchanged)
# --------------------------
def build_graph(transactions, articles, customers):
    data = HeteroData()
    data['user'].num_nodes = len(customers)
    data['product'].num_nodes = len(articles)
    edge_index = torch.tensor([
        transactions['customer_mapped_id'].values,
        transactions['article_mapped_id'].values
    ], dtype=torch.long)
    edge_index = sort_edge_index(edge_index)
    data['user', 'buys', 'product'].edge_index = edge_index
    data['product', 'rev_buys', 'user'].edge_index = edge_index.flip(0)
    data['user'].x = torch.tensor(customers['customer_mapped_id'].values, dtype=torch.long)
    data['product'].x = torch.zeros(len(articles), dtype=torch.float32)
    data['product'].price = torch.tensor(articles.price.values, dtype=torch.float32).unsqueeze(1)
    return data

In [12]:
# --------------------------
# 4. Model Architecture
# --------------------------
from torch_geometric.nn import HGTConv

class MultiModalGNN(nn.Module):
    def __init__(self, metadata, num_users, num_products):
        super(MultiModalGNN, self).__init__()
        self.user_emb = nn.Embedding(num_users, config.emb_dim)
        # Product feature layers (used to combine raw features)
        self.img_fc = nn.Linear(1000, config.emb_dim)
        self.txt_fc = nn.Linear(768, config.emb_dim)
        self.price_encoder = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, config.emb_dim)
        )
        self.conv1 = HGTConv(config.emb_dim, config.emb_dim, metadata, heads=config.num_heads)
        self.conv2 = HGTConv(config.emb_dim, config.emb_dim, metadata, heads=config.num_heads)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x_dict, edge_index_dict):
        user_ids = x_dict['user'].to(device)
        x_dict['user'] = self.user_emb(user_ids)
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {k: F.gelu(v) for k, v in x_dict.items()}
        x_dict = {k: self.dropout(v) for k, v in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict


In [13]:
# --------------------------
# 5. Data Splitting
# --------------------------
def create_splits(transactions):
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    train_idx, test_idx = next(splitter.split(transactions, groups=transactions['customer_mapped_id']))
    train_trans = transactions.iloc[train_idx]
    temp_trans = transactions.iloc[test_idx]
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
    val_idx, test_idx = next(splitter.split(temp_trans, groups=temp_trans['customer_mapped_id']))
    return train_trans, temp_trans.iloc[val_idx], temp_trans.iloc[test_idx]


In [15]:
# --------------------------
# 6. Custom Neighbor Loader (unchanged)
# --------------------------
def sample_neighbors(data: HeteroData, edge_type: tuple, src_nodes: torch.Tensor, num_samples: int) -> torch.Tensor:
    edge_index = data[edge_type].edge_index
    src = edge_index[0]
    tgt = edge_index[1]
    sampled_list = []
    for node in src_nodes.tolist():
        mask = (src == node)
        candidates = tgt[mask]
        if candidates.numel() == 0:
            continue
        if candidates.numel() > num_samples:
            perm = torch.randperm(candidates.numel())[:num_samples]
            sampled = candidates[perm]
        else:
            sampled = candidates
        sampled_list.append(sampled)
    if sampled_list:
        sampled_tgts = torch.cat(sampled_list)
    else:
        sampled_tgts = torch.tensor([], dtype=torch.long)
    return torch.unique(sampled_tgts)

class CustomNeighborLoader:
    def __init__(self, data: HeteroData, input_nodes: tuple, batch_size: int,
                 num_neighbors: dict, shuffle: bool = True):
        self.data = data
        self.input_nodes = input_nodes
        self.batch_size = batch_size
        self.num_neighbors = num_neighbors
        self.shuffle = shuffle
        self.node_type = input_nodes[0]
        self.node_indices = input_nodes[1]
        if self.shuffle:
            self.node_indices = self.node_indices[torch.randperm(self.node_indices.size(0))]
        self.num_batches = math.ceil(self.node_indices.size(0) / batch_size)
    
    def __len__(self):
        return self.num_batches
    
    def __iter__(self):
        for i in range(self.num_batches):
            batch_seed = self.node_indices[i * self.batch_size: (i+1) * self.batch_size]
            n1 = self.num_neighbors.get(('user', 'buys', 'product'), [0])[0]
            sampled_products = sample_neighbors(self.data, ('user', 'buys', 'product'), batch_seed, n1)
            n2 = self.num_neighbors.get(('product', 'rev_buys', 'user'), [0])[0]
            sampled_users_hop2 = sample_neighbors(self.data, ('product', 'rev_buys', 'user'), sampled_products, n2)
            final_users = torch.unique(torch.cat([batch_seed, sampled_users_hop2]))
            final_products = sampled_products
            sub_data = HeteroData()
            sorted_users, _ = torch.sort(final_users)
            user_map = {int(u.item()): i for i, u in enumerate(sorted_users)}
            sub_data['user'].num_nodes = sorted_users.size(0)
            sub_data['user'].x = sorted_users.clone().to(torch.long)
            sorted_products, _ = torch.sort(final_products)
            prod_map = {int(p.item()): i for i, p in enumerate(sorted_products)}
            sub_data['product'].num_nodes = sorted_products.size(0)
            sub_data['product'].x = torch.zeros(len(sorted_products), dtype=torch.float32)
            edge_index = self.data['user', 'buys', 'product'].edge_index
            mask = (torch.isin(edge_index[0], sorted_users) & torch.isin(edge_index[1], sorted_products))
            sub_edge_index = edge_index[:, mask].clone()
            for j in range(sub_edge_index.size(1)):
                src = int(sub_edge_index[0, j].item())
                tgt = int(sub_edge_index[1, j].item())
                sub_edge_index[0, j] = user_map[src]
                sub_edge_index[1, j] = prod_map[tgt]
            sub_data['user', 'buys', 'product'].edge_index = sub_edge_index
            edge_index = self.data['product', 'rev_buys', 'user'].edge_index
            mask = (torch.isin(edge_index[0], sorted_products) & torch.isin(edge_index[1], sorted_users))
            sub_edge_index = edge_index[:, mask].clone()
            for j in range(sub_edge_index.size(1)):
                src = int(sub_edge_index[0, j].item())
                tgt = int(sub_edge_index[1, j].item())
                sub_edge_index[0, j] = prod_map[src]
                sub_edge_index[1, j] = user_map[tgt]
            sub_data['product', 'rev_buys', 'user'].edge_index = sub_edge_index
            seed_mask = torch.zeros(sorted_users.size(0), dtype=torch.bool)
            for u in batch_seed.tolist():
                if u in user_map:
                    seed_mask[user_map[u]] = True
            sub_data['user'].seed_mask = seed_mask
            yield sub_data


In [16]:
# --------------------------
# 7. Training & Evaluation with Improvements
# --------------------------
def train(model, train_data, val_data, optimizer, articles, prod_feature_dict, save_path='best_model.pth'):
    best_ndcg = -1
    scaler = torch.cuda.amp.GradScaler()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)
    epochs_no_improve = 0
    for epoch in range(config.epochs):
        model.train()
        epoch_loss = 0
        loader = CustomNeighborLoader(
            data=train_data,
            input_nodes=('user', torch.arange(train_data['user'].num_nodes, device='cpu')),
            batch_size=config.batch_size,
            num_neighbors={('user', 'buys', 'product'): [10],
                           ('product', 'rev_buys', 'user'): [5]},
            shuffle=True
        )
        print(f"\nStarting epoch {epoch+1}")
        for batch_idx, batch in enumerate(loader):
            # print(f"Processing batch {batch_idx}")
            batch = batch.to(device)
            # Instead of loading product features via DataLoader, we quickly look them up.
            prod_indices = batch['product'].x.cpu().numpy().astype(int)
            # Retrieve raw product features from the dictionary and stack them.
            img_feat_list, txt_feat_list, price_list = [], [], []
            for pid in prod_indices:
                feat = prod_feature_dict.get(pid)
                if feat is None:
                    continue
                img_feat_list.append(feat['img_feat'].to(device))
                txt_feat_list.append(feat['txt_feat'].to(device))
                price_list.append(feat['price'].to(device))
            if len(img_feat_list) == 0:
                print("No product features in this batch; skipping.")
                continue
            img_feats = torch.stack(img_feat_list)
            txt_feats = torch.stack(txt_feat_list)
            prices = torch.stack(price_list)
            # Compute product embeddings using the model's product layers
            img_emb = model.img_fc(img_feats)
            txt_emb = model.txt_fc(txt_feats)
            price_emb = model.price_encoder(prices.unsqueeze(1))
            prod_emb_batch = img_emb + txt_emb + price_emb
            # Replace the product node features with the precomputed ones
            batch['product'].x = prod_emb_batch
            
            optimizer.zero_grad()
            with torch.amp.autocast('cuda',enabled=True):
                out = model(batch.x_dict, batch.edge_index_dict)
                # Use only the edges in the subgraph for training
                # pos_edges = batch['user', 'buys', 'product'].edge_index
                # # Extract user and product embeddings corresponding to these edges
                # user_edge_emb = out['user'][pos_edges[0]]
                # prod_edge_emb = out['product'][pos_edges[1]]
                # # For in-batch negatives, we will form a score matrix from all users and products in the batch
                # # Here we assume that batch['user'].x and batch['product'].x are in the same order as in out
                # user_batch_emb = out['user']
                # prod_batch_emb = out['product']
                # scores = torch.matmul(user_batch_emb, prod_batch_emb.t())  # [B, B]
                # pos_scores = scores.diag().unsqueeze(1)  # [B, 1]
                # # Create a mask to remove diagonal elements (self-positive)
                # B = scores.size(0)
                # mask = torch.eye(B, dtype=torch.bool, device=device)
                # neg_scores = scores.masked_fill(mask, -1e9)

                # Get the positive edge indices (shape: [2, E])
                pos_edges = batch['user', 'buys', 'product'].edge_index  
                # Compute positive scores directly for each edge:
                user_pos = out['user'][pos_edges[0]]
                prod_pos = out['product'][pos_edges[1]]
                pos_scores = (user_pos * prod_pos).sum(dim=1)  # shape: [E]
                
                # For each positive edge, sample one negative product from the batch
                num_edges = pos_edges.size(1)
                neg_indices = torch.randint(0, out['product'].size(0), (num_edges,), device=device)
                neg_scores = (user_pos * out['product'][neg_indices]).sum(dim=1)
                
                # Use margin ranking loss (which encourages pos_scores > neg_scores + margin)
                margin = 0.2
                loss = F.margin_ranking_loss(pos_scores, neg_scores, target=torch.ones_like(pos_scores), margin=margin)

            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            epoch_loss += loss.item()
        
        avg_loss = epoch_loss / len(loader)
        if val_data is not None:
            ndcg, recall, auc, map12, recall12 = evaluate(model, val_data, articles)
            print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, NDCG@10={ndcg:.4f}, Recall@10={recall:.4f}, AUC={auc:.4f}, MAP@12={map12:.4f}, Recall@12={recall12:.4f}")
            scheduler.step(ndcg)
            # Early stopping
            if ndcg > best_ndcg:
                best_ndcg = ndcg
                epochs_no_improve = 0
                torch.save(model.state_dict(), save_path)
                print(f"Best model saved at epoch {epoch+1}")
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= config.patience:
                    print("Early stopping triggered.")
                    break

        else:
            print(f"Epoch {epoch+1}: Loss={avg_loss:.4f} (No evaluation)")
    return best_ndcg

in train data load for each fold best modal is overwritting that should not be happen. if validation is none evaluation should not be done. these changes should be done in the above training loop

In [17]:
def evaluate(model, val_data, articles):
    model.eval()
    scaler = torch.cuda.amp.GradScaler()
    loader = CustomNeighborLoader(
        data=val_data,
        input_nodes=('user', torch.arange(val_data['user'].num_nodes, device='cpu')),
        batch_size=config.batch_size,
        num_neighbors={('user', 'buys', 'product'): [10],
                       ('product', 'rev_buys', 'user'): [5]},
        shuffle=False
    )
    all_ndcgs, all_recalls, all_aucs, all_maps_12, all_recalls_12 = [], [], [], [], []
    for batch_idx, batch in enumerate(loader):
        batch = batch.to(device)
        # Load product features from precomputed dictionary
        prod_indices = batch['product'].x.cpu().numpy().astype(int)
        img_feat_list, txt_feat_list, price_list = [], [], []
        for pid in prod_indices:
            feat = prod_feature_dict.get(pid)
            if feat is None:
                continue
            img_feat_list.append(feat['img_feat'].to(device))
            txt_feat_list.append(feat['txt_feat'].to(device))
            price_list.append(feat['price'].to(device))
        if len(img_feat_list) == 0:
            continue
        img_feats = torch.stack(img_feat_list)
        txt_feats = torch.stack(txt_feat_list)
        prices = torch.stack(price_list)
        img_emb = model.img_fc(img_feats)
        txt_emb = model.txt_fc(txt_feats)
        price_emb = model.price_encoder(prices.unsqueeze(1))
        batch['product'].x = (img_emb + txt_emb + price_emb).to(device)
        try:
            with torch.amp.autocast('cuda', enabled=True):
                out = model(batch.x_dict, batch.edge_index_dict)
            user_embeddings = out['user'].detach()
            product_embeddings = out['product'].detach()
            scores = torch.mm(user_embeddings, product_embeddings.t())
            pos_edges = batch['user', 'buys', 'product'].edge_index
            
            ndcg = calculate_ndcg(scores, pos_edges, k=10)
            recall = calculate_recall(scores, pos_edges, k=10)
            auc = calculate_auc(scores, pos_edges)
            map_12 = calculate_map(scores, pos_edges, k=12)
            recall_12 = calculate_recall(scores, pos_edges, k=12)
            all_ndcgs.append(ndcg)
            all_recalls.append(recall)
            all_aucs.append(auc)
            all_maps_12.append(map_12)
            all_recalls_12.append(recall_12)
        except Exception as e:
            print(f"Error during evaluation of batch {batch_idx+1}: {e}")
            continue
    return (np.mean(all_ndcgs) if all_ndcgs else 0,
            np.mean(all_recalls) if all_recalls else 0,
            np.mean(all_aucs) if all_aucs else 0,
            np.mean(all_maps_12) if all_maps_12 else 0,
            np.mean(all_recalls_12) if all_recalls_12 else 0)


In [18]:
# --------------------------
# 8. Metrics (unchanged)
# --------------------------
def calculate_ndcg(scores, edges, k=10):
    scores = scores.detach().cpu()
    user_items = {}
    for u, i in edges.t().tolist():
        if u not in user_items:
            user_items[u] = set()
        user_items[u].add(i)
    ndcgs = []
    for u in user_items.keys():
        relevant_items = user_items[u]
        if len(relevant_items) == 0:
            continue
        user_scores = scores[u]
        top_k_items = torch.topk(user_scores, k=k).indices.tolist()
        dcg = sum(1 / np.log2(rank + 2) for rank, item_id in enumerate(top_k_items) if item_id in relevant_items)
        ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant_items), k)))
        if ideal_dcg > 0:
            ndcgs.append(dcg / ideal_dcg)
    return np.mean(ndcgs) if ndcgs else 0.0

def calculate_recall(scores, edges, k=10):
    user_items = {}
    for u, i in edges.t().tolist():
        user_items.setdefault(u, set()).add(i)
    recalls = []
    for u in range(scores.size(0)):
        pred = set(scores[u].argsort(descending=True)[:k].tolist())
        rel = user_items.get(u, set())
        if len(rel) == 0: continue
        recalls.append(len(pred & rel)/len(rel))
    return np.mean(recalls) if recalls else 0

def calculate_auc(scores, edges):
    pos_pairs = scores[edges[0], edges[1]]
    neg_pairs = scores[torch.randint(0, scores.size(0), (len(pos_pairs),)),
                       torch.randint(0, scores.size(1), (len(pos_pairs),))]
    y_true = torch.cat([torch.ones_like(pos_pairs), torch.zeros_like(neg_pairs)])
    y_score = torch.cat([pos_pairs.sigmoid(), neg_pairs.sigmoid()])
    return roc_auc_score(y_true.cpu().numpy(), y_score.cpu().numpy())

def calculate_map(scores, edges, k=12):
    user_items = {}
    for u, i in edges.t().tolist():
        user_items.setdefault(u, set()).add(i)
    maps = []
    for u in range(scores.size(0)):
        rel = user_items.get(u, set())
        if not rel: continue
        pred = scores[u].argsort(descending=True)[:k].tolist()
        hits, ap = 0, 0
        for i, item in enumerate(pred):
            if item in rel:
                hits += 1
                ap += hits / (i + 1)
        maps.append(ap / min(len(rel), k))
    return np.mean(maps) if maps else 0.0


In [19]:
# --------------------------
# 9. Cross-Validation (unchanged)
# --------------------------
def cross_validate(articles, customers, transactions):
    kf = GroupKFold(config.k_folds)
    users = transactions.customer_mapped_id.unique()
    metrics = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(users, groups=users)):
        print(f"\n=== Fold {fold+1} ===")
        train_users = users[train_idx]
        val_users = users[val_idx]
        fold_train_trans = transactions[transactions.customer_mapped_id.isin(train_users)]
        fold_val_trans = transactions[transactions.customer_mapped_id.isin(val_users)]
        fold_customers = customers[customers['customer_mapped_id'].isin(np.concatenate([train_users, val_users]))].copy()
        fold_article_ids = set(fold_train_trans.article_mapped_id).union(set(fold_val_trans.article_mapped_id))
        fold_articles = articles[articles.article_mapped_id.isin(fold_article_ids)].copy()
        train_data = build_graph(fold_train_trans, fold_articles, fold_customers)
        val_data = build_graph(fold_val_trans, fold_articles, fold_customers)
        num_users = fold_customers['customer_mapped_id'].max() + 1
        num_products = fold_articles['article_mapped_id'].max() + 1
        model = MultiModalGNN(train_data.metadata(), num_users, num_products).to(device)
        optimizer = optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
        _ = train(model, train_data, val_data, optimizer, fold_articles, prod_feature_dict, save_path=f"best_model_fold{fold+1}.pth")
        del optimizer
        torch.cuda.empty_cache() 
    print(f"\nCross-validation complete.")

In [21]:
# import zipfile
import pickle

# --------------------------
# 10. Main Execution: Step-by-Step Approach
# --------------------------
if __name__ == "__main__":
    # Step 1: Preprocess data
    # sampled_customers, transactions, articles = load_and_preprocess()
    PREPROCESSED_DIR = "/kaggle/input/preprocessed-data-7/"

    # Load dataframes
    articles = pd.read_pickle(os.path.join(PREPROCESSED_DIR, "articles.pkl"))
    customers = pd.read_pickle(os.path.join(PREPROCESSED_DIR, "customers.pkl"))
    transactions = pd.read_pickle(os.path.join(PREPROCESSED_DIR, "transactions.pkl"))
    
    print(f"Articles: {len(articles)}, Customers: {len(customers)}, Transactions: {len(transactions)}")
    
    # Make this dictionary globally accessible for evaluate() and train()
    global prod_feature_dict

    with open("/kaggle/input/prod-feature-dict/prod_feature_dict.pkl", "rb") as f:
        prod_feature_dict = pickle.load(f)

    # Step 2: Build product feature dictionary from precomputed features
    # prod_feature_dict = build_product_feature_dict(articles)
    # # Define a directory to save preprocessed files
    # PROD_FEATUREDICT_DIR = "/kaggle/working/prod_feature_dict"
    # ZIP_FILE = "/kaggle/working/prod_feature_dict.zip"
    # os.makedirs(PROD_FEATUREDICT_DIR, exist_ok=True)
    
    # # Save dataframes (articles, customers, transactions)
    # with open(os.path.join(PROD_FEATUREDICT_DIR, "prod_feature_dict.pkl"), 'wb') as f:
    #     pickle.dump(prod_feature_dict, f)
    
    # # ✅ Create a ZIP archive containing all preprocessed files
    # with zipfile.ZipFile(ZIP_FILE, 'w') as zipf:
    #     for file in os.listdir(PROD_FEATUREDICT_DIR):
    #         zipf.write(os.path.join(PROD_FEATUREDICT_DIR, file), arcname=file)
    
    # print(f"✅ MULTI completed! Saved as {ZIP_FILE}")
    
    # Step 3: Create train/val/test splits
    train_trans, val_trans, test_trans = create_splits(transactions)
    # Step 5: Build graphs for train, validation, and test splits
    # train_data = build_graph(train_trans, articles, customers)
    # val_data   = build_graph(val_trans, articles, customers)
    test_data  = build_graph(test_trans, articles, customers)
    # Step 4: Cross-validate on training subset (optional)
    print("Starting cross-validation on training data...")
    cross_validate(articles, customers, train_trans)
    print("Cross-validation complete.")
    
    # ✅ Step A: Combine train + val transactions
    trainval_trans = pd.concat([train_trans, val_trans], ignore_index=True)
    
    # ✅ Step B: Rebuild graph using train + val data
    trainval_data = build_graph(trainval_trans, articles, customers)
    
    # ✅ Step C: Initialize model with same architecture
    model = MultiModalGNN(trainval_data.metadata(), customers['customer_mapped_id'].max() + 1, articles['article_mapped_id'].max() + 1).to(device)
    
    # # ✅ Step D: Load best weights from CV
    # model.load_state_dict(torch.load("best_model.pth"))
    # print("✅ Loaded best model from CV.")
    
    # # ✅ Step E: Reinitialize optimizer (for retraining)
    # optimizer = optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    
    # # ✅ Step F: Retrain model on train + val (no val_data used now)
    # print("\n🔁 Retraining on Train + Validation set...")
    # best_ndcg = train(model, trainval_data, val_data=None, optimizer=optimizer, articles=articles, prod_feature_dict=prod_feature_dict)
    # print("best_ndcg", best_ndcg)
    # # ✅ Step G: Evaluate on the test set
    # print("\n🧪 Final Evaluation on Test Set")
    # ndcg, recall, auc, map12, recall12 = evaluate(model, test_data, articles)
    # print(f"\n✅ Final Test Performance:")
    # print(f"NDCG@10: {ndcg:.4f}")
    # print(f"Recall@10: {recall:.4f}")
    # print(f"AUC: {auc:.4f}")
    # print(f"MAP@12: {map12:.4f}")
    # print(f"Recall@12: {recall12:.4f}")
    
    # # ✅ Step H: Save the retrained final model
    # torch.save({
    #     'state_dict': model.state_dict(),
    #     'metadata': trainval_data.metadata(),
    #     'config': config.__dict__
    # }, "final_model_retrained.pth")
    # print("✅ Retrained model saved as final_model_retrained.pth")






Articles: 80654, Customers: 88647, Transactions: 2092109


  edge_index = torch.tensor([


Starting cross-validation on training data...

=== Fold 1 ===


  scaler = torch.cuda.amp.GradScaler()



Starting epoch 1
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Error during evaluation of batch 433: selected index k out of range
Epoch 1: Loss=0.0738, NDCG@10=0.0214, Recall@10=0.0403, AUC=0.5322, MAP@12=0.0123, Recall@12=0.0493
Best model saved at epoch 1

Starting epoch 2
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 2: Loss=0.0340, NDCG@10=0.0290, Recall@10=0.0532, AUC=0.5728, MAP@12=0.0166, Recall@12=0.0640
Best model saved at epoch 2

Starting epoch 3
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 3: Loss=0.0237, NDCG@10=0.0316, Recall@10=0.0573, AUC=0.5799, MAP@12=0.0181, Recall@12=0.0687
Best model saved at epoch 3

Starting epoch 4
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 4: Loss=0.0205, NDCG@10=0.0326, Recall@10=0.0586, AUC=0.5850, MAP@12=0.0188, Recall@12=0.0700
Best model saved at epoch 4

Starting epoch 5


  scaler = torch.cuda.amp.GradScaler()


Epoch 5: Loss=0.0190, NDCG@10=0.0326, Recall@10=0.0590, AUC=0.5900, MAP@12=0.0187, Recall@12=0.0705

Starting epoch 6
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 6: Loss=0.0179, NDCG@10=0.0336, Recall@10=0.0604, AUC=0.5920, MAP@12=0.0193, Recall@12=0.0722
Best model saved at epoch 6

Starting epoch 7


  scaler = torch.cuda.amp.GradScaler()


Epoch 7: Loss=0.0177, NDCG@10=0.0331, Recall@10=0.0594, AUC=0.5943, MAP@12=0.0190, Recall@12=0.0713

Starting epoch 8


  scaler = torch.cuda.amp.GradScaler()


Epoch 8: Loss=0.0170, NDCG@10=0.0334, Recall@10=0.0602, AUC=0.5937, MAP@12=0.0191, Recall@12=0.0720

Starting epoch 9


  scaler = torch.cuda.amp.GradScaler()


Epoch 9: Loss=0.0164, NDCG@10=0.0341, Recall@10=0.0612, AUC=0.5972, MAP@12=0.0194, Recall@12=0.0732
Best model saved at epoch 9

Starting epoch 10
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 10: Loss=0.0158, NDCG@10=0.0337, Recall@10=0.0604, AUC=0.6000, MAP@12=0.0192, Recall@12=0.0726

Starting epoch 11
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 11: Loss=0.0155, NDCG@10=0.0344, Recall@10=0.0615, AUC=0.6005, MAP@12=0.0196, Recall@12=0.0740
Best model saved at epoch 11

Starting epoch 12


  scaler = torch.cuda.amp.GradScaler()


Epoch 12: Loss=0.0152, NDCG@10=0.0345, Recall@10=0.0613, AUC=0.6007, MAP@12=0.0199, Recall@12=0.0741
Best model saved at epoch 12

Starting epoch 13


  scaler = torch.cuda.amp.GradScaler()


Epoch 13: Loss=0.0149, NDCG@10=0.0355, Recall@10=0.0636, AUC=0.6022, MAP@12=0.0201, Recall@12=0.0761
Best model saved at epoch 13

Starting epoch 14


  scaler = torch.cuda.amp.GradScaler()


Epoch 14: Loss=0.0146, NDCG@10=0.0354, Recall@10=0.0629, AUC=0.6017, MAP@12=0.0202, Recall@12=0.0754

Starting epoch 15


  scaler = torch.cuda.amp.GradScaler()


Epoch 15: Loss=0.0144, NDCG@10=0.0358, Recall@10=0.0635, AUC=0.6025, MAP@12=0.0205, Recall@12=0.0760
Best model saved at epoch 15

Starting epoch 16


  scaler = torch.cuda.amp.GradScaler()


Epoch 16: Loss=0.0143, NDCG@10=0.0356, Recall@10=0.0634, AUC=0.6027, MAP@12=0.0203, Recall@12=0.0758

Starting epoch 17
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 17: Loss=0.0139, NDCG@10=0.0362, Recall@10=0.0645, AUC=0.6039, MAP@12=0.0206, Recall@12=0.0771
Best model saved at epoch 17

Starting epoch 18
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 18: Loss=0.0136, NDCG@10=0.0359, Recall@10=0.0644, AUC=0.6042, MAP@12=0.0203, Recall@12=0.0768

Starting epoch 19


  scaler = torch.cuda.amp.GradScaler()


Epoch 19: Loss=0.0135, NDCG@10=0.0367, Recall@10=0.0652, AUC=0.6058, MAP@12=0.0208, Recall@12=0.0777
Best model saved at epoch 19

Starting epoch 20
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Error during evaluation of batch 433: selected index k out of range
Epoch 20: Loss=0.0134, NDCG@10=0.0355, Recall@10=0.0633, AUC=0.6048, MAP@12=0.0199, Recall@12=0.0759

Starting epoch 21


  scaler = torch.cuda.amp.GradScaler()


Epoch 21: Loss=0.0134, NDCG@10=0.0365, Recall@10=0.0653, AUC=0.6051, MAP@12=0.0207, Recall@12=0.0782

Starting epoch 22
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 22: Loss=0.0132, NDCG@10=0.0372, Recall@10=0.0662, AUC=0.6066, MAP@12=0.0210, Recall@12=0.0787
Best model saved at epoch 22

=== Fold 2 ===


  scaler = torch.cuda.amp.GradScaler()



Starting epoch 1
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 1: Loss=0.0671, NDCG@10=0.0225, Recall@10=0.0427, AUC=0.4950, MAP@12=0.0129, Recall@12=0.0519
Best model saved at epoch 1

Starting epoch 2


  scaler = torch.cuda.amp.GradScaler()


Epoch 2: Loss=0.0361, NDCG@10=0.0259, Recall@10=0.0480, AUC=0.5230, MAP@12=0.0149, Recall@12=0.0580
Best model saved at epoch 2

Starting epoch 3
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 3: Loss=0.0245, NDCG@10=0.0295, Recall@10=0.0535, AUC=0.5678, MAP@12=0.0169, Recall@12=0.0645
Best model saved at epoch 3

Starting epoch 4


  scaler = torch.cuda.amp.GradScaler()


Epoch 4: Loss=0.0213, NDCG@10=0.0331, Recall@10=0.0587, AUC=0.5704, MAP@12=0.0188, Recall@12=0.0706
Best model saved at epoch 4

Starting epoch 5


  scaler = torch.cuda.amp.GradScaler()


Epoch 5: Loss=0.0201, NDCG@10=0.0352, Recall@10=0.0626, AUC=0.5754, MAP@12=0.0199, Recall@12=0.0751
Best model saved at epoch 5

Starting epoch 6
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 6: Loss=0.0193, NDCG@10=0.0364, Recall@10=0.0638, AUC=0.5782, MAP@12=0.0206, Recall@12=0.0767
Best model saved at epoch 6

Starting epoch 7


  scaler = torch.cuda.amp.GradScaler()


Epoch 7: Loss=0.0187, NDCG@10=0.0371, Recall@10=0.0651, AUC=0.5801, MAP@12=0.0209, Recall@12=0.0785
Best model saved at epoch 7

Starting epoch 8


  scaler = torch.cuda.amp.GradScaler()


Epoch 8: Loss=0.0182, NDCG@10=0.0375, Recall@10=0.0659, AUC=0.5842, MAP@12=0.0212, Recall@12=0.0793
Best model saved at epoch 8

Starting epoch 9
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 9: Loss=0.0172, NDCG@10=0.0393, Recall@10=0.0685, AUC=0.5842, MAP@12=0.0222, Recall@12=0.0819
Best model saved at epoch 9

Starting epoch 10
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 10: Loss=0.0168, NDCG@10=0.0402, Recall@10=0.0699, AUC=0.5876, MAP@12=0.0226, Recall@12=0.0837
Best model saved at epoch 10

Starting epoch 11


  scaler = torch.cuda.amp.GradScaler()


Epoch 11: Loss=0.0166, NDCG@10=0.0407, Recall@10=0.0710, AUC=0.5891, MAP@12=0.0228, Recall@12=0.0847
Best model saved at epoch 11

Starting epoch 12


  scaler = torch.cuda.amp.GradScaler()


Epoch 12: Loss=0.0165, NDCG@10=0.0415, Recall@10=0.0717, AUC=0.5908, MAP@12=0.0234, Recall@12=0.0855
Best model saved at epoch 12

Starting epoch 13


  scaler = torch.cuda.amp.GradScaler()


Epoch 13: Loss=0.0163, NDCG@10=0.0419, Recall@10=0.0722, AUC=0.5895, MAP@12=0.0236, Recall@12=0.0858
Best model saved at epoch 13

Starting epoch 14
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 14: Loss=0.0159, NDCG@10=0.0425, Recall@10=0.0730, AUC=0.5930, MAP@12=0.0239, Recall@12=0.0872
Best model saved at epoch 14

Starting epoch 15


  scaler = torch.cuda.amp.GradScaler()


Epoch 15: Loss=0.0158, NDCG@10=0.0434, Recall@10=0.0744, AUC=0.5933, MAP@12=0.0244, Recall@12=0.0885
Best model saved at epoch 15

Starting epoch 16


  scaler = torch.cuda.amp.GradScaler()


Epoch 16: Loss=0.0156, NDCG@10=0.0428, Recall@10=0.0735, AUC=0.5934, MAP@12=0.0241, Recall@12=0.0877

Starting epoch 17


  scaler = torch.cuda.amp.GradScaler()


Epoch 17: Loss=0.0158, NDCG@10=0.0437, Recall@10=0.0747, AUC=0.5952, MAP@12=0.0246, Recall@12=0.0886
Best model saved at epoch 17

Starting epoch 18


  scaler = torch.cuda.amp.GradScaler()


Epoch 18: Loss=0.0154, NDCG@10=0.0429, Recall@10=0.0734, AUC=0.5945, MAP@12=0.0242, Recall@12=0.0877

Starting epoch 19


  scaler = torch.cuda.amp.GradScaler()


Epoch 19: Loss=0.0157, NDCG@10=0.0435, Recall@10=0.0747, AUC=0.5973, MAP@12=0.0244, Recall@12=0.0893

Starting epoch 20
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 20: Loss=0.0150, NDCG@10=0.0441, Recall@10=0.0753, AUC=0.5971, MAP@12=0.0248, Recall@12=0.0896
Best model saved at epoch 20

Starting epoch 21


  scaler = torch.cuda.amp.GradScaler()


Epoch 21: Loss=0.0153, NDCG@10=0.0444, Recall@10=0.0758, AUC=0.5964, MAP@12=0.0249, Recall@12=0.0902
Best model saved at epoch 21

Starting epoch 22
No product features in this batch; skipping.


  scaler = torch.cuda.amp.GradScaler()


Epoch 22: Loss=0.0147, NDCG@10=0.0442, Recall@10=0.0752, AUC=0.5973, MAP@12=0.0248, Recall@12=0.0894

Cross-validation complete.
Cross-validation complete.


fold1
Epoch 22: Loss=0.0132, NDCG@10=0.0372, Recall@10=0.0662, AUC=0.6066, MAP@12=0.0210, Recall@12=0.0787
Best model saved at epoch 22
fold 2
Epoch 22: Loss=0.0147, NDCG@10=0.0442, Recall@10=0.0752, AUC=0.5973, MAP@12=0.0248, Recall@12=0.0894


In [20]:
import pickle

# # ✅ Step A: Combine train + val transactions
# trainval_trans = pd.concat([train_trans, val_trans], ignore_index=True)

# # ✅ Step B: Rebuild graph using train + val data
# trainval_data = build_graph(trainval_trans, articles, customers)

# # ✅ Step C: Initialize model with same architecture
# model = MultiModalGNN(trainval_data.metadata(), customers['customer_mapped_id'].max() + 1, articles['article_mapped_id'].max() + 1).to(device)

# # ✅ Step D: Load best weights from CV
# model.load_state_dict(torch.load("best_model.pth"))
# print("✅ Loaded best model from CV.")

# # ✅ Step E: Reinitialize optimizer (for retraining)
# optimizer = optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)

# # ✅ Step F: Retrain model on train + val (no val_data used now)
# print("\n🔁 Retraining on Train + Validation set...")
# best_ndcg = train(model, trainval_data, val_data=None, optimizer=optimizer, articles=articles, prod_feature_dict=prod_feature_dict)
# test_data  = build_graph(test_trans, articles, customers)
# # ✅ Step G: Evaluate on the test set
# print("\n🧪 Final Evaluation on Test Set")
# ndcg, recall, auc = evaluate(model, test_data, articles)
# print(f"\n✅ Final Test Performance:")
# print(f"NDCG@10: {ndcg:.4f}")
# print(f"Recall@10: {recall:.4f}")
# print(f"AUC: {auc:.4f}")

# # ✅ Step H: Save the retrained final model
# torch.save({
#     'state_dict': model.state_dict(),
#     'metadata': trainval_data.metadata(),
#     'config': config.__dict__
# }, "final_model_retrained.pth")
# print("✅ Retrained model saved as final_model_retrained.pth")
PREPROCESSED_DIR = "/kaggle/input/preprocessed-data-7/"

# Load dataframes
articles = pd.read_pickle(os.path.join(PREPROCESSED_DIR, "articles.pkl"))
customers = pd.read_pickle(os.path.join(PREPROCESSED_DIR, "customers.pkl"))
transactions = pd.read_pickle(os.path.join(PREPROCESSED_DIR, "transactions.pkl"))

print(f"Articles: {len(articles)}, Customers: {len(customers)}, Transactions: {len(transactions)}")

# Make this dictionary globally accessible for evaluate() and train()
global prod_feature_dict

with open("/kaggle/input/prod-feature-dict/prod_feature_dict.pkl", "rb") as f:
    prod_feature_dict = pickle.load(f)

train_trans, val_trans, test_trans = create_splits(transactions)

test_data  = build_graph(test_trans, articles, customers)


# ✅ Step A: Combine train + val transactions
trainval_trans = pd.concat([train_trans, val_trans], ignore_index=True)

# ✅ Step B: Rebuild graph using train + val data
trainval_data = build_graph(trainval_trans, articles, customers)

# Load the checkpoint
checkpoint = torch.load("/kaggle/input/not-cold-start-halfly-trained/other/default/1/best_model (1).pth", weights_only=True)
state_dict = checkpoint
model = MultiModalGNN(trainval_data.metadata(), customers['customer_mapped_id'].max() + 1, articles['article_mapped_id'].max() + 1).to(device)

# Get the checkpoint user embeddings and current model's user embeddings
old_user_emb = state_dict["user_emb.weight"]   # Shape: [55299, 256]
new_user_emb = model.user_emb.weight             # Shape: [88647, 256]

# Check how many rows to copy
num_overlap = old_user_emb.size(0)

# Replace the first num_overlap rows of the new model's embedding with the loaded weights
new_user_emb.data[:num_overlap] = old_user_emb

# Optionally, leave the remaining rows as is (randomly initialized) or initialize them as desired
state_dict["user_emb.weight"] = new_user_emb

# Now load state_dict with strict=False to avoid errors on the rest of the mismatched keys
model.load_state_dict(state_dict, strict=False)

print("✅ Loaded best model from CV.")


# ✅ Step E: Reinitialize optimizer (for retraining)
optimizer = optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)

# ✅ Step F: Retrain model on train + val (no val_data used now)
print("\n🔁 Retraining on Train + Validation set...")
best_ndcg = train(model, trainval_data, val_data=None, optimizer=optimizer, articles=articles, prod_feature_dict=prod_feature_dict)
print("best_ndcg", best_ndcg)
# ✅ Step G: Evaluate on the test set
print("\n🧪 Final Evaluation on Test Set")
ndcg, recall, auc, map12, recall12 = evaluate(model, test_data, articles)
print(f"\n✅ Final Test Performance:")
print(f"NDCG@10: {ndcg:.4f}")
print(f"Recall@10: {recall:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MAP@12: {map12:.4f}")
print(f"Recall@12: {recall12:.4f}")

# ✅ Step H: Save the retrained final model
torch.save({
    'state_dict': model.state_dict(),
    'metadata': trainval_data.metadata(),
    'config': config.__dict__
}, "final_model_retrained.pth")
print("✅ Retrained model saved as final_model_retrained.pth")


Articles: 80654, Customers: 88647, Transactions: 2092109


  edge_index = torch.tensor([


✅ Loaded best model from CV.

🔁 Retraining on Train + Validation set...

Starting epoch 1


  scaler = torch.cuda.amp.GradScaler()


Epoch 1: Loss=0.0199 (No evaluation)

Starting epoch 2
Epoch 2: Loss=0.0196 (No evaluation)
best_ndcg -1

🧪 Final Evaluation on Test Set


  scaler = torch.cuda.amp.GradScaler()



✅ Final Test Performance:
NDCG@10: 0.1006
Recall@10: 0.1783
AUC: 0.7079
MAP@12: 0.0648
Recall@12: 0.2076
✅ Retrained model saved as final_model_retrained.pth


In [None]:
emd dim 256

fold 1
Epoch 1: Loss=0.0754, NDCG=0.0356, Recall=0.0688, AUC=0.5062
Epoch 2: Loss=0.0386, NDCG=0.0432, Recall=0.0816, AUC=0.5930
Epoch 3: Loss=0.0268, NDCG=0.0499, Recall=0.0926, AUC=0.6063
fold 2
Epoch 1: Loss=0.0758, NDCG=0.0349, Recall=0.0681, AUC=0.4909
Epoch 2: Loss=0.0350, NDCG=0.0431, Recall=0.0818, AUC=0.6007
Epoch 3: Loss=0.0264, NDCG=0.0481, Recall=0.0894, AUC=0.6121
fold 3
Epoch 1: Loss=0.0750, NDCG=0.0333, Recall=0.0654, AUC=0.5014
Epoch 2: Loss=0.0350, NDCG=0.0423, Recall=0.0803, AUC=0.5975
Epoch 3: Loss=0.0254, NDCG=0.0467, Recall=0.0878, AUC=0.6083


Final Test Performance:
NDCG@10: 0.0817
Recall@10: 0.1535
AUC: 0.6685

In [None]:
em dim 384
fold 1
Epoch 1: Loss=0.1663, NDCG@10=0.0101, Recall@10=0.0190, AUC=0.4867, MAP@12=0.0059, Recall@12=0.0211

In [None]:
em dim 128
fold 1
Epoch 1: Loss=0.1755, NDCG@10=0.0036, Recall@10=0.0066, AUC=0.4642, MAP@12=0.0018, Recall@12=0.0084

In [None]:
batch_size = 128
emb_dim = 256
final training
fold 1
Epoch 1: Loss=0.0772, NDCG@10=0.0341, Recall@10=0.0658, AUC=0.4826, MAP@12=0.0205, Recall@12=0.0796
Epoch 2: Loss=0.0338, NDCG@10=0.0431, Recall@10=0.0815, AUC=0.5986, MAP@12=0.0258, Recall@12=0.0977
Epoch 3: Loss=0.0258, NDCG@10=0.0470, Recall@10=0.0882, AUC=0.6086, MAP@12=0.0281, Recall@12=0.1056
Epoch 4: Loss=0.0229, NDCG@10=0.0510, Recall@10=0.0937, AUC=0.6146, MAP@12=0.0307, Recall@12=0.1120
Epoch 5: Loss=0.0210, NDCG@10=0.0544, Recall@10=0.0986, AUC=0.6199, MAP@12=0.0326, Recall@12=0.1170
Epoch 6: Loss=0.0202, NDCG@10=0.0549, Recall@10=0.0996, AUC=0.6202, MAP@12=0.0330, Recall@12=0.1186

In [None]:
22