In [None]:
import kagglehub

# Download the latest version of the dataset through kagglehub and keep the
# value it returns (printed below as the path to the dataset files).
path = kagglehub.dataset_download("khadijamouhta/embedingsctr")

# Show where the files are so the paths used by later cells can be checked.
print("Path to dataset files:", path)

In [1]:
# 1. Cloner ton repo
!git clone https://github.com/khadijamouhtaj55-blip/Competition.git
%cd Competition

# 2. Installer les d√©pendances minimales
!pip install fuxictr==2.3.7


Cloning into 'Competition'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 43 (delta 12), reused 35 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (43/43), 64.26 KiB | 2.47 MiB/s, done.
Resolving deltas: 100% (12/12), done.
/kaggle/working/Competition
Collecting fuxictr==2.3.7
  Downloading fuxictr-2.3.7-py3-none-any.whl.metadata (29 kB)
Collecting keras-preprocessing (from fuxictr==2.3.7)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading fuxictr-2.3.7-py3-none-any.whl (88 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m88.1/88.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [1]:
!pip install gensim




In [4]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from sklearn.decomposition import PCA
from gensim.models import Word2Vec 

# =============================================================================
# CONFIGURATION
# =============================================================================
# Input locations: everything is read from the attached Kaggle dataset.
DATASET_ROOT = "/kaggle/input/embedingsctr"
# Directory of item pictures; MultimodalDataset looks for "<item_id>.jpg" in it.
IMG_DIR = os.path.join(DATASET_ROOT, "item_images")
# Item table loaded in step 1 of the main script below.
ITEM_INFO_PATH = os.path.join(DATASET_ROOT, "item_info (1).parquet")
# Training interactions; its 'item_seq' column feeds Word2Vec.
TRAIN_PATH = os.path.join(DATASET_ROOT, "train.parquet")
# Destination of the item table once the 'item_emb_d128' column has been added.
OUTPUT_PATH = "/kaggle/working/item_info_updated.parquet"

# DataLoader batch size for the CLIP pass.
BATCH_SIZE = 128
# Worker count shared by the DataLoader and by Word2Vec training.
NUM_WORKERS = 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Final embedding width: PCA component count and Word2Vec vector size.
TARGET_DIM = 128
# Word2Vec context window and minimum token frequency.
W2V_WINDOW = 5
W2V_MIN_COUNT = 5
# =============================================================================

# --- MultimodalDataset class (fixed version) ---
class MultimodalDataset(Dataset):
    """Dataset yielding one text prompt and one RGB image per item row.

    The prompt is assembled from the optional ``title`` and ``item_tags``
    columns; the image is read from ``<img_dir>/<item_id>.jpg``.

    Args:
        df: Item table; must contain an ``item_id`` column.
        img_dir: Directory holding one JPEG per item.
        processor: Kept on the instance for callers; not used by this class
            (the batch is tokenised/preprocessed in the collate function).
    """

    # Size of the black placeholder substituted for a missing/unreadable image.
    _PLACEHOLDER_SIZE = (224, 224)

    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _build_text(row, item_id):
        """Return the prompt for ``row``: title and tags joined by ' [SEP] '."""
        text_parts = []

        # Title is a scalar value.
        if 'title' in row and pd.notna(row['title']):
            text_parts.append(str(row['title']))

        # Tags may be a list/array (checked first, because pd.notna on an array
        # is element-wise and cannot be used as a condition) or a scalar.
        if 'item_tags' in row:
            tags_value = row['item_tags']
            if isinstance(tags_value, (list, np.ndarray)):
                if len(tags_value) > 0:
                    valid_tags = [str(tag) for tag in tags_value if pd.notna(tag)]
                    if valid_tags:
                        text_parts.append(" ".join(valid_tags))
            elif pd.notna(tags_value):
                text_parts.append(str(tags_value))

        text = " [SEP] ".join(text_parts).strip()
        if not text:
            # No usable metadata: fall back to a generic prompt.
            text = f"Image of product with ID {item_id}"
        return text

    def _load_image(self, img_path):
        """Load ``img_path`` as RGB, or return a black placeholder on failure."""
        try:
            # The context manager closes the file handle; convert() returns an
            # independent, fully loaded image.
            with Image.open(img_path) as img:
                return img.convert("RGB")
        except Exception:
            # Missing or corrupt file. `Exception` (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            return Image.new('RGB', self._PLACEHOLDER_SIZE, color='black')

    def __getitem__(self, idx):
        """Return ``{'text': str, 'image': PIL image}`` for the row at ``idx``."""
        row = self.df.iloc[idx]
        item_id = row['item_id']
        img_path = os.path.join(self.img_dir, f"{item_id}.jpg")

        text = self._build_text(row, item_id)
        image = self._load_image(img_path)
        return {'text': text, 'image': image}


def collate_fn(batch):
    """Collate dataset samples into a single CLIP processor call.

    Reads the module-level ``processor``; each sample must provide the keys
    ``'text'`` and ``'image'``. Returns whatever the processor returns for the
    padded, truncated batch (PyTorch tensors requested via ``return_tensors``).
    """
    texts = list(map(lambda sample: sample['text'], batch))
    images = list(map(lambda sample: sample['image'], batch))
    return processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )


# =============================================================================
# FONCTIONS D'EMBEDDING
# =============================================================================

def extract_content_embeddings(df_items, processor, model, target_dim):
    """Step B: CLIP image+text embeddings reduced with PCA to ``target_dim``.

    For every batch the image and text embeddings returned by ``model`` are
    averaged and L2-normalised; the stacked result is PCA-reduced and each row
    is L2-normalised again.

    Args:
        df_items: Item table (needs ``item_id`` plus the columns read by
            ``MultimodalDataset``).
        processor: Forwarded to ``MultimodalDataset``.
        model: Model called as ``model(**batch)`` whose output exposes
            ``image_embeds`` and ``text_embeds``.
        target_dim: Number of PCA components to keep.

    Returns:
        dict mapping each ``item_id`` to its reduced, normalised vector.
    """
    print("\nüîÑ Extraction des embeddings de Contenu (CLIP)...")
    loader = DataLoader(
        MultimodalDataset(df_items, IMG_DIR, processor),
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        shuffle=False,  # keep row order so vectors line up with df_items
        collate_fn=collate_fn,
    )

    chunks = []
    with torch.no_grad():
        for inputs in tqdm(loader, desc="Processing CLIP"):
            on_device = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}
            clip_out = model(**on_device)

            # Average the two modalities, then rescale to unit length.
            mean_emb = (clip_out.image_embeds + clip_out.text_embeds) / 2.0
            mean_emb = mean_emb / mean_emb.norm(dim=-1, keepdim=True)

            chunks.append(mean_emb.cpu().numpy())

    clip_matrix = np.concatenate(chunks, axis=0)

    # Dimensionality reduction (seeded for reproducibility).
    reducer = PCA(n_components=target_dim, random_state=42)
    reduced = reducer.fit_transform(clip_matrix)

    # Final row-wise L2 normalisation; the epsilon guards all-zero rows.
    row_norms = np.linalg.norm(reduced, axis=1, keepdims=True)
    reduced = reduced / (row_norms + 1e-8)

    return dict(zip(df_items['item_id'], reduced))


def extract_collaborative_embeddings(df_items, train_path, target_dim):
    """Step A: learn collaborative item vectors with skip-gram Word2Vec.

    Every ``item_seq`` entry of the training table with more than one element
    is used as a "sentence" whose tokens are the stringified item ids.

    Args:
        df_items: Item table; only its ``item_id`` column is read.
        train_path: Parquet file containing an ``item_seq`` column.
        target_dim: Size of the learned vectors.

    Returns:
        dict mapping each ``item_id`` of ``df_items`` to a float32 vector,
        L2-normalised; items absent from the Word2Vec vocabulary (unseen or
        rarer than ``W2V_MIN_COUNT``) map to the zero vector.
    """
    print("\nüîÑ Extraction des embeddings Collaboratifs (Word2Vec)...")

    # Only `item_seq` is consumed below, so project that single column at read
    # time instead of materialising the whole training table in memory.
    df_train = pd.read_parquet(train_path, columns=['item_seq'])
    sequences = [
        [str(item_id) for item_id in seq]
        for seq in df_train['item_seq']
        if isinstance(seq, (list, np.ndarray)) and len(seq) > 1
    ]

    # sg=1 requests skip-gram explicitly (it is not gensim's default mode).
    model_w2v = Word2Vec(
        sentences=sequences,
        vector_size=target_dim,
        window=W2V_WINDOW,
        min_count=W2V_MIN_COUNT,
        sg=1,
        workers=NUM_WORKERS,
        seed=42
    )

    collaborative_embeddings = {}
    for item_id in tqdm(df_items['item_id'], desc="Extracting W2V"):
        item_id_str = str(item_id)
        if item_id_str in model_w2v.wv:
            emb = model_w2v.wv[item_id_str].astype(np.float32)
        else:
            # Out-of-vocabulary item: zero vector (stays zero after scaling).
            emb = np.zeros(target_dim, dtype=np.float32)

        # L2-normalise; the epsilon avoids a division by zero for zero vectors.
        collaborative_embeddings[item_id] = emb / (np.linalg.norm(emb) + 1e-8)

    return collaborative_embeddings


def fuse_embeddings(df_items, content_embs, collab_embs, target_dim, alpha=0.5):
    """Step C: fuse content and collaborative vectors by weighted mean.

    ``fused = alpha * content + (1 - alpha) * collaborative``, followed by L2
    normalisation. The default ``alpha=0.5`` is the plain average. Because of
    the final normalisation, an item whose vector is zero/missing on one side
    ends up with the direction of the other side alone.

    Args:
        df_items: Item table; only its ``item_id`` column is read.
        content_embs: dict ``item_id -> vector`` of content embeddings.
        collab_embs: dict ``item_id -> vector`` of collaborative embeddings.
        target_dim: Vector length, used for the zero fallback of missing ids.
        alpha: Weight of the content vector, in ``[0, 1]``.

    Returns:
        dict mapping each ``item_id`` to its fused, normalised vector.

    Raises:
        ValueError: if ``alpha`` is outside ``[0, 1]``.
    """
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be in [0, 1], got {alpha}")

    # Shared fallback for ids missing from either dict; never mutated because
    # every arithmetic step below allocates a new array.
    zero_vector = np.zeros(target_dim, dtype=np.float32)

    fused_embeddings = {}
    for item_id in tqdm(df_items['item_id'], desc="Fusing Embeddings"):
        emb_content = content_embs.get(item_id, zero_vector)
        emb_collab = collab_embs.get(item_id, zero_vector)

        emb_fused = alpha * emb_content + (1.0 - alpha) * emb_collab

        # Final L2 normalisation; the epsilon guards the all-zero case.
        emb_fused = emb_fused / (np.linalg.norm(emb_fused) + 1e-8)

        fused_embeddings[item_id] = emb_fused

    return fused_embeddings

# =============================================================================
# MAIN
# =============================================================================
# Driver of the embedding cell: load the item table, build the content and
# collaborative embeddings, fuse them and write the updated table to disk.
print(f"üöÄ TASK 1: FUSION MULTIMODAL + COLLABORATIF")
print("="*80)

# 1. Load the item table
print(f"\n{'='*80}\n√âTAPE 1: CHARGEMENT ITEM_INFO\n{'='*80}")
try:
    df_items = pd.read_parquet(ITEM_INFO_PATH)
except FileNotFoundError:
    # Print the offending path, then let the original error propagate.
    print(f"‚ùå ERREUR: Fichier non trouv√© √† {ITEM_INFO_PATH}.")
    raise
    
print(f"‚úÖ {len(df_items)} items charg√©s")

# 2. Initialise CLIP
print(f"\n{'='*80}\n√âTAPE 2: INITIALISATION CLIP\n{'='*80}")
model_name = "openai/clip-vit-base-patch32"
# `processor` must stay a module-level name: collate_fn reads it as a global.
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)
model.to(DEVICE)
model.eval()
print("‚úÖ CLIP charg√©")

# 3. Extract both embedding families
print(f"\n{'='*80}\n√âTAPE 3: EXTRACTION DES DEUX EMBEDDINGS (128D)\n{'='*80}")

# 3.1. Content-based (CLIP + PCA)
content_embs = extract_content_embeddings(df_items, processor, model, TARGET_DIM)

# 3.2. Collaborative (Word2Vec)
collaborative_embs = extract_collaborative_embeddings(df_items, TRAIN_PATH, TARGET_DIM)


# 4. Fusion
print(f"\n{'='*80}\n√âTAPE 4: FUSION COLLABORATIF + CONTENT\n{'='*80}")
fused_embs = fuse_embeddings(df_items, content_embs, collaborative_embs, TARGET_DIM)
# One vector per row, in the row order of df_items.
fused_emb_list = [fused_embs[item_id] for item_id in df_items['item_id']]

# 5. Update the item table
print(f"\n{'='*80}\n√âTAPE 5: UPDATE ITEM_INFO.PARQUET\n{'='*80}")
# Drop a pre-existing embedding column so it is replaced rather than reused.
if 'item_emb_d128' in df_items.columns:
    df_items = df_items.drop(columns=['item_emb_d128'], errors='ignore')

df_items['item_emb_d128'] = fused_emb_list

# Save
df_items.to_parquet(OUTPUT_PATH, index=False)
print(f"‚úÖ Fichier sauvegard√©: {OUTPUT_PATH}")
print(f"\n{'='*80}\nüéâ TASK 1 FUSIONN√âE TERMIN√âE !\n{'='*80}")

2025-12-14 19:09:27.396423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765739367.582660      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765739367.636986      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

üöÄ TASK 1: FUSION MULTIMODAL + COLLABORATIF

√âTAPE 1: CHARGEMENT ITEM_INFO


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


‚úÖ 91718 items charg√©s

√âTAPE 2: INITIALISATION CLIP


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

‚úÖ CLIP charg√©

√âTAPE 3: EXTRACTION DES DEUX EMBEDDINGS (128D)

üîÑ Extraction des embeddings de Contenu (CLIP)...


Processing CLIP:   0%|          | 0/717 [00:00<?, ?it/s]


üîÑ Extraction des embeddings Collaboratifs (Word2Vec)...


Extracting W2V:   0%|          | 0/91718 [00:00<?, ?it/s]


√âTAPE 4: FUSION COLLABORATIF + CONTENT


Fusing Embeddings:   0%|          | 0/91718 [00:00<?, ?it/s]


√âTAPE 5: UPDATE ITEM_INFO.PARQUET
‚úÖ Fichier sauvegard√©: /kaggle/working/item_info_updated.parquet

üéâ TASK 1 FUSIONN√âE TERMIN√âE !


In [7]:
# ==================================================================================
# MM-CTR TASK 2: DCN-DIN (adapted to the fused 128-D embeddings)
# ==================================================================================

import os
import gc
import sys
import zipfile
import subprocess
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 1. SETUP
# ----------------------------------------------------------------------------------
# polars is optional in the base image: import it, installing it on the fly
# with the current interpreter's pip when the first import fails.
try:
    import polars as pl
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "polars"])
    import polars as pl

# ==================================================================================
# 2. CONFIGURATION (uses the output of the fusion script)
# ==================================================================================

class Config:
    """Paths and hyper-parameters shared by the DCN-DIN training cell."""

    # --- Paths -------------------------------------------------------------
    BASE_DIR = '/kaggle/working'
    DATA_DIR = '/kaggle/input/embedingsctr'
    # Item table written by the fusion script (holds the fused embeddings).
    EMB_PATH = f'{BASE_DIR}/item_info_updated.parquet'
    # Checkpoints and predictions both go to the working directory.
    MODEL_SAVE_DIR = BASE_DIR
    PRED_SAVE_DIR = BASE_DIR

    # --- Model hyper-parameters ---------------------------------------------
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    EMBED_DIM = 128          # width of the fused item embeddings
    SIDE_EMBED_DIM = 8       # width of the likes/views embeddings
    CROSS_LAYERS = 3
    HIDDEN_UNITS = [128, 64]
    DROPOUT = 0.1

    # --- Training -------------------------------------------------------------
    BATCH_SIZE = 4096
    LR = 0.001
    EPOCHS = 2

# Single shared settings object read by the functions and classes below.
config = Config()
# Make sure both output directories exist before anything is written.
os.makedirs(config.MODEL_SAVE_DIR, exist_ok=True)
os.makedirs(config.PRED_SAVE_DIR, exist_ok=True)
print(f"DEVICE: {config.DEVICE}, Embed Dim: {config.EMBED_DIM}")

# ==================================================================================
# 3. ADVANCED LAYERS (fixed __init__)
# ==================================================================================
class Dice(nn.Module):
    """Dice activation: a sigmoid gate blends ``x`` with ``alpha * x``.

    The gate is ``sigmoid(BatchNorm1d(x))`` and ``alpha`` is a learnable slope
    initialised to zero.

    Args:
        num_features: Feature count for the batch norm and for ``alpha``.
        dim: ``2`` stores ``alpha`` with shape ``(num_features,)``; any other
            value stores it with shape ``(num_features, 1)``.
    """

    def __init__(self, num_features, dim=2):
        super().__init__()
        self.bn = nn.BatchNorm1d(num_features, eps=1e-9)
        self.sig = nn.Sigmoid()
        alpha_shape = (num_features,) if dim == 2 else (num_features, 1)
        self.alpha = nn.Parameter(torch.zeros(alpha_shape))

    def forward(self, x):
        gate = self.sig(self.bn(x))
        passed = gate * x                       # share of x let through unchanged
        leaked = (1 - gate) * self.alpha * x    # remainder, scaled by alpha
        return passed + leaked

class CrossNet(nn.Module):
    """Stack of vector-form cross layers (Deep & Cross Network).

    Each layer computes ``x_next = x_0 * (x_l . w) + b + x_l`` where ``x_0`` is
    the layer-stack input, ``w`` a ``(in_features, 1)`` kernel and ``b`` a
    ``(in_features,)`` bias. The output has the same shape as the 2-D input.

    Args:
        in_features: Width of the input vectors.
        layer_num: Number of cross layers.
    """

    def __init__(self, in_features, layer_num=2):
        super().__init__()
        self.layer_num = layer_num
        kernels = [nn.Parameter(torch.empty(in_features, 1)) for _ in range(layer_num)]
        biases = [nn.Parameter(torch.empty(in_features)) for _ in range(layer_num)]
        self.kernels = nn.ParameterList(kernels)
        self.biases = nn.ParameterList(biases)

        # Xavier-normal kernels, zero biases.
        for kernel, bias in zip(self.kernels, self.biases):
            nn.init.xavier_normal_(kernel)
            nn.init.zeros_(bias)

    def forward(self, inputs):
        base = inputs.unsqueeze(2)          # (batch, features, 1)
        state = base
        for kernel, bias in zip(self.kernels, self.biases):
            # Scalar projection of the current state, one value per sample.
            projection = state.transpose(1, 2) @ kernel
            state = base @ projection + bias.unsqueeze(1) + state
        return state.squeeze(2)
# ==================================================================================
# 4. DATA LOADING (adapted to the item_emb_d128 column)
# ==================================================================================
def load_assets():
    """Load the fused item embeddings and build the id lookup.

    Reads ``config.EMB_PATH`` and uses its ``item_id`` and ``item_emb_d128``
    columns.

    Returns:
        tuple ``(weights, id_to_idx)``: ``weights`` is a float32 tensor whose
        row 0 is an all-zero padding row followed by one row per item, and
        ``id_to_idx`` maps each ``item_id`` to its row index (starting at 1).

    Raises:
        Exception: whatever reading the parquet file raised, re-raised after a
            hint has been printed.
    """
    print("  Loading FUSED Embeddings (128D) from custom file...")
    try:
        # pandas is used here (not polars), per the original author's note
        # about polars and NumPy array columns.
        item_table = pd.read_parquet(config.EMB_PATH)
    except Exception as e:
        print(f"ERREUR: Impossible de lire le fichier d'embeddings √† {config.EMB_PATH}.")
        print("Assurez-vous d'avoir ex√©cut√© le script de fusion avant.")
        raise e

    # item_id -> row index; index 0 is reserved for padding/unknown ids.
    id_to_idx = {}
    for position, real_id in enumerate(item_table['item_id'].to_list(), start=1):
        id_to_idx[real_id] = position

    # One row per item, taken from the fused embedding column.
    item_vectors = np.stack(item_table['item_emb_d128'].to_numpy(), dtype=np.float32)

    # Prepend the zero padding row (index 0).
    zero_row = np.zeros((1, config.EMBED_DIM), dtype=np.float32)
    final_matrix = np.vstack([zero_row, item_vectors])

    print(f" Combined Matrix Shape: {final_matrix.shape}")
    del item_table, item_vectors
    gc.collect()
    return torch.tensor(final_matrix), id_to_idx

# Load the embedding matrix and id map once, as module-level globals used by
# the datasets and the model below.
try:
    PRETRAINED_WEIGHTS, ID_MAP = load_assets()
except Exception as e:
    # Without the embeddings nothing below can run: report and stop with a
    # non-zero exit status.
    print(f"Arr√™t d√ª √† l'erreur de chargement: {e}")
    sys.exit(1)


# ==================================================================================
# 4b. DATASET (__init__ generates sequential IDs for the test split)
# ==================================================================================

class RichDataset(Dataset):
    """In-memory dataset built from one parquet split.

    Columns read: ``item_id``, ``item_seq``, ``likes_level``, ``views_level``
    and, unless ``is_test`` is set, ``label``.

    Args:
        parquet_path: File to load (read with polars).
        id_map: dict mapping raw item ids to embedding row indices; ids missing
            from it are mapped to 0.
        is_test: When true, labels are all zeros and ``ids`` holds
            ``0..n_rows-1``; otherwise ``ids`` is ``None``.
    """

    def __init__(self, parquet_path, id_map, is_test=False):
        print(f" Reading {os.path.basename(parquet_path)}...")
        frame = pl.read_parquet(parquet_path)
        n_rows = len(frame)

        def to_index(raw_ids):
            # Unknown ids fall back to index 0.
            return np.asarray([id_map.get(raw, 0) for raw in raw_ids], dtype=np.int32)

        self.target = to_index(frame['item_id'].to_numpy())

        # Map the history sequences id by id, then restore the stacked shape.
        seq_matrix = np.stack(frame['item_seq'].to_numpy())
        self.history = to_index(seq_matrix.ravel()).reshape(seq_matrix.shape)

        self.likes = frame['likes_level'].to_numpy().astype(np.int32)
        self.views = frame['views_level'].to_numpy().astype(np.int32)

        self.is_test = is_test

        if is_test:
            # No labels in the test split: zero placeholders, and a sequential
            # id derived from the row count.
            self.label = np.zeros(n_rows, dtype=np.float32)
            self.ids = np.arange(n_rows, dtype=np.int32)
        else:
            self.label = frame['label'].to_numpy().astype(np.float32)
            self.ids = None

        del frame, seq_matrix
        gc.collect()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``(history, target, likes, views, label)`` for row ``idx``."""
        row = (self.history, self.target, self.likes, self.views, self.label)
        return tuple(column[idx] for column in row)
# ... (Le reste du code reste inchang√©) ...
  

# ... (Le reste du code reste inchang√©) ...
     
# ==================================================================================
# 5. DCN MODEL (Utilise EMBED_DIM = 128)
# ==================================================================================
class DCN_DIN(nn.Module):
    """CTR model: attention over the item history feeding cross + deep towers.

    The target item attends over the user's history; the attended interest
    vector is concatenated with the target embedding and the likes/views
    embeddings, then passed through a ``CrossNet`` and an MLP in parallel whose
    outputs are concatenated and projected to one logit.

    Args:
        pretrained_weights: ``(num_items, embed_dim)`` tensor used to
            initialise the (trainable) item embedding table; row 0 is the
            padding row.
    """

    def __init__(self, pretrained_weights):
        super().__init__()
        num_items, embed_dim = pretrained_weights.shape

        # Item table starts from the pretrained vectors and stays trainable.
        self.item_embedding = nn.Embedding(num_items, embed_dim, padding_idx=0)
        self.item_embedding.weight.data.copy_(pretrained_weights)
        self.item_embedding.weight.requires_grad = True

        # Side features use 20-row tables, so their ids must lie in [0, 19].
        self.likes_embedding = nn.Embedding(20, config.SIDE_EMBED_DIM)
        self.views_embedding = nn.Embedding(20, config.SIDE_EMBED_DIM)

        # Attention input: [target, history, target - history, target * history].
        att_input_dim = embed_dim * 4
        self.att_mlp = nn.Sequential(
            nn.Linear(att_input_dim, 80),
            nn.Sigmoid(),
            nn.Linear(80, 40),
            nn.Sigmoid(),
            nn.Linear(40, 1)
        )

        # Tower input: target + interest vectors plus the two side embeddings.
        self.input_dim = embed_dim * 2 + config.SIDE_EMBED_DIM * 2
        self.cross_net = CrossNet(self.input_dim, layer_num=config.CROSS_LAYERS)

        deep_layers = []
        curr_dim = self.input_dim
        for hidden in config.HIDDEN_UNITS:
            deep_layers.append(nn.Linear(curr_dim, hidden))
            deep_layers.append(Dice(hidden))
            deep_layers.append(nn.Dropout(config.DROPOUT))
            curr_dim = hidden
        self.deep_net = nn.Sequential(*deep_layers)

        self.final_linear = nn.Linear(curr_dim + self.input_dim, 1)

    def attention(self, target, history, mask):
        """Softmax-weighted sum of ``history`` conditioned on ``target``.

        Args:
            target: ``(batch, 1, dim)`` target embedding.
            history: ``(batch, seq_len, dim)`` history embeddings.
            mask: ``(batch, seq_len)``; positions equal to 0/False are excluded.

        Returns:
            ``(batch, dim)`` interest vector.
        """
        seq_len = history.size(1)
        target_tile = target.expand(-1, seq_len, -1)
        inp = torch.cat([target_tile, history, target_tile - history, target_tile * history], dim=-1)
        # Masked positions get a very negative score so softmax ignores them.
        scores = self.att_mlp(inp).masked_fill(mask.unsqueeze(-1) == 0, -1e9)
        return (torch.softmax(scores, dim=1) * history).sum(dim=1)

    def forward(self, history, target, likes, views):
        """Return one logit per sample, always with shape ``(batch,)``.

        Args:
            history: ``(batch, seq_len)`` item indices, 0 meaning padding.
            target: ``(batch,)`` item indices.
            likes: ``(batch,)`` indices into the likes table.
            views: ``(batch,)`` indices into the views table.
        """
        hist_emb = self.item_embedding(history)
        target_emb = self.item_embedding(target).unsqueeze(1)

        mask = (history != 0)
        user_interest = self.attention(target_emb, hist_emb, mask)

        features = torch.cat([
            target_emb.squeeze(1),
            user_interest,
            self.likes_embedding(likes),
            self.views_embedding(views)
        ], dim=1)

        cross_out = self.cross_net(features)
        deep_out = self.deep_net(features)
        stack = torch.cat([cross_out, deep_out], dim=1)
        # squeeze(-1), not squeeze(): a batch of one must stay 1-D so the loss
        # and the prediction collection keep working on a trailing batch of 1.
        return self.final_linear(stack).squeeze(-1)
# ==================================================================================
# 6. TRAINING (uses the train.parquet and valid (1).parquet files)
# ==================================================================================
def train_engine():
    """Train ``DCN_DIN`` and checkpoint the best epoch by validation ROC-AUC.

    Reads the train/valid splits from ``config.DATA_DIR``, trains for up to
    ``config.EPOCHS`` epochs, and saves the state dict to
    ``<config.MODEL_SAVE_DIR>/dcn_best.pt`` every time the validation AUC
    improves. Training stops early after 3 consecutive epochs without
    improvement.

    Returns:
        The model in its state after the last executed epoch (which is not
        necessarily the best checkpoint).
    """
    # Paths match the files of the 'embedingsctr' dataset folder
    train_ds = RichDataset(os.path.join(config.DATA_DIR, 'train.parquet'), ID_MAP)
    valid_ds = RichDataset(os.path.join(config.DATA_DIR, 'valid (1).parquet'), ID_MAP)

    train_loader = DataLoader(train_ds, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_ds, batch_size=config.BATCH_SIZE*2, shuffle=False)

    model = DCN_DIN(PRETRAINED_WEIGHTS).to(config.DEVICE)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=config.LR, weight_decay=1e-4)
    # mode='max' because the monitored quantity (validation AUC) should increase.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=1)

    best_auc = 0.0
    patience = 0  # consecutive epochs without a new best AUC

    print("\n STARTING TRAINING ")
    for epoch in range(config.EPOCHS):
        # ---- training pass ----
        model.train()
        total_loss = 0

        for history, target, likes, views, label in tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.EPOCHS}"):
            # Index tensors must be int64 for nn.Embedding lookups.
            history, target = history.to(config.DEVICE).long(), target.to(config.DEVICE).long()
            likes, views = likes.to(config.DEVICE).long(), views.to(config.DEVICE).long()
            label = label.to(config.DEVICE).float()

            optimizer.zero_grad()
            logits = model(history, target, likes, views)
            loss = criterion(logits, label)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        preds, labels = [], []
        with torch.no_grad():
            for history, target, likes, views, label in valid_loader:
                history, target = history.to(config.DEVICE).long(), target.to(config.DEVICE).long()
                likes, views = likes.to(config.DEVICE).long(), views.to(config.DEVICE).long()
                logits = model(history, target, likes, views)
                preds.extend(torch.sigmoid(logits).cpu().numpy())
                labels.extend(label.numpy())

        val_auc = roc_auc_score(labels, preds)
        # Read the LR before the scheduler step so the log shows the rate used this epoch.
        current_lr = optimizer.param_groups[0]['lr']
        print(f"üìä Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f} | Val AUC={val_auc:.4f} | LR={current_lr:.1e}")

        scheduler.step(val_auc)

        if val_auc > best_auc:
            # New best: checkpoint it and reset the early-stopping counter.
            best_auc = val_auc
            torch.save(model.state_dict(), os.path.join(config.MODEL_SAVE_DIR, 'dcn_best.pt'))
            print(f"    Best Model Saved! AUC: {best_auc:.4f}")
            patience = 0
        else:
            patience += 1
            if patience >= 3:
                print("    Early Stopping")
                break

    print(f"\n Training Complete! Best AUC: {best_auc:.4f}")
    return model

def generate_submission(model):
    """Score the test split and package the predictions for submission.

    Loads ``dcn_best.pt`` from ``config.MODEL_SAVE_DIR`` into ``model`` when
    that checkpoint exists, predicts click probabilities for
    ``test (1).parquet``, writes ``prediction.csv`` (columns ``ID``, ``Task1``,
    ``Task2``, ``Task1&2``) and zips it as ``submission_task2.zip``.

    Args:
        model: A ``DCN_DIN`` instance already placed on ``config.DEVICE``.

    Returns:
        Path of the zip archive.
    """
    print("\nüîÆ Generating Predictions...")
    # Path matches the files of the 'embedingsctr' dataset folder.
    test_file = os.path.join(config.DATA_DIR, 'test (1).parquet')
    dataset = RichDataset(test_file, ID_MAP, is_test=True)
    loader = DataLoader(dataset, batch_size=config.BATCH_SIZE*2, shuffle=False)

    checkpoint = os.path.join(config.MODEL_SAVE_DIR, 'dcn_best.pt')
    if os.path.exists(checkpoint):
        state = torch.load(checkpoint, map_location=config.DEVICE)
        model.load_state_dict(state)

    model.eval()

    scores = []
    with torch.no_grad():
        for batch in tqdm(loader):
            history, target, likes, views, _ = batch
            history, target, likes, views = (
                field.to(config.DEVICE).long()
                for field in (history, target, likes, views)
            )
            probabilities = torch.sigmoid(model(history, target, likes, views))
            scores.extend(probabilities.cpu().numpy())

    # Submission frame with the four required columns; the CTR prediction goes
    # into 'Task1&2' while 'Task1' and 'Task2' stay at 0.
    submission = pd.DataFrame({
        'ID': dataset.ids,
        'Task1': 0.0,
        'Task2': 0.0,
        'Task1&2': scores,
    })

    csv_path = os.path.join(config.PRED_SAVE_DIR, 'prediction.csv')
    submission.to_csv(csv_path, index=False, float_format='%.8f')

    zip_path = os.path.join(config.PRED_SAVE_DIR, 'submission_task2.zip')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.write(csv_path, arcname='prediction.csv')

    print(f" Soumission pr√™te: {zip_path}")
    print(f" CSV saved at: {csv_path}")
    return zip_path
# ==================================================================================
# 7. RUN EVERYTHING
# ==================================================================================
# Entry point of the training cell: train, then build the submission archive.
if __name__ == "__main__":
    print("="*70)
    print(" MM-CTR TASK 2 - DCN-DIN (avec vos embeddings 128D) STARTING")
    print("="*70)

    # Free whatever earlier cells left behind before allocating the model.
    gc.collect()
    if config.DEVICE == 'cuda':
        torch.cuda.empty_cache()

    try:
        model = train_engine()
        submission_path = generate_submission(model)

        print("\n" + "="*70)
        print(" ALL DONE!")
        print(f" T√©l√©chargez ce fichier et soumettez-le: {submission_path}")
        print("="*70)
    except Exception as e:
        print(f"\n‚ùå Une erreur s'est produite pendant l'entra√Ænement ou la pr√©diction : {e}")
        # Re-raise: printing only the message hides the traceback and lets the
        # cell finish as if the run had succeeded.
        raise

DEVICE: cuda, Embed Dim: 128
  Loading FUSED Embeddings (128D) from custom file...
 Combined Matrix Shape: (91719, 128)
 MM-CTR TASK 2 - DCN-DIN (avec vos embeddings 128D) STARTING
 Reading train.parquet...
 Reading valid (1).parquet...

 STARTING TRAINING 


Epoch 1/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 879/879 [01:07<00:00, 13.04it/s]


üìä Epoch 1: Loss=0.1661 | Val AUC=0.8108 | LR=1.0e-03
    Best Model Saved! AUC: 0.8108


Epoch 2/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 879/879 [01:06<00:00, 13.17it/s]


üìä Epoch 2: Loss=0.0384 | Val AUC=0.8517 | LR=1.0e-03
    Best Model Saved! AUC: 0.8517

 Training Complete! Best AUC: 0.8517

üîÆ Generating Predictions...
 Reading test (1).parquet...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47/47 [00:04<00:00, 10.62it/s]


 Soumission pr√™te: /kaggle/working/submission_task2.zip
 CSV saved at: /kaggle/working/prediction.csv

 ALL DONE!
 T√©l√©chargez ce fichier et soumettez-le: /kaggle/working/submission_task2.zip


#### 