## Installation & Setup


In [2]:
import os
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm
from google.colab import drive

# 1. Setup
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

BASE_DIR = "/content/drive/MyDrive/MMCTR_Project"
DATA_DIR = os.path.join(BASE_DIR, "data")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
if not os.path.exists(CHECKPOINT_DIR): os.makedirs(CHECKPOINT_DIR)

# Chemins des embeddings
TXT_EMB_PATH = os.path.join(DATA_DIR, "my_text_embeddings_full.npy")
IMG_EMB_PATH = os.path.join(DATA_DIR, "my_clip_embeddings.npy") # Ou ton fichier image existant

# 2. CRÉATION DU MAPPING (Vital pour > 0.90 AUC)
print("🔧 Construction du Mapping ID -> Index...")

# On charge tous les IDs uniques
df_feat = pd.read_parquet(os.path.join(DATA_DIR, "item_feature.parquet"), columns=['item_id'])
sorted_ids = df_feat['item_id'].sort_values().unique()

# Dictionnaire : ID Réel -> Index (0, 1, 2...)
# On ajoute +1 car l'index 0 sera réservé au "Padding" (vide)
id2idx = {id_val: idx + 1 for idx, id_val in enumerate(sorted_ids)}

# Fonction rapide de conversion
def map_ids_func(id_list):
    # Renvoie 0 si l'ID n'est pas trouvé (Padding)
    return [id2idx.get(i, 0) for i in id_list]

VOCAB_SIZE = len(id2idx) + 1
print(f"✅ Mapping terminé. Nombre d'items uniques : {VOCAB_SIZE}")

🔧 Construction du Mapping ID -> Index...
✅ Mapping terminé. Nombre d'items uniques : 91718


## Extraction Features VISUELLES (CLIP)

In [3]:
# --- 1. DATASET INTELLIGENT ---
class GrandmasterDataset(Dataset):
    def __init__(self, parquet_file, max_len=20):
        print(f"📂 Chargement de {parquet_file}...")
        path = os.path.join(DATA_DIR, parquet_file)
        df = pd.read_parquet(path)

        # Mapping des IDs Cibles
        print("   -> Mapping des Target IDs...")
        self.ids = np.array([id2idx.get(i, 0) for i in df['item_id'].values])

        # Mapping de l'Historique (C'est un peu long mais nécessaire)
        print("   -> Mapping de l'Historique...")
        raw_seq = np.stack(df['item_seq'].values)[:, :max_len]
        self.hist = np.zeros_like(raw_seq)

        # Boucle optimisée
        for i, row in enumerate(raw_seq):
            self.hist[i] = [id2idx.get(x, 0) for x in row]

        # Métadonnées
        self.likes = df['likes_level'].values if 'likes_level' in df else np.zeros(len(df), dtype=int)
        self.views = df['views_level'].values if 'views_level' in df else np.zeros(len(df), dtype=int)
        self.labels = df['label'].values if 'label' in df else np.zeros(len(df), dtype=float)

    def __len__(self): return len(self.ids)
    def __getitem__(self, idx):
        return {
            'hist_ids': torch.tensor(self.hist[idx], dtype=torch.long),
            'target_id': torch.tensor(self.ids[idx], dtype=torch.long),
            't_likes': torch.tensor(self.likes[idx], dtype=torch.long),
            't_views': torch.tensor(self.views[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }

# --- 2. ARCHITECTURE HYBRIDE ROBUSTE ---
class GrandmasterModel(nn.Module):
    def __init__(self, txt_path, img_path):
        super().__init__()

        # A. Embeddings Contenu (Gelés)
        # On ajoute une ligne de zéros au début pour le padding (Index 0)
        txt_raw = np.load(txt_path)
        img_raw = np.load(img_path)

        # Padding manuel pour aligner avec le vocabulaire (index 0 = vide)
        txt_pad = np.vstack([np.zeros((1, txt_raw.shape[1])), txt_raw])
        img_pad = np.vstack([np.zeros((1, img_raw.shape[1])), img_raw])

        self.txt_emb = nn.Embedding.from_pretrained(torch.tensor(txt_pad, dtype=torch.float32), freeze=True)
        self.img_emb = nn.Embedding.from_pretrained(torch.tensor(img_pad, dtype=torch.float32), freeze=True)

        # B. Embeddings Apprenables
        self.id_emb = nn.Embedding(VOCAB_SIZE, 32, padding_idx=0)
        self.meta_emb = nn.Embedding(20, 8, padding_idx=0)

        # C. Fusion & Projection (Avec DROPOUT fort)
        # Input = 384(Txt) + 512(Img) + 32(ID) + 8(Like) + 8(View) = 944
        self.fusion_dim = 384 + 512 + 32 + 16
        self.projector = nn.Sequential(
            nn.Linear(self.fusion_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5) # 🔥 Anti-Overfitting
        )

        # D. DIN Attention
        self.att_net = nn.Sequential(
            nn.Linear(256 * 3, 128),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

        # E. Deep Tower
        self.deep = nn.Sequential(
            nn.Linear(256 * 2, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 1)
        )
        self.sigmoid = nn.Sigmoid()

    def get_features(self, ids, likes, views):
        # Concaténation de tout
        x = torch.cat([
            self.txt_emb(ids),
            self.img_emb(ids),
            self.id_emb(ids),
            self.meta_emb(likes),
            self.meta_emb(views)
        ], dim=-1)

        # Projection
        return self.projector(x.view(-1, self.fusion_dim)).view(ids.shape[0], -1, 256)

    def forward(self, hist_ids, target_id, t_likes, t_views, **kwargs):
        # Target
        t_vec = self.get_features(target_id.unsqueeze(1), t_likes.unsqueeze(1), t_views.unsqueeze(1)).squeeze(1)

        # History (Meta à 0 pour l'historique)
        h_vec = self.get_features(hist_ids, torch.zeros_like(hist_ids), torch.zeros_like(hist_ids))

        # Attention Mechanism
        mask = (hist_ids == 0)
        t_exp = t_vec.unsqueeze(1).expand(-1, h_vec.size(1), -1)

        att_input = torch.cat([t_exp, h_vec, t_exp * h_vec], dim=-1)
        att_score = self.att_net(att_input).squeeze(-1)
        att_score = att_score.masked_fill(mask, -1e9)
        att_score = torch.softmax(att_score, dim=1).unsqueeze(-1)

        user_vec = torch.sum(h_vec * att_score, dim=1)

        # Final
        out = torch.cat([user_vec, t_vec], dim=1)
        return self.sigmoid(self.deep(out))

## Extraction Features TEXTUELLES (LLM)

In [4]:
def train_grandmaster(epochs=30):
    print("\n🚀 Démarrage Entraînement GRANDMASTER...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"   -> Device: {device}")

    BATCH_SIZE = 1024

    # Chargement
    train_ds = GrandmasterDataset("train.parquet")
    valid_ds = GrandmasterDataset("valid.parquet")

    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE*2, shuffle=False)

    # Initialisation
    model = GrandmasterModel(TXT_EMB_PATH, IMG_EMB_PATH).to(device)

    # Optimiseur avec régularisation (Weight Decay)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)
    criterion = nn.BCELoss()

    best_auc = 0

    for ep in range(epochs):
        model.train()
        total_loss = 0
        pbar = tqdm(train_dl, desc=f"Epoch {ep+1}/{epochs}")

        for batch in pbar:
            inputs = {k:v.to(device) for k,v in batch.items() if k != 'label'}
            label = batch['label'].to(device).unsqueeze(1)

            optimizer.zero_grad()
            output = model(**inputs)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})

        # Validation
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for batch in valid_dl:
                inputs = {k:v.to(device) for k,v in batch.items() if k != 'label'}
                output = model(**inputs)
                preds.extend(output.cpu().numpy().flatten())
                targets.extend(batch['label'].numpy().flatten())

        auc = roc_auc_score(targets, preds)
        avg_loss = total_loss / len(train_dl)

        print(f"🏁 Ep {ep+1} | Loss: {avg_loss:.4f} | Val AUC: {auc:.5f}")

        scheduler.step(auc)

        # Sauvegarde
        if auc > best_auc:
            best_auc = auc
            torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, "grandmaster_model.pth"))
            print(f"🔥 RECORD ! Modèle sauvegardé (AUC: {auc:.5f})")

    print(f"\n🏆 Entraînement terminé. Meilleur AUC: {best_auc:.5f}")

# LANCEZ LA BÊTE !
train_grandmaster()


🚀 Démarrage Entraînement GRANDMASTER...
   -> Device: cuda
📂 Chargement de train.parquet...
   -> Mapping des Target IDs...
   -> Mapping de l'Historique...
📂 Chargement de valid.parquet...
   -> Mapping des Target IDs...
   -> Mapping de l'Historique...


Epoch 1/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 1 | Loss: 0.3765 | Val AUC: 0.60927
🔥 RECORD ! Modèle sauvegardé (AUC: 0.60927)


Epoch 2/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 2 | Loss: 0.2115 | Val AUC: 0.61950
🔥 RECORD ! Modèle sauvegardé (AUC: 0.61950)


Epoch 3/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 3 | Loss: 0.1707 | Val AUC: 0.64258
🔥 RECORD ! Modèle sauvegardé (AUC: 0.64258)


Epoch 4/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 4 | Loss: 0.1471 | Val AUC: 0.66271
🔥 RECORD ! Modèle sauvegardé (AUC: 0.66271)


Epoch 5/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 5 | Loss: 0.1292 | Val AUC: 0.67757
🔥 RECORD ! Modèle sauvegardé (AUC: 0.67757)


Epoch 6/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 6 | Loss: 0.1153 | Val AUC: 0.69333
🔥 RECORD ! Modèle sauvegardé (AUC: 0.69333)


Epoch 7/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 7 | Loss: 0.1042 | Val AUC: 0.70793
🔥 RECORD ! Modèle sauvegardé (AUC: 0.70793)


Epoch 8/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 8 | Loss: 0.0949 | Val AUC: 0.71648
🔥 RECORD ! Modèle sauvegardé (AUC: 0.71648)


Epoch 9/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 9 | Loss: 0.0876 | Val AUC: 0.72833
🔥 RECORD ! Modèle sauvegardé (AUC: 0.72833)


Epoch 10/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 10 | Loss: 0.0810 | Val AUC: 0.73778
🔥 RECORD ! Modèle sauvegardé (AUC: 0.73778)


Epoch 11/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 11 | Loss: 0.0757 | Val AUC: 0.73954
🔥 RECORD ! Modèle sauvegardé (AUC: 0.73954)


Epoch 12/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 12 | Loss: 0.0711 | Val AUC: 0.75304
🔥 RECORD ! Modèle sauvegardé (AUC: 0.75304)


Epoch 13/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 13 | Loss: 0.0675 | Val AUC: 0.75561
🔥 RECORD ! Modèle sauvegardé (AUC: 0.75561)


Epoch 14/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 14 | Loss: 0.0639 | Val AUC: 0.76215
🔥 RECORD ! Modèle sauvegardé (AUC: 0.76215)


Epoch 15/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 15 | Loss: 0.0611 | Val AUC: 0.76858
🔥 RECORD ! Modèle sauvegardé (AUC: 0.76858)


Epoch 16/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 16 | Loss: 0.0586 | Val AUC: 0.77259
🔥 RECORD ! Modèle sauvegardé (AUC: 0.77259)


Epoch 17/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 17 | Loss: 0.0564 | Val AUC: 0.77364
🔥 RECORD ! Modèle sauvegardé (AUC: 0.77364)


Epoch 18/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 18 | Loss: 0.0543 | Val AUC: 0.77679
🔥 RECORD ! Modèle sauvegardé (AUC: 0.77679)


Epoch 19/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 19 | Loss: 0.0524 | Val AUC: 0.77994
🔥 RECORD ! Modèle sauvegardé (AUC: 0.77994)


Epoch 20/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 20 | Loss: 0.0509 | Val AUC: 0.78395
🔥 RECORD ! Modèle sauvegardé (AUC: 0.78395)


Epoch 21/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 21 | Loss: 0.0496 | Val AUC: 0.79498
🔥 RECORD ! Modèle sauvegardé (AUC: 0.79498)


Epoch 22/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 22 | Loss: 0.0481 | Val AUC: 0.79321


Epoch 23/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 23 | Loss: 0.0470 | Val AUC: 0.79734
🔥 RECORD ! Modèle sauvegardé (AUC: 0.79734)


Epoch 24/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 24 | Loss: 0.0461 | Val AUC: 0.79626


Epoch 25/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 25 | Loss: 0.0451 | Val AUC: 0.80468
🔥 RECORD ! Modèle sauvegardé (AUC: 0.80468)


Epoch 26/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 26 | Loss: 0.0442 | Val AUC: 0.80883
🔥 RECORD ! Modèle sauvegardé (AUC: 0.80883)


Epoch 27/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 27 | Loss: 0.0432 | Val AUC: 0.81440
🔥 RECORD ! Modèle sauvegardé (AUC: 0.81440)


Epoch 28/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 28 | Loss: 0.0423 | Val AUC: 0.81129


Epoch 29/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 29 | Loss: 0.0416 | Val AUC: 0.81454
🔥 RECORD ! Modèle sauvegardé (AUC: 0.81454)


Epoch 30/30:   0%|          | 0/3516 [00:00<?, ?it/s]

🏁 Ep 30 | Loss: 0.0413 | Val AUC: 0.81242

🏆 Entraînement terminé. Meilleur AUC: 0.81454


## Dataset & Modèle "CHAMPION" (Architecture Hybride)


In [5]:
def predict_and_submit():
    print("🔮 Génération des prédictions finales...")
    device = torch.device("cuda")

    # Dataset de Test (utilise aussi le mapping !)
    test_ds = GrandmasterDataset("test.parquet")
    test_dl = DataLoader(test_ds, batch_size=2048, shuffle=False)

    # Chargement Modèle
    model = GrandmasterModel(TXT_EMB_PATH, IMG_EMB_PATH).to(device)
    model.load_state_dict(torch.load(os.path.join(CHECKPOINT_DIR, "grandmaster_model.pth")))
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in tqdm(test_dl, desc="Predicting"):
            inputs = {k:v.to(device) for k,v in batch.items() if k != 'label'}
            output = model(**inputs)
            preds.extend(output.cpu().numpy().flatten())

    # Création CSV
    df_test = pd.read_parquet(os.path.join(DATA_DIR, "test.parquet"), columns=['ID'])

    submission = pd.DataFrame({
        'ID': df_test['ID'],
        'Task1': preds,
        'Task2': preds,
        'Task1&2': preds
    })

    out_file = "submission_grandmaster.csv"
    submission.to_csv(out_file, index=False)
    print(f"✅ Fichier prêt : {out_file}")

    from google.colab import files
    files.download(out_file)

predict_and_submit()

🔮 Génération des prédictions finales...
📂 Chargement de test.parquet...
   -> Mapping des Target IDs...
   -> Mapping de l'Historique...


Predicting:   0%|          | 0/186 [00:00<?, ?it/s]

✅ Fichier prêt : submission_grandmaster.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>