In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import time

# Config Hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Working on {device}")

def load_criteo_sample(filepath='data/criteo/train.txt', num_rows=1000000):
    print(f"‚è≥ Chargement de {num_rows} lignes depuis {filepath}...")
    
    cols = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
    
    # Lecture (on pr√©cise header=None car le fichier n'a pas d'en-t√™te)
    df = pd.read_csv(filepath, sep='\t', names=cols, nrows=num_rows, header=None)
    
    # --- PREPROCESSING NUMERIQUE (BLIND√â) ---
    print("‚öôÔ∏è Nettoyage Num√©rique (Force Numeric + Log + Scaling)...")
    
    # 1. Force la conversion en nombres (transforme les erreurs/strings en NaN)
    # C'est l'√©tape qui manquait : elle nettoie les donn√©es corrompues
    for col in [f'I{i}' for i in range(1, 14)]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # 2. R√©cup√©ration des valeurs + Remplissage des trous
    x_num = df[[f'I{i}' for i in range(1, 14)]].fillna(0).values.astype(np.float32)
    
    # 3. Log transformation (log1p est plus s√ªr que log)
    x_num = np.log1p(x_num)
    
    # 4. SAFETY CHECK : On remplace les Infinis √©ventuels par 0
    x_num = np.nan_to_num(x_num)
    
    # 5. Scaling
    scaler = StandardScaler()
    x_num = scaler.fit_transform(x_num)
    
    # --- PREPROCESSING CATEGORIEL ---
    print("‚öôÔ∏è Nettoyage Cat√©goriel (Hashing Trick)...")
    x_cat = np.zeros((len(df), 26), dtype=np.int64)
    VOCAB_SIZE = 20000 
    
    for i, col in enumerate([f'C{i}' for i in range(1, 27)]):
        # On convertit tout en string avant de hasher pour √©viter les bugs de type
        df[col] = df[col].fillna("missing").astype(str).apply(lambda x: hash(x) % VOCAB_SIZE)
        x_cat[:, i] = df[col].values
        
    y = df['label'].values.astype(np.float32)
    
    print(f"‚úÖ Donn√©es pr√™tes et propres : {len(y)} √©chantillons.")
    return x_num, x_cat, y, VOCAB_SIZE

# Dataset PyTorch
class CriteoDataset(Dataset):
    def __init__(self, x_num, x_cat, y):
        self.x_num = torch.tensor(x_num, dtype=torch.float32)
        self.x_cat = torch.tensor(x_cat, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.x_num[idx], self.x_cat[idx], self.y[idx]

Working on cpu


In [3]:
# --- A. TON DSN (Spectral Interaction) ---
from SBN import MultiBasisBilinearLayer

class CriteoDSN(nn.Module):
    def __init__(self, vocab_size, embed_dim=16):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        # Input Size = 13 (Num) + 26 * Embed_Dim (Cat)
        self.input_dim = 13 + (26 * embed_dim)
        
        # Le Cerveau Spectral
        self.dsn = MultiBasisBilinearLayer(self.input_dim, 1, num_bases=1)
        
    def forward(self, x_num, x_cat):
        # Flatten des embeddings : [Batch, 26 * 16]
        embs = self.emb(x_cat).view(x_cat.size(0), -1)
        # Concat√©nation Totale
        x = torch.cat([x_num, embs], dim=1)
        return self.dsn(x).squeeze(-1)

# --- B. LE MLP (Baseline Standard) ---
class CriteoMLP(nn.Module):
    def __init__(self, vocab_size, embed_dim=16):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.input_dim = 13 + (26 * embed_dim)
        
        # Tour MLP Classique (ReLU)
        self.mlp = nn.Sequential(
            nn.Linear(self.input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
        
    def forward(self, x_num, x_cat):
        embs = self.emb(x_cat).view(x_cat.size(0), -1)
        x = torch.cat([x_num, embs], dim=1)
        return self.mlp(x).squeeze(-1)

In [8]:
import time
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
import numpy as np

def train_eval(model, train_loader, val_loader, name="Model"):
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)
    criterion = nn.BCEWithLogitsLoss()
    
    print(f"\nü•ä D√©marrage Entra√Ænement : {name}")
    print("-" * 50)
    
    for epoch in range(10): 
        t0 = time.time()
        model.train()
        total_loss = 0
        total_grad_norm = 0.0 # Variable pour cumuler les normes
        
        for x_n, x_c, y in train_loader:
            x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x_n, x_c)
            loss = criterion(logits, y)
            loss.backward()
            
            # --- CALCUL DE LA NORME DU GRADIENT ---
            # On calcule la norme L2 de tous les gradients concat√©n√©s
            batch_norm = 0.0
            for p in model.parameters():
                if p.grad is not None:
                    param_norm = p.grad.detach().data.norm(2)
                    batch_norm += param_norm.item() ** 2
            batch_norm = batch_norm ** 0.5
            
            total_grad_norm += batch_norm
            # --------------------------------------
            
            optimizer.step()
            total_loss += loss.item()
            
        # Moyennes pour l'√©poque
        avg_loss = total_loss / len(train_loader)
        avg_grad_norm = total_grad_norm / len(train_loader)

        # Validation
        model.eval()
        all_preds, all_y = [], []
        with torch.no_grad():
            for x_n, x_c, y in val_loader:
                x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
                logits = model(x_n, x_c)
                all_preds.append(torch.sigmoid(logits).cpu().numpy())
                all_y.append(y.cpu().numpy())
                
        auc = roc_auc_score(np.concatenate(all_y), np.concatenate(all_preds))
        dt = time.time() - t0
        
        # Affichage avec la Norme du Gradient
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | VAL AUC: {auc:.4f} | Grad Norm: {avg_grad_norm:.4f} | Time: {dt:.1f}s")

In [4]:
# --- EXECUTION DU DUEL ---

# 1. Chargement (Adapte le chemin 'train.txt' si besoin)
x_num, x_cat, y, vocab_size = load_criteo_sample('data/criteo/train.txt', num_rows=1000000)

# 2. Split Train/Val (80/20)
split = int(0.8 * len(y))
train_ds = CriteoDataset(x_num[:split], x_cat[:split], y[:split])
val_ds = CriteoDataset(x_num[split:], x_cat[split:], y[split:])

train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1024)

‚è≥ Chargement de 1000000 lignes depuis data/criteo/train.txt...
‚öôÔ∏è Nettoyage Num√©rique (Force Numeric + Log + Scaling)...


  x_num = np.log1p(x_num)
  x_num = np.log1p(x_num)


‚öôÔ∏è Nettoyage Cat√©goriel (Hashing Trick)...
‚úÖ Donn√©es pr√™tes et propres : 1000000 √©chantillons.


In [51]:
# 3. FIGHT !
dsn = CriteoDSN(vocab_size).to(device)
mlp = CriteoMLP(vocab_size).to(device)

train_eval(mlp, train_loader, val_loader, name="Baseline MLP")
train_eval(dsn, train_loader, val_loader, name="Ton DSN (Spectral)")


ü•ä D√©marrage Entra√Ænement : Baseline MLP
--------------------------------------------------
Epoch 1 | Loss: 0.4933 | VAL AUC: 0.7610 | Grad Norm: 0.2130 | Time: 9.4s
Epoch 2 | Loss: 0.4767 | VAL AUC: 0.7678 | Grad Norm: 0.1615 | Time: 10.0s
Epoch 3 | Loss: 0.4682 | VAL AUC: 0.7718 | Grad Norm: 0.1494 | Time: 13.7s
Epoch 4 | Loss: 0.4615 | VAL AUC: 0.7737 | Grad Norm: 0.1423 | Time: 11.4s
Epoch 5 | Loss: 0.4554 | VAL AUC: 0.7743 | Grad Norm: 0.1380 | Time: 9.7s
Epoch 6 | Loss: 0.4497 | VAL AUC: 0.7760 | Grad Norm: 0.1366 | Time: 10.1s
Epoch 7 | Loss: 0.4437 | VAL AUC: 0.7756 | Grad Norm: 0.1379 | Time: 8.7s
Epoch 8 | Loss: 0.4385 | VAL AUC: 0.7742 | Grad Norm: 0.1393 | Time: 8.8s
Epoch 9 | Loss: 0.4335 | VAL AUC: 0.7737 | Grad Norm: 0.1427 | Time: 8.9s
Epoch 10 | Loss: 0.4277 | VAL AUC: 0.7713 | Grad Norm: 0.1461 | Time: 8.7s

ü•ä D√©marrage Entra√Ænement : Ton DSN (Spectral)
--------------------------------------------------
Epoch 1 | Loss: 0.4948 | VAL AUC: 0.7595 | Grad Norm: 0

KeyboardInterrupt: 

In [5]:
import torch
import torch.nn as nn
import torch.nn.init as init

# --- 1. COUCHE SPECTRALE VECTORIS√âE (Optimis√©e Vitesse) ---
class VectorizedSpectralLayer(nn.Module):
    def __init__(self, in_features, out_features, num_bases=2): # Bases r√©duites √† 2 pour la vitesse
        super().__init__()
        self.num_bases = num_bases
        
        # On utilise une seule matrice de poids [Bases, In, In] pour √©viter les boucles Python lentes
        # Mais pour rester simple et compatible, on garde les Linear mais on r√©duit la dimension
        self.right_bases = nn.ModuleList([nn.Linear(in_features, 32) for _ in range(num_bases)]) # Projection Low-Rank
        self.left_bases = nn.ModuleList([nn.Linear(in_features, 32) for _ in range(num_bases)])
        
        self.eigen_weights = nn.Linear(32 * num_bases, out_features)
        
    def forward(self, x):
        # x: [Batch, In_Features]
        
        all_interactions = []
        for i in range(self.num_bases):
            # Interaction dans un espace r√©duit (32 dim) pour aller tr√®s vite
            r = self.right_bases[i](x)
            l = self.left_bases[i](x)
            all_interactions.append(r * l) # Interaction Bilin√©aire
            
        # [Batch, 32 * Num_Bases]
        combined = torch.cat(all_interactions, dim=-1)
        return self.eigen_weights(combined)

# --- 2. ARCHITECTURE "SPECTRAL CROSS" ---
class CriteoSpectralCross(nn.Module):
    def __init__(self, vocab_size, embed_dim=16):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.num_proj = nn.Linear(13, 13 * embed_dim)
        
        # 39 champs * 16 dim = 624 dimensions
        self.total_dim = (13 + 26) * embed_dim
        
        # --- L'INNOVATION ---
        # Au lieu d'un simple Linear(624, 256), on met ton Spectral Layer
        # Il force le croisement des donn√©es AVANT de r√©duire la dimension
        self.spectral_layer = VectorizedSpectralLayer(self.total_dim, 256, num_bases=2)
        self.bn1 = nn.BatchNorm1d(256)
        
        # Finition MLP classique pour la d√©cision
        self.final_mlp = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
        
    def forward(self, x_num, x_cat):
        batch_size = x_num.size(0)
        
        # 1. Prepare Inputs
        num_vecs = self.num_proj(x_num).view(batch_size, -1) # [Batch, 13*16]
        cat_vecs = self.emb(x_cat).view(batch_size, -1)      # [Batch, 26*16]
        
        # 2. Global Concatenation (Comme le MLP)
        # C'est crucial pour la vitesse : on ne fait qu'une seule grosse op√©ration
        x = torch.cat([num_vecs, cat_vecs], dim=1) # [Batch, 624]
        
        # 3. Spectral Interaction & Compression
        x = self.spectral_layer(x) # [Batch, 256]
        x = self.bn1(x)
        
        # 4. D√©cision
        return self.final_mlp(x).squeeze(-1)

In [6]:
import time
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
import numpy as np

def train_eval(model, train_loader, val_loader, name="Model"):
    # CHANGEMENT 1 : Weight Decay ajust√© √† 1e-3 (Le juste milieu pour stabiliser sans tuer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-3)
    criterion = nn.BCEWithLogitsLoss()
    
    print(f"\nü•ä D√©marrage Entra√Ænement : {name}")
    print("-" * 50)
    
    for epoch in range(10): 
        t0 = time.time()
        model.train()
        total_loss = 0
        total_grad_norm = 0.0
        
        for x_n, x_c, y in train_loader:
            x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
            optimizer.zero_grad()
            
            logits = model(x_n, x_c)
            loss = criterion(logits, y)
            loss.backward()
            
            # CHANGEMENT 2 : Clipping strict √† 0.2
            # On force le DSN √† √™tre aussi stable que le MLP (qui est naturellement √† ~0.20)
            norm_tensor = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.2)
            
            total_grad_norm += norm_tensor.item()
            
            optimizer.step()
            total_loss += loss.item()
            
        # Moyennes
        avg_loss = total_loss / len(train_loader)
        avg_grad_norm = total_grad_norm / len(train_loader)

        # Validation
        model.eval()
        all_preds, all_y = [], []
        with torch.no_grad():
            for x_n, x_c, y in val_loader:
                x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
                logits = model(x_n, x_c)
                probs = torch.sigmoid(logits)
                all_preds.append(probs.cpu().numpy())
                all_y.append(y.cpu().numpy())
                
        auc = roc_auc_score(np.concatenate(all_y), np.concatenate(all_preds))
        dt = time.time() - t0
        
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | VAL AUC: {auc:.4f} | Grad Norm: {avg_grad_norm:.4f} | Time: {dt:.1f}s")

In [10]:
import torch
import torch.nn as nn
from SBN2 import DeepMultiBasisBilinearNet  # <--- On utilise TON fichier

class CriteoDeepWrapper(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, num_bases=4, rank_factor=0.5):
        super().__init__()

        # --- 1. Pr√©paration des Features (Standard) ---
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.num_proj = nn.Linear(13, 13 * embed_dim)

        # Dimension totale = (13 num√©riques + 26 cat√©gorielles) * 16
        input_dim = (13 + 26) * embed_dim  # 624

        # --- 2. TON BACKBONE (DeepMultiBasisBilinearNet) ---
        # Architecture : 624 -> 256 -> 128 -> 1
        # rank_factor contr√¥le la compression Low-Rank interne
        self.backbone = DeepMultiBasisBilinearNet(
            layers_dim=[input_dim, 32,16, 1],
            num_bases=num_bases,
            rank_factor=rank_factor,
            ortho_mode='cayley',      # Ou 'hard', 'cayley', None
            use_final_linear=False,  # Pour la couche de sortie (classification)
            use_layernorm=True,     # Pour la stabilit√©
            use_residual=False       # Pour la profondeur
        )

    def forward(self, x_num, x_cat):
        batch_size = x_num.size(0)

        # A. Vectorisation
        num_vecs = self.num_proj(x_num).view(batch_size, -1)
        cat_vecs = self.emb(x_cat).view(batch_size, -1)
        x = torch.cat([num_vecs, cat_vecs], dim=1)

        # B. Passage dans ton r√©seau
        logits = self.backbone(x)

        return logits.squeeze(-1)

# --- Instanciation ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:

model = CriteoDeepWrapper(
    vocab_size=20000,
    embed_dim=16,
    num_bases=2,
    rank_factor=0.1 # Rank = Dim * 0.5
).to(device)
# Tu peux lancer le training direct.

# Rappel MLP Score √† battre : ~0.7763
train_eval(model, train_loader, val_loader, name="Deep Low-Rank DSN")

# --- 3. RELANCE LE DUEL ---

VOCAB_SIZE = 20000 
# On garde les m√™mes hyperparam√®tres
model_spectral = CriteoSpectralCross(vocab_size=VOCAB_SIZE, embed_dim=16).to(device)
mlp_baseline = CriteoMLP(vocab_size=VOCAB_SIZE, embed_dim=16).to(device) # Ta classe MLP pr√©c√©dente

print("‚ö° Lancement MLP (Rappel)...")
train_eval(mlp_baseline, train_loader, val_loader, name="Baseline MLP")

print("\n‚ö° Lancement SPECTRAL CROSS (Optimis√©)...")
train_eval(model_spectral,train_loader, val_loader,  name="Spectral Cross DSN")


ü•ä D√©marrage Entra√Ænement : Deep Low-Rank DSN
--------------------------------------------------
Epoch 1 | Loss: 0.5124 | VAL AUC: 0.7524 | Grad Norm: 0.3497 | Time: 9.7s
Epoch 2 | Loss: 0.4802 | VAL AUC: 0.7620 | Grad Norm: 0.2645 | Time: 9.4s
Epoch 3 | Loss: 0.4707 | VAL AUC: 0.7671 | Grad Norm: 0.2783 | Time: 9.4s
Epoch 4 | Loss: 0.4626 | VAL AUC: 0.7685 | Grad Norm: 0.2820 | Time: 9.4s
Epoch 5 | Loss: 0.4552 | VAL AUC: 0.7676 | Grad Norm: 0.2998 | Time: 9.2s
Epoch 6 | Loss: 0.4472 | VAL AUC: 0.7652 | Grad Norm: 0.3180 | Time: 9.6s


KeyboardInterrupt: 

In [22]:
import time
import torch
import torch.nn as nn
from SBN2 import DeepMultiBasisBilinearNet
from sklearn.metrics import roc_auc_score
import numpy as np

# Assure-toi d'utiliser le mod√®le 32-16 (le bon compromis)
model = CriteoDeepWrapper(vocab_size=20000, embed_dim=16, num_bases=4, rank_factor=0.25).to(device)

def train_eval(model, train_loader, val_loader, name="Cosine-Stabilized"):
    # 1. WEIGHT DECAY : On passe √† 2e-3. C'est la "gravit√©" qui emp√™che la norme de monter.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)#, weight_decay=2e-5)
    
    # 2. SCHEDULER COSINE : Il freine en continu. Pas de surprise.
    # T_max=10 correspond √† ton nombre d'√©poques.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    
    criterion = nn.BCEWithLogitsLoss()
    
    print(f"\nü•ä D√©marrage Entra√Ænement : {name}")
    print("-" * 50)
    
    for epoch in range(10): 
        t0 = time.time()
        model.train()
        total_loss = 0
        total_grad_norm = 0.0
        
        for x_n, x_c, y in train_loader:
            x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x_n, x_c)
            loss = criterion(logits, y)
            loss.backward()
            
            # 3. CLIPPING : Toujours √† 0.20. C'est notre garde-fou.
            norm_tensor = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.20)
            total_grad_norm += norm_tensor.item()
            
            optimizer.step()
            total_loss += loss.item()
        
        # Le Scheduler avance √† chaque √©poque (ind√©pendamment de l'AUC)
        scheduler.step()
            
        avg_loss = total_loss / len(train_loader)
        avg_grad_norm = total_grad_norm / len(train_loader)

        model.eval()
        all_preds, all_y = [], []
        with torch.no_grad():
            for x_n, x_c, y in val_loader:
                x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
                logits = model(x_n, x_c)
                probs = torch.sigmoid(logits)
                all_preds.append(probs.cpu().numpy())
                all_y.append(y.cpu().numpy())
                
        auc = roc_auc_score(np.concatenate(all_y), np.concatenate(all_preds))
        dt = time.time() - t0
        
        current_lr = optimizer.param_groups[0]['lr']
        
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | VAL AUC: {auc:.4f} | Norm: {avg_grad_norm:.4f} | LR: {current_lr:.1e} | Time: {dt:.1f}s")

In [23]:
import time
import torch
import torch.nn as nn
from SBN2 import DeepMultiBasisBilinearNet
from sklearn.metrics import roc_auc_score
import numpy as np

class CriteoDeepWrapper(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, num_bases=4, rank_factor=0.25):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        
        # 1. Normalisation d√®s l'entr√©e num√©rique (Batch Norm 1D sur les 13 features)
        self.input_norm = nn.BatchNorm1d(13)
        self.num_proj = nn.Linear(13, 13 * embed_dim)
        
        input_dim = (13 + 26) * embed_dim 

        self.backbone = DeepMultiBasisBilinearNet(
            layers_dim=[input_dim,32, 1],
            num_bases=num_bases,
            rank_factor=rank_factor,
            ortho_mode='cayley',   # On garde Cayley, c'est tr√®s bien pour W
            use_final_linear=False,
            use_layernorm=True,
            use_residual=False       
        )

    def forward(self, x_num, x_cat):
        # 2. Pr√©-traitement Robuste : Log + BatchNorm
        # On applique log(1+x) pour √©craser les ordres de grandeur (1000 -> 6.9)
        x_num = torch.log1p(x_num.clamp(min=0)) 
        x_num = self.input_norm(x_num)
        
        batch_size = x_num.size(0)
        num_vecs = self.num_proj(x_num).view(batch_size, -1)
        cat_vecs = self.emb(x_cat).view(batch_size, -1)
        x = torch.cat([num_vecs, cat_vecs], dim=1)
        
        logits = self.backbone(x)
        return logits.squeeze(-1)

# ... Instanciation ...
model = CriteoDeepWrapper(vocab_size=20000, embed_dim=16, num_bases=4, rank_factor=0.25).to(device)

def train_eval(model, train_loader, val_loader, name="Robust-Input-DSN"):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)#, weight_decay=1e-3)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    criterion = nn.BCEWithLogitsLoss()
    
    print(f"\nü•ä D√©marrage Entra√Ænement : {name}")
    print("-" * 50)
    
    for epoch in range(10): 
        t0 = time.time()
        model.train()
        total_loss = 0
        total_grad_norm = 0.0 # Juste pour l'affichage
        
        for x_n, x_c, y in train_loader:
            x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x_n, x_c)
            loss = criterion(logits, y)
            loss.backward()
            
            # --- CHANGEMENT CRUCIAL ICI ---
            # 3. On utilise clip_grad_VALUE_ au lieu de Norm.
            # Cela coupe les t√™tes qui d√©passent sans √©craser les petits gradients.
            torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.1)
            
            # (Optionnel) On calcule quand m√™me la norme pour voir si elle baisse
            # Mais on ne l'utilise plus pour clipper.
            with torch.no_grad():
                batch_norm = 0.0
                for p in model.parameters():
                    if p.grad is not None:
                        batch_norm += p.grad.norm(2).item() ** 2
                batch_norm = batch_norm ** 0.5
                total_grad_norm += batch_norm
            
            optimizer.step()
            total_loss += loss.item()
        
        scheduler.step()
        
        avg_loss = total_loss / len(train_loader)
        avg_grad_norm = total_grad_norm / len(train_loader)
        current_lr = optimizer.param_groups[0]['lr']

        # Validation...
        model.eval()
        all_preds, all_y = [], []
        with torch.no_grad():
            for x_n, x_c, y in val_loader:
                # IMPORTANT : Appliquer la m√™me transfo log en validation !
                # (Le mod√®le le fait dans le forward, donc c'est bon ici)
                x_n, x_c, y = x_n.to(device), x_c.to(device), y.to(device)
                logits = model(x_n, x_c)
                all_preds.append(torch.sigmoid(logits).cpu().numpy())
                all_y.append(y.cpu().numpy())
                
        auc = roc_auc_score(np.concatenate(all_y), np.concatenate(all_preds))
        dt = time.time() - t0
        
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | VAL AUC: {auc:.4f} | Norm: {avg_grad_norm:.4f} | LR: {current_lr:.1e} | Time: {dt:.1f}s")

In [24]:
# Rappel MLP Score √† battre : ~0.7763
train_eval(model, train_loader, val_loader, name="Deep Low-Rank DSN")


ü•ä D√©marrage Entra√Ænement : Deep Low-Rank DSN
--------------------------------------------------
Epoch 1 | Loss: 0.4977 | VAL AUC: 0.7566 | Norm: 0.2904 | LR: 2.0e-04 | Time: 57.2s
Epoch 2 | Loss: 0.4715 | VAL AUC: 0.7618 | Norm: 0.2771 | LR: 1.8e-04 | Time: 53.8s
Epoch 3 | Loss: 0.4556 | VAL AUC: 0.7596 | Norm: 0.3379 | LR: 1.6e-04 | Time: 53.9s
Epoch 4 | Loss: 0.4359 | VAL AUC: 0.7535 | Norm: 0.4248 | LR: 1.3e-04 | Time: 54.9s
Epoch 5 | Loss: 0.4127 | VAL AUC: 0.7432 | Norm: 0.5210 | LR: 1.0e-04 | Time: 192.4s
Epoch 6 | Loss: 0.3891 | VAL AUC: 0.7365 | Norm: 0.6103 | LR: 6.9e-05 | Time: 179.6s
Epoch 7 | Loss: 0.3681 | VAL AUC: 0.7272 | Norm: 0.6861 | LR: 4.1e-05 | Time: 168.4s
Epoch 8 | Loss: 0.3511 | VAL AUC: 0.7236 | Norm: 0.7429 | LR: 1.9e-05 | Time: 67.6s
Epoch 9 | Loss: 0.3392 | VAL AUC: 0.7201 | Norm: 0.7761 | LR: 4.9e-06 | Time: 68.0s
Epoch 10 | Loss: 0.3327 | VAL AUC: 0.7190 | Norm: 0.7866 | LR: 0.0e+00 | Time: 1151.9s
