In [None]:
_Note préliminaire :_ Ce notebook a été exécuté avec l'environnement jupyter-pytorch-gpu. De nombreuses dépendances y sont déjà présentes et ne sont pas explicitement installées dans ce notebook. La cellule ci-dessous installe les dépendances supplémentaires. Du fait de l'installation de SentencePiece, il faut **impérativement redémarrer** le kernel Jupyter après l'exécution de cette cellule.

In [None]:
!pip install pandas numpy html2text torch transformers tqdm SentencePiece

# Entraînement et évaluation du modèle

Ce notebook détaille l'entraînement du modèle.

In [None]:
import pandas as pd
# Load the disruption records from the Feather file shipped in the data/ directory.
disruptions_path = "data/objects_disruptions.feather"
df_disruptions = pd.read_feather(disruptions_path)

In [None]:
from html2text import html2text

# Preprocessing: the model input is the title followed by the message, whose HTML
# markup is converted to plain text first.
plain_message = df_disruptions["message"].apply(html2text)
df_disruptions["text"] = df_disruptions["title"] + "\n\n" + plain_message

# Parse both timestamps; values that do not match the format become NaT (errors="coerce").
for bound in ("begin", "end"):
    df_disruptions[bound] = pd.to_datetime(df_disruptions[bound], format="%Y%m%dT%H%M%S", errors="coerce")

# Break the elapsed time down into whole days, then the hours and minutes left over
# within the last (incomplete) day.
elapsed = df_disruptions["end"] - df_disruptions["begin"]
leftover_seconds = elapsed.dt.seconds
df_disruptions["duration_days"] = elapsed.dt.days
df_disruptions["duration_hours"] = leftover_seconds // 3600  # remaining hours
df_disruptions["duration_minutes"] = (leftover_seconds % 3600) // 60  # remaining minutes

In [None]:
# Robust (outlier-tolerant) scaling of the three duration components.
def iqr_normalize(values):
    """Scale a numeric Series by its interquartile range.

    The result is ``(values - Q1) / (Q3 - Q1)``. When the two quartiles are equal
    the IQR is zero and the division would produce inf/NaN, so a scale of 1.0 is
    used instead (the values are then only shifted by Q1).

    Args:
        values: pandas Series of numbers.

    Returns:
        A tuple ``(normalized, quartiles, scale)`` where ``quartiles`` is the Series
        returned by ``values.quantile([0.25, 0.75])`` and ``scale`` is the divisor
        that was actually applied.
    """
    quartiles = values.quantile([0.25, 0.75])
    scale = quartiles[0.75] - quartiles[0.25]
    if scale == 0:
        scale = 1.0  # degenerate distribution: avoid dividing by zero
    return (values - quartiles[0.25]) / scale, quartiles, scale


# The quantiles_* / iqr_* names are kept so the scaling can be inverted later on.
df_disruptions["duration_days_normalized"], quantiles_days, iqr_days = iqr_normalize(
    df_disruptions["duration_days"]
)
df_disruptions["duration_hours_normalized"], quantiles_hours, iqr_hours = iqr_normalize(
    df_disruptions["duration_hours"]
)
df_disruptions["duration_minutes_normalized"], quantiles_minutes, iqr_minutes = iqr_normalize(
    df_disruptions["duration_minutes"]
)

In [None]:
# Build the modelling table: raw text, the three duration targets, and the categorical
# targets one-hot encoded (drop_first=True drops the first level of each category to
# save columns).
duration_columns = ["duration_days", "duration_hours", "duration_minutes"]
categorical_columns = ["cause", "severity", "object_type", "line_mode"]

df = pd.get_dummies(
    # Dates that failed to parse were coerced to NaT, which yields NaN durations.
    # A single NaN label turns the MSE loss into NaN, so those rows are discarded.
    df_disruptions[["text", *duration_columns, *categorical_columns]].dropna(subset=duration_columns),
    columns=categorical_columns,
    drop_first=True,
)

# Every column except the text is a prediction target; the durations come first.
targets = [k for k in df.columns if k != "text"]

# Cast all targets (integer durations and boolean dummies alike) to float so the
# label matrix has a single floating-point dtype.
df = df.astype({col: float for col in targets})

In [None]:
# Hold out 20% of the rows for evaluation.
test_size = 0.2
train_size = 1 - test_size

# Shuffle the rows with a fixed seed, then cut once at the train/test boundary.
df_shuffled = df.sample(frac=1, random_state=123456789).reset_index(drop=True)
split_index = int(len(df) * train_size)
train_df, test_df = df_shuffled.iloc[:split_index], df_shuffled.iloc[split_index:]

# Inputs (raw text as Python lists) and target matrices (NumPy arrays).
X_train, X_test = (part["text"].values.tolist() for part in (train_df, test_df))
y_train, y_test = (part[targets].values for part in (train_df, test_df))


In [None]:
from transformers import CamembertTokenizer

# Load the tokenizer matching the pretrained "camembert-base" checkpoint.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")


def tokenize_texts(texts, max_length=128):
    """Tokenize a list of strings into padded PyTorch tensors.

    Args:
        texts: List of input strings.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated. Defaults to 128.

    Returns:
        The tokenizer output with ``return_tensors="pt"``, padded to the longest
        sequence of ``texts``.
    """
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )


# Tokenize the training and test sets.
train_encodings = tokenize_texts(X_train)
test_encodings = tokenize_texts(X_test)

In [None]:
import torch
from torch.utils.data import Dataset

class MultiTaskDataset(Dataset):
    """Pairs tokenizer outputs with their target rows.

    Args:
        encodings: Mapping from field name to an indexable batch of values
            (one entry per sample along the first axis).
        labels: Indexable collection with one target row per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is dictated by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus a float32 ``"labels"`` tensor."""
        sample = {name: values[idx] for name, values in self.encodings.items()}
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Wrap the tokenized texts and their target rows for the training and test splits.
train_dataset = MultiTaskDataset(encodings=train_encodings, labels=y_train)
test_dataset = MultiTaskDataset(encodings=test_encodings, labels=y_test)

In [None]:
from torch.utils.data import DataLoader

# Mini-batches of 16 samples; only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

In [None]:
from transformers import CamembertModel
import torch.nn as nn

class CamembertForMultiTask(nn.Module):
    """Pretrained "camembert-base" encoder followed by one linear output layer.

    Args:
        num_outputs: Number of values produced per input text.
    """

    def __init__(self, num_outputs):
        super().__init__()
        self.model = CamembertModel.from_pretrained("camembert-base")
        hidden_size = self.model.config.hidden_size
        self.regression_head = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and project the first token's hidden state.

        Returns the raw outputs of the linear layer (no activation is applied).
        """
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis holds the classification token.
        first_token = encoded.last_hidden_state[:, 0]
        return self.regression_head(first_token)

# Instantiate the model with one output per target column. It is placed on the GPU
# when one is available and on the CPU otherwise, instead of failing on machines
# without CUDA.
model = CamembertForMultiTask(num_outputs=y_train.shape[1]).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

class MultiTaskLoss(nn.Module):
    """Reports an MSE term on the duration outputs and a BCE term on the binary ones.

    The first three columns (days, hours, minutes) are scored with ``nn.MSELoss``;
    all remaining columns are treated as logits and scored with
    ``nn.BCEWithLogitsLoss``.

    Note: ``forward`` returns plain Python floats (via ``.item()``), which carry no
    autograd graph. Use the ``mse`` / ``bce`` criteria directly when a
    differentiable loss is required.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()

    def forward(self, predictions, labels):
        """Return ``(mse, bce, mse + bce)`` as Python floats."""
        n_continuous = 3  # days, hours, minutes

        # Continuous targets come first, binary targets follow.
        regression_term = self.mse(predictions[:, :n_continuous], labels[:, :n_continuous])
        classification_term = self.bce(predictions[:, n_continuous:], labels[:, n_continuous:])
        combined = regression_term + classification_term

        return regression_term.item(), classification_term.item(), combined.item()

In [None]:
from torch.optim import AdamW

# Loss reporter, and AdamW over every parameter of the model (encoder and output
# layer) with a learning rate of 2e-5.
loss_fn = MultiTaskLoss()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
from tqdm import tqdm

import os

# Run on the GPU when available, otherwise on the CPU. The model is moved explicitly
# so that it always sits on the same device as the batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.save does not create missing directories.
os.makedirs("checkpoint", exist_ok=True)


def compute_losses(predictions, labels):
    """Return ``(mse, bce, total)`` as tensors attached to the autograd graph.

    Calling ``loss_fn(predictions, labels)`` yields Python floats, and wrapping a
    float in a new tensor creates a leaf with no link to the model, so calling
    ``backward()`` on it never updates any weight. The criteria held by
    ``loss_fn`` are therefore applied directly, with the same column split
    (first three columns: durations, remaining columns: binary logits).
    """
    mse_loss = loss_fn.mse(predictions[:, :3], labels[:, :3])
    bce_loss = loss_fn.bce(predictions[:, 3:], labels[:, 3:])
    return mse_loss, bce_loss, mse_loss + bce_loss


for epoch in range(7):
    # ---- Training pass ----
    model.train()
    train_loss = 0
    train_mse_loss = 0
    train_bce_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        predictions = model(input_ids, attention_mask)
        mse_loss, bce_loss, loss = compute_losses(predictions, labels)

        # Back-propagate through the real computation graph, then update the weights.
        loss.backward()
        optimizer.step()

        # Accumulate plain floats so the running sums do not keep graphs alive.
        train_loss += loss.item()
        train_mse_loss += mse_loss.item()
        train_bce_loss += bce_loss.item()

    print(f"Epoch {epoch + 1}, Training Loss: {train_loss / len(train_loader):.4f}")
    print(f"Epoch {epoch + 1}, Training MSE Loss: {train_mse_loss / len(train_loader):.4f}")
    print(f"Epoch {epoch + 1}, Training BCE Loss: {train_bce_loss / len(train_loader):.4f}")

    # One checkpoint per epoch (file names are indexed from 0).
    torch.save(model.state_dict(), "checkpoint/epoch_" + str(epoch) + ".pt")

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0
    val_mse_loss = 0
    val_bce_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            predictions = model(input_ids, attention_mask)
            mse_loss, bce_loss, loss = compute_losses(predictions, labels)

            val_loss += loss.item()
            val_mse_loss += mse_loss.item()
            val_bce_loss += bce_loss.item()

    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss / len(test_loader):.4f}")
    print(f"Epoch {epoch + 1}, Validation MSE Loss: {val_mse_loss / len(test_loader):.4f}")
    print(f"Epoch {epoch + 1}, Validation BCE Loss: {val_bce_loss / len(test_loader):.4f}")