In [2]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9685 sha256=211a60b4c7b23ee460e87a1750d37c6fcfa721572d6bd665bf3534efa22c8669
  Stored in directory: /home/onyxia/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [1]:
!pip install pandas transformers

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86

In [3]:
import os
import json
import numpy as np

# Définir le dossier contenant les fichiers JSONL à modifier
dataset_folders = [
    "OpenLLMText_Human"
]

# Fonction pour ajouter les nouvelles métadonnées au champ "extra"
def enrich_data(file_path, text_source):
    updated_data = []
    
    # Charger le fichier JSONL ligne par ligne
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    num_samples = len(lines)  # Nombre d'exemples dans le fichier

    # Générer les nouvelles métadonnées
    enriched_metadata = {
        "text_source": [text_source] * num_samples,
        "time_spent_per_word": np.random.uniform(0.1, 0.5, num_samples),
        "num_deletions": np.random.randint(5, 20, num_samples),
        "num_rewrites": np.random.randint(3, 15, num_samples),
        "copy_paste_usage": np.random.uniform(0, 0.3, num_samples),
        "pauses": np.random.uniform(1, 10, num_samples),
        "sentence_reordering": np.random.uniform(0.1, 0.5, num_samples),
    }

    # Modifier chaque ligne du fichier JSONL
    for i, line in enumerate(lines):
        data = json.loads(line.strip())  # Convertir en dict
        data["extra"].update({  # Ajouter les nouvelles informations
            "text_source": enriched_metadata["text_source"][i],
            "time_spent_per_word": float(enriched_metadata["time_spent_per_word"][i]),
            "num_deletions": int(enriched_metadata["num_deletions"][i]),
            "num_rewrites": int(enriched_metadata["num_rewrites"][i]),
            "copy_paste_usage": float(enriched_metadata["copy_paste_usage"][i]),
            "pauses": float(enriched_metadata["pauses"][i]),
            "sentence_reordering": float(enriched_metadata["sentence_reordering"][i])
        })
        updated_data.append(data)

    # Sauvegarder le fichier modifié
    new_file_path = file_path.replace(".jsonl", "_enriched.jsonl")
    with open(new_file_path, "w", encoding="utf-8") as f:
        for entry in updated_data:
            f.write(json.dumps(entry) + "\n")

    print(f"Fichier enrichi sauvegardé : {new_file_path}")

# Appliquer la transformation à tous les fichiers JSONL
for folder in dataset_folders:
    for filename in os.listdir(folder):
        if filename.endswith(".jsonl"):
            file_path = os.path.join(folder, filename)
            enrich_data(file_path, text_source=folder.replace("OpenLLMText_", ""))  # Déduire la source du texte


Fichier enrichi sauvegardé : OpenLLMText_Human/valid-dirty_enriched.jsonl
Fichier enrichi sauvegardé : OpenLLMText_Human/valid-dirty_enriched_enriched.jsonl
Fichier enrichi sauvegardé : OpenLLMText_Human/train-dirty_enriched.jsonl
Fichier enrichi sauvegardé : OpenLLMText_Human/train-dirty_enriched_enriched.jsonl
Fichier enrichi sauvegardé : OpenLLMText_Human/test-dirty_enriched_enriched.jsonl
Fichier enrichi sauvegardé : OpenLLMText_Human/test-dirty_enriched.jsonl


In [7]:
import os
import json
import pandas as pd

# Dossiers contenant les fichiers enrichis
dataset_folders = {
    "Human": "OpenLLMText_Human",
    "ChatGPT": "OpenLLMText_ChatGPT",
}

data = []

# Charger uniquement Human et ChatGPT
for source, folder in dataset_folders.items():
    for filename in os.listdir(folder):
        if filename.endswith("_enriched.jsonl"):
            file_path = os.path.join(folder, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    entry = json.loads(line.strip())
                    text = entry["text"]
                    extra = entry["extra"]

                    # Créer un dictionnaire avec le texte, le label et les métadonnées séparées
                    data.append({
                        "text": text,
                        "label": 0 if source == "Human" else 1,  # 0 = Humain, 1 = IA
                        "meta_time_spent_per_word": extra["time_spent_per_word"],
                        "meta_num_deletions": extra["num_deletions"],
                        "meta_num_rewrites": extra["num_rewrites"],
                        "meta_copy_paste_usage": extra["copy_paste_usage"],
                        "meta_pauses": extra["pauses"],
                        "meta_sentence_reordering": extra["sentence_reordering"],
                    })

# Convertir en DataFrame
df = pd.DataFrame(data)

# Séparer les textes et les métadonnées
df_meta = df.filter(like="meta_").astype(float)  # Extraire uniquement les colonnes méta

print("✅ Données chargées avec séparation des métadonnées")
print(df.head())  # Vérifier le DataFrame complet
print(df_meta.head())  # Vérifier les métadonnées seules


✅ Données chargées avec séparation des métadonnées
                                                text  label  \
0  Wednesday, April 6th, 2016\n\n"It is shameful ...      0   
1  SAN FRANCISCO (BCN)— A civil lawsuit filed Wed...      0   
2  Automated wheel changer. Image: Rio Tinto.\n\n...      0   
3  A Washington Post reporter admitted that Presi...      0   
4  An a-scientific paper, poor contribution of NG...      0   

   meta_time_spent_per_word  meta_num_deletions  meta_num_rewrites  \
0                  0.142871                  14                 10   
1                  0.488851                   8                 10   
2                  0.260543                  14                 13   
3                  0.107372                  13                  3   
4                  0.161164                   8                 10   

   meta_copy_paste_usage  meta_pauses  meta_sentence_reordering  
0               0.199902     8.300962                  0.182648  
1               0

In [7]:
# Division en ensembles d'entraînement et de test
train_texts, test_texts, train_meta, test_meta, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df_meta.values, df["label"].values, test_size=0.2, random_state=42)

# Définition du Dataset
class TextMetaDataset(Dataset):
    def __init__(self, texts, meta, labels):
        self.texts = texts
        self.meta = torch.tensor(meta, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        return {"input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
                "meta": self.meta[idx],
                "label": self.labels[idx]}

# Charger les données
train_dataset = TextMetaDataset(train_texts, train_meta, train_labels)
test_dataset = TextMetaDataset(test_texts, test_meta, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Définition du modèle
class TextMetaClassifier(nn.Module):
    def __init__(self):
        super(TextMetaClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.meta_fc = nn.Sequential(
            nn.Linear(6, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU()
        )
        self.fc = nn.Linear(self.bert.config.hidden_size + 8, 1)
    
    def forward(self, input_ids, attention_mask, meta):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = bert_output.pooler_output
        meta_embedding = self.meta_fc(meta)
        combined = torch.cat((text_embedding, meta_embedding), dim=1)
        return torch.sigmoid(self.fc(combined))

# Initialisation du modèle
model = TextMetaClassifier()
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

def train_model(model, train_loader, optimizer, criterion, epochs=3):
    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            input_ids, attention_mask, meta, labels = batch["input_ids"], batch["attention_mask"], batch["meta"], batch["label"]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, meta).squeeze()
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")




In [18]:
# Division en ensembles d'entraînement et de test
train_texts, test_texts, train_meta, test_meta, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df_meta.values, df["label"].values, test_size=0.2, random_state=42)

#from transformers import BertTokenizer

# Définition du Dataset
class TextMetaDataset(Dataset):
    def __init__(self, texts, meta, labels):
        self.texts = texts
        self.meta = torch.tensor(meta, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Ajout du tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "meta": self.meta[idx],
            "label": self.labels[idx]
        }

# Charger les données
train_dataset = TextMetaDataset(train_texts, train_meta, train_labels)
test_dataset = TextMetaDataset(test_texts, test_meta, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Définition du modèle
class TextMetaClassifier(nn.Module):
    def __init__(self):
        super(TextMetaClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.meta_fc = nn.Sequential(
            nn.Linear(6, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU()
        )
        self.fc = nn.Linear(self.bert.config.hidden_size + 8, 1)
    
    def forward(self, input_ids, attention_mask, meta):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = bert_output.pooler_output
        meta_embedding = self.meta_fc(meta)
        combined = torch.cat((text_embedding, meta_embedding), dim=1)
        return torch.sigmoid(self.fc(combined))

# Initialisation du modèle
model = TextMetaClassifier()
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

def train_model(model, train_loader, optimizer, criterion, epochs=3):
    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            input_ids, attention_mask, meta, labels = batch["input_ids"], batch["attention_mask"], batch["meta"], batch["label"]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, meta).squeeze()
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")




In [17]:
for name, param in model.named_parameters():
    print(f"{name}: {param.device}")  # Doit afficher "cuda:0" ou "cpu" partout


bert.embeddings.word_embeddings.weight: cpu
bert.embeddings.position_embeddings.weight: cpu
bert.embeddings.token_type_embeddings.weight: cpu
bert.embeddings.LayerNorm.weight: cpu
bert.embeddings.LayerNorm.bias: cpu
bert.encoder.layer.0.attention.self.query.weight: cpu
bert.encoder.layer.0.attention.self.query.bias: cpu
bert.encoder.layer.0.attention.self.key.weight: cpu
bert.encoder.layer.0.attention.self.key.bias: cpu
bert.encoder.layer.0.attention.self.value.weight: cpu
bert.encoder.layer.0.attention.self.value.bias: cpu
bert.encoder.layer.0.attention.output.dense.weight: cpu
bert.encoder.layer.0.attention.output.dense.bias: cpu
bert.encoder.layer.0.attention.output.LayerNorm.weight: cpu
bert.encoder.layer.0.attention.output.LayerNorm.bias: cpu
bert.encoder.layer.0.intermediate.dense.weight: cpu
bert.encoder.layer.0.intermediate.dense.bias: cpu
bert.encoder.layer.0.output.dense.weight: cpu
bert.encoder.layer.0.output.dense.bias: cpu
bert.encoder.layer.0.output.LayerNorm.weight: cpu


In [12]:
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-uncased")
print("BERT chargé avec succès !")


BERT chargé avec succès !


In [18]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Définition du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Modèle chargé sur: {device}")

# Mise à jour de la batch size pour accélérer l'évaluation
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Fonction d'évaluation sans les métadonnées
def evaluate_model_without_meta(model, test_loader, criterion, max_batches=10):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            if i >= max_batches:
                break  # Arrêter l'évaluation après 'max_batches' batchs
            
            input_ids, attention_mask, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["label"].to(device),
            )

            with autocast():  # Accélère l'inférence sur GPU
                outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
                outputs = model.fc(outputs).squeeze()
                loss = criterion(outputs, labels.float())
            
            total_loss += loss.item()
            
            preds = (outputs >= 0.5).long()
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())
            
            print(f"Batch {i+1}/{max_batches} - Loss: {loss.item():.4f}")  # Suivi de l'évaluation
    
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()
    
    avg_loss = total_loss / max_batches
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")

    print(f"✅ Validation Loss (sans métadonnées): {avg_loss:.4f}")
    print(f"✅ Accuracy (sans métadonnées): {accuracy:.4f}")
    print(f"✅ Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Exécuter l'évaluation sans les métadonnées avec optimisation
evaluate_model_without_meta(model, test_loader, criterion, max_batches=10)


Modèle chargé sur: cuda


  with autocast():  # Accélère l'inférence sur GPU


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [19]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Définition du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Modèle chargé sur: {device}")

# Mise à jour de la batch size pour accélérer l'évaluation
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Fonction d'évaluation sans les métadonnées
def evaluate_model_without_meta(model, test_loader, criterion, max_batches=10):
    model.to(device)  # Assurer que le modèle est bien sur le bon device
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            if i >= max_batches:
                break  # Arrêter l'évaluation après 'max_batches' batchs
            
            input_ids, attention_mask, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["label"].to(device),
            )

            with autocast():  # Accélère l'inférence sur GPU
                outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output.to(device)
                outputs = model.fc(outputs).squeeze().to(device)
                loss = criterion(outputs, labels.float())
            
            total_loss += loss.item()
            
            preds = (outputs >= 0.5).long()
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())
            
            print(f"Batch {i+1}/{max_batches} - Loss: {loss.item():.4f}")  # Suivi de l'évaluation
    
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()
    
    avg_loss = total_loss / max_batches
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")

    print(f"✅ Validation Loss (sans métadonnées): {avg_loss:.4f}")
    print(f"✅ Accuracy (sans métadonnées): {accuracy:.4f}")
    print(f"✅ Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Exécuter l'évaluation sans les métadonnées avec optimisation
evaluate_model_without_meta(model, test_loader, criterion, max_batches=10)

Modèle chargé sur: cuda


  with autocast():  # Accélère l'inférence sur GPU


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x768 and 776x1)

In [None]:
# Réduction du dataset pour un entraînement rapide
df = df.sample(n=5000, random_state=42)
df_meta = df.filter(like="meta_").astype(float)

# Division en ensembles d'entraînement et de test
train_texts, test_texts, train_meta, test_meta, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df_meta.values, df["label"].values, test_size=0.2, random_state=42)

# Définition du Dataset
class TextMetaDataset(Dataset):
    def __init__(self, texts, meta, labels):
        self.texts = texts
        self.meta = torch.tensor(meta, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "meta": self.meta[idx],
            "label": self.labels[idx]
        }

# Charger les données
train_dataset = TextMetaDataset(train_texts, train_meta, train_labels)
test_dataset = TextMetaDataset(test_texts, test_meta, test_labels)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Définition du modèle
class TextMetaClassifier(nn.Module):
    def __init__(self):
        super(TextMetaClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.meta_fc = nn.Sequential(
            nn.Linear(6, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU()
        )
        self.fc = nn.Linear(self.bert.config.hidden_size + 8, 1)
    
    def forward(self, input_ids, attention_mask, meta):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = bert_output.pooler_output
        meta_embedding = self.meta_fc(meta)
        combined = torch.cat((text_embedding, meta_embedding), dim=1)
        return torch.sigmoid(self.fc(combined))

# Initialisation du modèle
model = TextMetaClassifier()
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Fonction de sauvegarde et chargement des checkpoints
def save_checkpoint(epoch, model, optimizer, loss, path="checkpoint.pth"):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)
    print(f"✅ Modèle sauvegardé après l'epoch {epoch}")

def train_model(model, train_loader, optimizer, criterion, epochs=3):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for i, batch in enumerate(train_loader):
            input_ids, attention_mask, meta, labels = batch["input_ids"], batch["attention_mask"], batch["meta"], batch["label"]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, meta).squeeze()
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            if i % 10 == 0:  # Affichage toutes les 10 itérations
                print(f"Epoch {epoch + 1}, Batch {i}, Loss: {loss.item()}")
        print(f"Epoch {epoch + 1} terminé, Loss Moyenne: {running_loss / len(train_loader)}")
        save_checkpoint(epoch, model, optimizer, running_loss / len(train_loader))

train_model(model, train_loader, optimizer, criterion)



Epoch 1, Batch 0, Loss: 0.6796258687973022


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Définition du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def evaluate_model(model, test_loader, criterion):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, meta, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["meta"].to(device),
                batch["label"].to(device),
            )

            outputs = model(input_ids, attention_mask, meta).squeeze()
            loss = criterion(outputs, labels.float())
            total_loss += loss.item()
            
            preds = (outputs >= 0.5).long()  # Seuil de classification
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")

    print(f"✅ Validation Loss: {avg_loss:.4f}")
    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"✅ Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Exécuter l'évaluation
evaluate_model(model, test_loader, criterion)


In [21]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Définition du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Réduction de l'échantillon de test pour accélérer l'évaluation
test_subset_size = min(100, len(test_loader.dataset))  # Limite à 100 échantillons max
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

# Fonction d'évaluation
def evaluate_model(model, test_loader, criterion, max_batches=250):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            if i >= max_batches:
                break  # Arrêter l'évaluation après 'max_batches' batchs
            
            input_ids, attention_mask, meta, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["meta"].to(device),
                batch["label"].to(device),
            )

            outputs = model(input_ids, attention_mask, meta).squeeze()
            loss = criterion(outputs, labels.float())
            
            total_loss += loss.item()
            
            preds = (outputs >= 0.5).long()
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())
            
            print(f"Batch {i+1}/{max_batches} - Loss: {loss.item():.4f}")  # Suivi de l'évaluation
    
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()
    
    avg_loss = total_loss / max_batches
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")

    print(f"✅ Validation Loss: {avg_loss:.4f}")
    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"✅ Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Exécuter l'évaluation avec un échantillon réduit
evaluate_model(model, test_loader, criterion, max_batches=250)


Batch 1/250 - Loss: 0.3057
Batch 2/250 - Loss: 0.0052
Batch 3/250 - Loss: 0.1583
Batch 4/250 - Loss: 0.0033
Batch 5/250 - Loss: 0.0024
Batch 6/250 - Loss: 0.0101
Batch 7/250 - Loss: 0.0220
Batch 8/250 - Loss: 0.0030
Batch 9/250 - Loss: 0.9458
Batch 10/250 - Loss: 0.0010
Batch 11/250 - Loss: 0.9965
Batch 12/250 - Loss: 0.0019
Batch 13/250 - Loss: 0.0006
Batch 14/250 - Loss: 0.0070
Batch 15/250 - Loss: 0.3241
Batch 16/250 - Loss: 0.0030
Batch 17/250 - Loss: 0.0106
Batch 18/250 - Loss: 0.0043
Batch 19/250 - Loss: 0.0010
Batch 20/250 - Loss: 0.0044
Batch 21/250 - Loss: 0.6886
Batch 22/250 - Loss: 0.0032
Batch 23/250 - Loss: 0.6866
Batch 24/250 - Loss: 0.2109
Batch 25/250 - Loss: 0.0673
Batch 26/250 - Loss: 0.0155
Batch 27/250 - Loss: 0.1304
Batch 28/250 - Loss: 0.0048
Batch 29/250 - Loss: 0.0223
Batch 30/250 - Loss: 0.0041
Batch 31/250 - Loss: 0.0031
Batch 32/250 - Loss: 0.0031
Batch 33/250 - Loss: 0.0009
Batch 34/250 - Loss: 0.0050
Batch 35/250 - Loss: 0.0016
Batch 36/250 - Loss: 0.0076
B

In [None]:
import torch

# Définition du device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Utilisation de {device}")

# Charger le modèle
model = TextMetaClassifier().to(device)  # Met le modèle sur le bon device
checkpoint = torch.load("checkpoint.pth", map_location=device)  # Charge sur le bon device
model.load_state_dict(checkpoint["model_state_dict"])

# Restaurer l'optimiseur sur le bon device
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# Déplacer les valeurs de l'optimiseur vers le bon device
for param_group in optimizer.param_groups:
    param_group['params'] = [p.to(device) for p in param_group['params']]

print(f"✅ Modèle restauré à l'epoch {checkpoint['epoch']} avec loss {checkpoint['loss']:.4f}")


In [None]:
import torch

# Charger le modèle
model = TextMetaClassifier()
checkpoint = torch.load("checkpoint.pth", map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.load_state_dict(checkpoint["model_state_dict"])

# Restaurer l'optimiseur
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

print(f"✅ Modèle restauré à l'epoch {checkpoint['epoch']} avec loss {checkpoint['loss']:.4f}")




In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [10]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [4]:
!pip install scikit-learn pandas transformers

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transfo