In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG so that random weight initialisation and DataLoader
# shuffling are repeatable from a fresh kernel (same seed as the split below).
torch.manual_seed(42)

# Load the cleaned dataset.
# A forward slash is used on purpose: the previous 'data\dataset_cleaned.csv'
# contained the invalid escape sequence '\d' (Python emits a warning for it)
# and only resolved to the intended file on Windows.
data_cleaned = pd.read_csv('data/dataset_cleaned.csv')

# Step 1: encode the labels. 'Disease' is the target column; each distinct
# value is mapped to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data_cleaned['Disease'])

# Step 2: split the data into a training set and a test set.
X = data_cleaned['Symptoms']  # 'Symptoms' is the input text column
y = y_encoded  # encoded diseases

# 80/20 split with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist both splits in the working directory so they can be reloaded later.
joblib.dump(X_train, 'X_train.pkl')
joblib.dump(X_test, 'X_test.pkl')
joblib.dump(y_train, 'y_train.pkl')
joblib.dump(y_test, 'y_test.pkl')

# Load the BioBERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')


def _encode_texts(texts):
    """Tokenize every text on its own, padded/truncated to 128 tokens, as PyTorch tensors."""
    encodings = []
    for text in texts:
        encodings.append(
            tokenizer(text, padding='max_length', max_length=128, truncation=True, return_tensors='pt')
        )
    return encodings


def _to_tensor_dataset(encodings, labels):
    """Concatenate the per-text encodings (torch.cat, default dim 0) and pair them with int64 labels."""
    input_ids = torch.cat([enc['input_ids'] for enc in encodings])
    attention_mask = torch.cat([enc['attention_mask'] for enc in encodings])
    targets = torch.tensor(labels, dtype=torch.long)
    return TensorDataset(input_ids, attention_mask, targets)


# Tokenize the symptom texts of the training and test sets.
tokens_train = _encode_texts(X_train)
tokens_test = _encode_texts(X_test)

# Build the PyTorch datasets: (input_ids, attention_mask, label) triples.
train_dataset = _to_tensor_dataset(tokens_train, y_train)
test_dataset = _to_tensor_dataset(tokens_test, y_test)

# Batch loaders; only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Step 3: load BioBERT with a sequence-classification head.
# The head gets one output per class known to the label encoder.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1',
    num_labels=num_labels,
)
# Move the model to the selected device (GPU when available).
model.to(device)

# Step 4: define the optimizer and the loss function.
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

# Step 5: train the model
def train_model(model, train_loader, optimizer, loss_fn, epochs=3, log_every=1):
    """Train ``model`` on ``train_loader`` and print the loss as training progresses.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels=`` is passed).
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` used to compute the loss.
            Pass ``None`` to use the loss the model computes itself from ``labels``.
        epochs: Number of passes over ``train_loader``.
        log_every: Print the batch loss every ``log_every`` batches; ``0`` or
            ``None`` disables per-batch printing. The default prints every batch.

    Returns:
        list[float]: The mean batch loss of each epoch (``nan`` for an epoch in
        which the loader produced no batch).
    """
    # Take the device from the model itself so the function does not rely on a
    # module-level variable; fall back to the CPU for a parameter-less model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for step, batch in enumerate(train_loader, start=1):
            input_ids, attention_mask, labels = [x.to(run_device) for x in batch]
            labels = labels.long()  # class-index targets must be int64
            optimizer.zero_grad()  # clear gradients left over from the previous step
            if loss_fn is None:
                # Let the model compute its own loss from the labels.
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            else:
                # Honour the caller-supplied criterion instead of ignoring it.
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
            loss.backward()  # back-propagate
            optimizer.step()  # update the weights
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            if log_every and step % log_every == 0:
                print(f"Batch Loss: {batch_loss}")
        # The reported figure is the mean over batches, so it is labelled as such;
        # counting batches here also avoids dividing by zero on an empty loader.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Average Loss: {mean_loss}")
        epoch_losses.append(mean_loss)
    return epoch_losses

# Launch training: three passes over the training loader.
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=3,
)

# Step 6: evaluate the model
def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect predicted and true class ids.

    The model is switched to evaluation mode and left in that mode on return.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``; the predicted class is the argmax over
            the last dimension of the logits.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        tuple[list, list]: ``(all_preds, all_labels)``, two flat lists of NumPy
        integer scalars in loader order.
    """
    # Take the device from the model itself so the function does not rely on a
    # module-level variable; fall back to the CPU for a parameter-less model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():  # no gradients are needed for evaluation
        for input_ids, attention_mask, labels in test_loader:
            outputs = model(
                input_ids.to(run_device),
                attention_mask=attention_mask.to(run_device),
            )
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(predictions.cpu().numpy())
            # Labels are only collected, never fed to the model, so they are
            # not moved to the device first.
            all_labels.extend(labels.cpu().numpy())
    return all_preds, all_labels

# Get the predictions and the matching ground-truth labels for the test set.
y_pred, y_true = evaluate_model(model, test_loader)

# Show the per-class results.
from sklearn.metrics import classification_report

# Pass the full range of encoded class ids explicitly. Without `labels`, the
# report infers the classes from y_true/y_pred and raises a ValueError when
# that set is smaller than `target_names` — which can happen here because the
# test split is not stratified and may not contain every class.
all_label_ids = list(range(len(label_encoder.classes_)))
print(classification_report(
    y_true,
    y_pred,
    labels=all_label_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # classes with no samples/predictions score 0 without a warning
))




  data_cleaned = pd.read_csv('data\dataset_cleaned.csv')
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch Loss: 3.7364487648010254
Batch Loss: 3.919140338897705
Batch Loss: 3.786372184753418
Batch Loss: 3.7489748001098633
Batch Loss: 3.588350534439087
Batch Loss: 3.882540464401245
Batch Loss: 3.735654830932617
Batch Loss: 3.6168875694274902
Batch Loss: 3.846874952316284
Batch Loss: 3.7607717514038086
Batch Loss: 3.856377601623535
Batch Loss: 3.749178647994995
Batch Loss: 3.65640926361084
Batch Loss: 3.5020320415496826
Batch Loss: 3.7195613384246826
Batch Loss: 3.6311843395233154
Batch Loss: 3.9716429710388184
Batch Loss: 3.6655733585357666
Batch Loss: 3.75846004486084
Batch Loss: 3.532353401184082
Batch Loss: 3.4184277057647705
Batch Loss: 3.5307247638702393
Batch Loss: 3.7607669830322266
Batch Loss: 3.670769214630127
Batch Loss: 3.5704519748687744
Batch Loss: 3.379662036895752
Batch Loss: 3.3096632957458496
Batch Loss: 3.5416059494018555
Batch Loss: 3.5491385459899902
Batch Loss: 3.569098949432373
Batch Loss: 3.5530004501342773
Batch Loss: 3.710785388946533
Batch Loss: 3.40361070632

In [4]:
# Step 7: save the trained model (its state dict, i.e. the weights only).
torch.save(obj=model.state_dict(), f='model_biobert.pth')


In [5]:
# Rebuild the architecture, then load the fine-tuned weights saved earlier.
# NOTE: relies on `num_labels` and `device` defined in an earlier cell.
model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', num_labels=num_labels)
model.load_state_dict(
    torch.load(
        'model_biobert.pth',
        # Remap tensors to the current device so a checkpoint written on a GPU
        # machine can still be loaded on a CPU-only one.
        map_location=device,
        # The file only holds a state dict; restricting unpickling to tensors
        # avoids executing arbitrary pickled code.
        weights_only=True,
    )
)
model.to(device)  # move the model to the GPU when available
model.eval()  # evaluation mode (disables dropout)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('model_biobert.pth'))  # Charger les poids sauvegardés


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
import pandas as pd

# Forward slash on purpose: 'data\dataset_cleaned.csv' contained the invalid
# escape sequence '\d' (Python warns about it) and only worked on Windows.
data = pd.read_csv('data/dataset_cleaned.csv')

# Count the distinct values of the 'Disease' column.
num_diseases = data['Disease'].nunique()

print(f"Le nombre total de maladies dans le dataset est : {num_diseases}")

Le nombre total de maladies dans le dataset est : 41


  data=pd.read_csv('data\dataset_cleaned.csv')
