In [3]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-1.13.1+cpu.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-1.13.1+cpu.html


In [4]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from sentence_transformers import SentenceTransformer
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [5]:
df = pd.read_csv("/content/drive/MyDrive/sarra/ML/new_train_top_k_reduced/mimic_lllm_v2.csv")
df.fillna("unknown", inplace=True)

In [6]:
cat_cols = ["GENDER", "LANGUAGE", "INSURANCE", "RELIGION", "MARITAL_STATUS", "ETHNICITY", "Maladie_chronique"]
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [7]:
df["age"] = pd.to_numeric(df["age"], errors="coerce").fillna(0)
df["age"] = (df["age"] - df["age"].mean()) / df["age"].std()

In [8]:
top_k = 60
drug_counts = Counter(df["DRUG"])
top_k_drugs = set([drug for drug, _ in drug_counts.most_common(top_k)])
df = df[df["DRUG"].isin(top_k_drugs)].reset_index(drop=True)

drug_encoder = LabelEncoder()
df["DRUG"] = drug_encoder.fit_transform(df["DRUG"].astype(str))

In [9]:
#####################################
# 2. Textual embeddings
#####################################
st_model = SentenceTransformer('all-MiniLM-L6-v2')
df['texte_combine'] = df["Symptômes"] + " " + df["Allergies"] + " " + df["Traitement_régulier"]
patient_text_embeddings = st_model.encode(df['texte_combine'].tolist())

patient_features = df[["age"] + cat_cols].values
combined_patient_features = np.concatenate([patient_features, patient_text_embeddings], axis=1)
scaler = StandardScaler()
combined_patient_features = scaler.fit_transform(combined_patient_features)
feature_dim = combined_patient_features.shape[1]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
#####################################
# 3. Build graph with 8 relations
#####################################
node_map = {}
node_counter = 0
def get_node_id(ntype, raw_id):
    global node_counter
    key = f"{ntype}_{raw_id}"
    if key not in node_map:
        node_map[key] = node_counter
        node_counter += 1
    return node_map[key]

for pid in df["SUBJECT_ID"].unique():
    get_node_id("patient", pid)

edge_src, edge_dst, edge_time, edge_attr, edge_names = [], [], [], [], []
labels_list = []
target_nodes = []

In [11]:
def build_note(row):
    return f"{row['Symptômes']} | {row['Allergies']} | {row['Traitement_régulier']}"

In [12]:
for i, row in df.iterrows():
    pid = get_node_id("patient", row["SUBJECT_ID"])
    aid = get_node_id("admission", row["HADM_ID"])
    did = get_node_id("diagnosis", row["CD9_CODE_DIAGNOSIS"])
    prid = get_node_id("procedure", row["ICD-9- PROCEDURES"])
    mid = get_node_id("medication", row["DRUG"])
    note_text = build_note(row)
    nid = get_node_id("note", note_text)

    t = pd.to_datetime(row["ADMITTIME"]).timestamp()

    relations = [
        ('patient', 'HAS_ADMISSION', 'admission', row["SUBJECT_ID"], row["HADM_ID"]),
        ('patient', 'HAS_DIAGNOSIS', 'diagnosis', row["SUBJECT_ID"], row["CD9_CODE_DIAGNOSIS"]),
        ('patient', 'UNDERWENT_PROCEDURE', 'procedure', row["SUBJECT_ID"], row["ICD-9- PROCEDURES"]),
        ('patient', 'PRESCRIBED_MEDICATION', 'medication', row["SUBJECT_ID"], row["DRUG"]),
        ('admission', 'ASSOCIATED_DIAGNOSIS', 'diagnosis', row["HADM_ID"], row["CD9_CODE_DIAGNOSIS"]),
        ('admission', 'ASSOCIATED_PROCEDURE', 'procedure', row["HADM_ID"], row["ICD-9- PROCEDURES"]),
        ('admission', 'ASSOCIATED_MEDICATION', 'medication', row["HADM_ID"], row["DRUG"]),
        ('admission', 'HAS_NOTE', 'note', row["HADM_ID"], note_text)
    ]

    for src_type, _, tgt_type, src_val, tgt_val in relations:
        s = get_node_id(src_type, src_val)
        d = get_node_id(tgt_type, tgt_val)
        edge_src.extend([s, d])  # Add bidirectional edges
        edge_dst.extend([d, s])
        edge_time.extend([t, t])
        edge_attr.extend([[1], [1]])
        edge_names.extend([f"{src_type}_to_{tgt_type}", f"{tgt_type}_to_{src_type}"])

    labels_list.append(row["DRUG"])
    target_nodes.append(aid)

print("Nombre de nœuds :", node_counter)
print("Nombre d'arcs :", len(edge_src))

Nombre de nœuds : 15037
Nombre d'arcs : 136224


In [13]:
#####################################
# 4. Features for patient nodes
#####################################
x = torch.zeros((node_counter, feature_dim))
for i, row in df.iterrows():
    key = f"patient_{row['SUBJECT_ID']}"
    pid = node_map[key]
    if i == df.index[df["SUBJECT_ID"] == row["SUBJECT_ID"]][0]:
        x[pid] = torch.tensor(combined_patient_features[i], dtype=torch.float)

In [14]:
#####################################
# 5. Build PyG Data object
#####################################
data = Data(
    x=x,
    edge_index=torch.tensor([edge_src, edge_dst], dtype=torch.long),
    edge_attr=torch.tensor(edge_attr, dtype=torch.float)
)
data.edge_names = edge_names

In [15]:
#####################################
# 6. Define GAT model
#####################################
class GATNet(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4, dropout=0.6):
        super(GATNet, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1, concat=False, dropout=dropout)
        self.lin = nn.Linear(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = self.lin(x)
        return x

In [16]:
#####################################
# 7. Classification Preparation
#####################################
target_nodes_tensor = torch.tensor(target_nodes, dtype=torch.long)
y_tensor = torch.tensor(labels_list, dtype=torch.long)
train_idx, test_idx = train_test_split(
    np.arange(len(target_nodes)), test_size=0.2, random_state=42, stratify=y_tensor.numpy()
)
train_idx = torch.tensor(train_idx, dtype=torch.long)
test_idx = torch.tensor(test_idx, dtype=torch.long)

# Class weights
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_tensor.numpy()), y=y_tensor.numpy())
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
target_nodes_tensor = target_nodes_tensor.to(device)
y_tensor = y_tensor.to(device)
train_idx = train_idx.to(device)
test_idx = test_idx.to(device)
class_weights_tensor = class_weights_tensor.to(device)

In [17]:
#####################################
# 8. Train the model
#####################################
model = GATNet(in_channels=feature_dim, hidden_channels=128, out_channels=len(torch.unique(y_tensor))).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

num_epochs = 400
best_loss = float('inf')
patience = 50
trigger_times = 0

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[target_nodes_tensor[train_idx]], y_tensor[train_idx])
    loss.backward()
    optimizer.step()
    scheduler.step()

    if loss.item() < best_loss:
        best_loss = loss.item()
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

    if epoch % 10 == 0:
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index)
            _, pred = out[target_nodes_tensor[test_idx]].max(dim=1)
            acc = (pred == y_tensor[test_idx]).sum().item() / test_idx.size(0)
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}, Test Accuracy: {acc*100:.2f}%")


Epoch 10, Loss: 4.0833, Test Accuracy: 3.46%
Epoch 20, Loss: 4.0751, Test Accuracy: 3.05%
Epoch 30, Loss: 4.0241, Test Accuracy: 6.34%
Epoch 40, Loss: 4.0162, Test Accuracy: 16.97%
Epoch 50, Loss: 3.8833, Test Accuracy: 13.33%
Epoch 60, Loss: 3.8131, Test Accuracy: 14.91%
Epoch 70, Loss: 3.7380, Test Accuracy: 24.54%
Epoch 80, Loss: 3.7260, Test Accuracy: 27.19%
Epoch 90, Loss: 3.7613, Test Accuracy: 28.19%
Epoch 100, Loss: 3.4685, Test Accuracy: 24.90%
Epoch 110, Loss: 3.4671, Test Accuracy: 33.71%
Epoch 120, Loss: 3.2701, Test Accuracy: 42.22%
Epoch 130, Loss: 3.4308, Test Accuracy: 46.10%
Epoch 140, Loss: 3.1864, Test Accuracy: 47.56%
Epoch 150, Loss: 3.1456, Test Accuracy: 51.73%
Epoch 160, Loss: 3.0585, Test Accuracy: 54.84%
Epoch 170, Loss: 3.0795, Test Accuracy: 64.24%
Epoch 180, Loss: 3.0508, Test Accuracy: 66.24%
Epoch 190, Loss: 3.0590, Test Accuracy: 65.00%
Epoch 200, Loss: 3.0522, Test Accuracy: 64.36%
Epoch 210, Loss: 3.0610, Test Accuracy: 67.35%
Epoch 220, Loss: 2.9024, 

In [18]:
#####################################
# 9. Final evaluation
#####################################
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    _, pred = out[target_nodes_tensor].max(dim=1)

y_true = y_tensor.cpu().numpy()
y_pred = pred.cpu().numpy()

print("\n=== Rapport de classification ===")
print(classification_report(y_true, y_pred))
print("Accuracy: {:.2f}%".format(accuracy_score(y_true, y_pred) * 100))
print("Precision: {:.2f}%".format(precision_score(y_true, y_pred, average='weighted') * 100))
print("Recall: {:.2f}%".format(recall_score(y_true, y_pred, average='weighted') * 100))
print("F1 Score: {:.2f}%".format(f1_score(y_true, y_pred, average='weighted') * 100))


=== Rapport de classification ===
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       314
           1       0.95      0.99      0.97       144
           2       0.69      1.00      0.82       134
           3       0.98      0.72      0.83       240
           4       0.61      1.00      0.76        79
           5       0.71      1.00      0.83       121
           6       0.15      1.00      0.27        49
           7       0.44      0.74      0.56        97
           8       1.00      0.66      0.80       121
           9       1.00      0.82      0.90       121
          10       1.00      1.00      1.00        91
          11       1.00      0.87      0.93       445
          12       0.49      1.00      0.66        61
          13       0.87      1.00      0.93        45
          14       0.56      1.00      0.72        44
          15       0.76      1.00      0.87       136
          16       0.90      1.00      0.95   

In [19]:
# Sauvegarder le modèle après entraînement
torch.save(model.state_dict(), "best_gat_model.pth")
print("✅ Modèle sauvegardé sous best_gat_model.pth")


✅ Modèle sauvegardé sous best_gat_model.pth


In [24]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

# === INPUT UTILISATEUR ===
age_utilisateur = 29
sexe = "1"
symptomes = "fatigue, vision floue, soif excessive"
maladie_chronique = "; - DM2 (Diabetes Mellitus type 2); - ESRD (End-Stage Renal Disease); - Sarcoidosis; - COPD (Chronic Obstructive Pulmonary Disease); - CHF (Congestive Heart Failure)"
allergies = "aucune"
traitement_regulier = "metformine, insuline"

# === 1. Encoder les données démographiques ===
demo_vect = np.zeros((1, data.x.shape[1] - 384))  # shape = (1, nb_features_démographiques)

# Normaliser l'âge
mean_age = df['age'].mean()
std_age = df['age'].std()
demo_vect[0, 0] = (age_utilisateur - mean_age) / std_age

# Encoder les variables catégorielles si besoin
demo_vect[0, 1] = label_encoders["GENDER"].transform([sexe])[0]
demo_vect[0, 2] = label_encoders["Maladie_chronique"].transform([maladie_chronique])[0]
# Les autres features démographiques sont à 0

# === 2. Encoder le texte contextuel ===
texte_contextuel = f"Symptômes: {symptomes}. Allergies: {allergies}. Traitement régulier: {traitement_regulier}."
model_st = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')  # Charger sur CPU pour éviter erreurs CUDA
embedding = model_st.encode([texte_contextuel])  # (1, 384)

# === 3. Fusion vecteur final (demo + text)
final_vector = np.concatenate([demo_vect, embedding], axis=1)
new_x = torch.cat([data.x.cpu(), torch.tensor(final_vector, dtype=torch.float)], dim=0)
new_index = new_x.shape[0] - 1  # nouvel index

# === 4. Prédiction
model.eval()
with torch.no_grad():
    output = model(new_x.to(device), data.edge_index.to(device))
    pred_class = torch.argmax(output[new_index]).item()
    predicted_drug = drug_encoder.inverse_transform([pred_class])[0]

print(f"💊 Médicament recommandé : {predicted_drug}")


💊 Médicament recommandé : Metoprolol


In [27]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

# === INPUT UTILISATEUR ===
age_utilisateur = 50
sexe = "0"
symptomes = "fatigue, vision floue, soif excessive"
maladie_chronique = "CAD (Coronary Artery Disease), HTN (Hypertension), DM2 (Diabetes Mellitus type 2)"
allergies = "allergie aux bêta-bloquants"
traitement_regulier = "metformine, insuline"

# === 1. Encoder les données démographiques ===
demo_vect = np.zeros((1, data.x.shape[1] - 384))  # shape = (1, nb_features_démographiques)

# Normaliser l'âge
mean_age = df['age'].mean()
std_age = df['age'].std()
demo_vect[0, 0] = (age_utilisateur - mean_age) / std_age

# Encoder les variables catégorielles si besoin
demo_vect[0, 1] = label_encoders["GENDER"].transform([sexe])[0]
demo_vect[0, 2] = label_encoders["Maladie_chronique"].transform([maladie_chronique])[0]
# Les autres features démographiques sont à 0

# === 2. Encoder le texte contextuel ===
texte_contextuel = f"Symptômes: {symptomes}. Allergies: {allergies}. Traitement régulier: {traitement_regulier}."
model_st = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')  # Charger sur CPU pour éviter erreurs CUDA
embedding = model_st.encode([texte_contextuel])  # (1, 384)

# === 3. Fusion vecteur final (demo + text)
final_vector = np.concatenate([demo_vect, embedding], axis=1)
new_x = torch.cat([data.x.cpu(), torch.tensor(final_vector, dtype=torch.float)], dim=0)
new_index = new_x.shape[0] - 1  # nouvel index

# === 4. Prédiction
model.eval()
with torch.no_grad():
    output = model(new_x.to(device), data.edge_index.to(device))
    pred_class = torch.argmax(output[new_index]).item()
    predicted_drug = drug_encoder.inverse_transform([pred_class])[0]

print(f"💊 Médicament recommandé : {predicted_drug}")

ValueError: y contains previously unseen labels: 'CAD (Coronary Artery Disease), HTN (Hypertension), DM2 (Diabetes Mellitus type 2)'

In [22]:
# Affiche toutes les maladies contenant "DM2"
for val in label_encoders["Maladie_chronique"].classes_:
    if "dm2" in val.lower():
        print(f"✔️ Copie EXACTEMENT :\n{val}\n")

✔️ Copie EXACTEMENT :
3V CAD, CHF - EF 35%, afib chronique, DM2, PVD s/p pontage L [**Doctor Last Name **]-->DP en [**2168**], récente découverte d'un cancer du côlon s/p colectomie R le [**2173-8-19**].

✔️ Copie EXACTEMENT :
; - CAD (Coronary Artery Disease) s/p CABG (Coronary Artery Bypass Grafting); - HTN (Hypertension); - DM2 (Type 2 Diabetes Mellitus); - CVA (Cerebrovascular Accident); - Atrial fibrillation on coumadin; - H/O stroke; - Bioprosthetic MVR (Mitral Valve Replacement); - LBBB (Left Bundle Branch Block); - L retinal detachment, vitreous hemorrhage; - CRI (Chronic Renal Insufficiency); - CHF (Congestive Heart Failure); - Anemia; - UTIs (Urinary Tract Infections); - Vascular dementia; - CKD (Chronic Kidney Disease)

✔️ Copie EXACTEMENT :
; - CAD (Coronary Artery Disease); - AAA (Abdominal Aortic Aneurysm); - DM2 (Diabetes Mellitus Type 2); - Cryptogenic cirrhosis; - Pancytopenia; - CRI (Chronic Renal Insufficiency); - HIT (Heparin-Induced Thrombocytopenia)

✔️ Copie EXAC

In [None]:
print("Valeurs valides pour GENDER :", label_encoders["GENDER"].classes_)
print("Valeurs valides pour Maladie_chronique :", label_encoders["Maladie_chronique"].classes_)


In [None]:
import openai
import os
from dotenv import load_dotenv
load_dotenv()
# Charger le modèle
model.load_state_dict(torch.load("best_gat_model.pth"))
model.eval()

# Choisir un exemple (ex: premier patient du dataset)
example_idx = 100
with torch.no_grad():
    out = model(data.x, data.edge_index)
    _, pred = out[target_nodes_tensor].max(dim=1)

predicted_class_id = pred[example_idx].item()
predicted_drug = drug_encoder.inverse_transform([predicted_class_id])[0]

# Extraire les infos du patient
patient_row = df.iloc[example_idx]
prompt = f"""
Voici les informations patient :
- Âge : {int(patient_row['age'] * df['age'].std() + df['age'].mean())}
- Sexe : {label_encoders['GENDER'].inverse_transform([patient_row['GENDER']])[0]}
- Symptômes : {patient_row['Symptômes']}
- Maladie chronique : {label_encoders['Maladie_chronique'].inverse_transform([patient_row['Maladie_chronique']])[0]}
- Allergies : {patient_row['Allergies']}
- Traitement régulier : {patient_row['Traitement_régulier']}

Le système de recommandation a suggéré : {predicted_drug}

Explique pourquoi ce médicament est adapté à ce patient.
"""

# Appel GPT
client = openai.OpenAI(api_key="OPENAI_API_KEY")  # Remplace par ta vraie clé API ici

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "Tu es un assistant médical expert."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.7
)

# Affichage
print("\n🧠 Réponse du LLM :\n")
print(response.choices[0].message.content)
