# Chargement PTB

In [44]:
import sys
import kagglehub
import pandas as pd
import os

In [45]:
# Download latest version
path = kagglehub.dataset_download("abhirampolisetti/ptb-diagnostic-ecg-database")

In [46]:
from transformers import AutoTokenizer, AutoModel

model_name = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)


In [47]:

# Download latest version
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")

csv_path = os.path.join(path, "mtsamples.csv")
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [48]:
df["label"] = df["medical_specialty"].apply(
    lambda x: 1 if "Cardio" in x or "Cardiovascular" in x else 0
)
df["label"].value_counts()

label
0    4627
1     372
Name: count, dtype: int64

In [49]:
df = df.dropna(subset=["transcription"])
df["text"] = df["transcription"].str.lower()

In [None]:
from datasets import Dataset

# On garde uniquement les colonnes utiles : texte et label
df_small = df[['transcription', 'medical_specialty']].copy()

# Optionnel : on fait un mapping des labels vers des IDs (0,1,2,...)
labels = df_small['medical_specialty'].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

df_small['label'] = df_small['medical_specialty'].map(label2id)

# Hugging Face Dataset
dataset = Dataset.from_pandas(df_small)

# Split train/test 20%
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
eval_dataset = dataset['test']


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def tokenize_function(examples):
    return tokenizer(examples["transcription"], truncation=True, padding="max_length", max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column("label", "labels")
eval_dataset = eval_dataset.rename_column("label", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 3972/3972 [00:01<00:00, 2812.85 examples/s]
Map: 100%|██████████| 994/994 [00:00<00:00, 2797.81 examples/s]


In [52]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=len(labels),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./clinicalbert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    
)


In [54]:
from transformers import Trainer
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    f1 = f1_score(labels, predictions, average="macro")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [55]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,2.500082,0.347082,0.040478
2,2.829700,2.188518,0.361167,0.070352
3,2.336200,2.046144,0.360161,0.076181
4,2.049000,1.975591,0.355131,0.091892
5,1.890500,1.959386,0.337022,0.095707


TrainOutput(global_step=2485, training_loss=2.1843717824525277, metrics={'train_runtime': 2301.7931, 'train_samples_per_second': 8.628, 'train_steps_per_second': 1.08, 'total_flos': 2613584195665920.0, 'train_loss': 2.1843717824525277, 'epoch': 5.0})