# Chargement des librairies

In [None]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import kagglehub

# Chargement du modèle ClinicalBERT

In [None]:
# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "emilyalsentzer/Bio_ClinicalBERT"


# Tokenizer and model are loaded from the same identifier so that the token ids
# produced by the tokenizer match what the model was trained with.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

In [None]:
# Download the Kaggle dataset; the returned value is used as the directory that
# contains the CSV file.
path = kagglehub.dataset_download("tboyle10/medicaltranscriptions")
csv_path = os.path.join(path, "mtsamples.csv")

# Read the CSV, keep only the text column and its specialty label, and drop
# every row where either of the two is missing.
df = (
    pd.read_csv(csv_path)
    .loc[:, ["transcription", "medical_specialty"]]
    .dropna()
)

# Exploration des données

In [None]:
# Display the ten most frequent values of the `medical_specialty` column with
# their row counts (value_counts sorts by descending count by default).
df["medical_specialty"].value_counts().head(10)

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


# Tokenisation

In [None]:
# Tokenize the transcription of the first row as a single worked example.
sample_text = df.iloc[0]["transcription"]
inputs = tokenizer(
    sample_text,
    # Cut the text if it produces more than `max_length` tokens.
    truncation=True,
    # Pad shorter texts up to `max_length` so the output has a fixed length.
    padding="max_length",
    max_length=256,
    # Return PyTorch tensors so the result can be unpacked straight into the model.
    return_tensors="pt"
)

label
0    4627
1     372
Name: count, dtype: int64

# Extraction des embeddings

In [None]:
# Run the model on the tokenized sample with gradient tracking disabled:
# this is inference only, so no backward pass is needed.
with torch.no_grad():
    outputs = bert_model(**inputs)


# Select index 0 along the second axis for every item of the first axis.
# NOTE(review): assumes the axes are (batch, token, hidden) and that position 0
# is the [CLS] token, as is usual for BERT tokenizers; confirm.
last_hidden_state = outputs.last_hidden_state
cls_embedding = last_hidden_state[:, 0, :]

# Visualisation simple des embeddings

In [None]:
# Embed a small sample of transcriptions, project the position-0 vectors to 2-D
# with PCA, and plot them coloured by medical specialty.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


embeddings = []
labels = []

# `head(100)` returns at most 100 rows, so a DataFrame shorter than 100 rows no
# longer raises IndexError the way positional `df.iloc[i]` over range(100) did.
sample = df.head(100)

for text, label in zip(sample["transcription"], sample["medical_specialty"]):
    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # One text per call, so the first axis has size 1; squeeze(0) removes exactly
    # that axis and nothing else.
    emb = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()
    embeddings.append(emb)
    labels.append(label)


# Reduce each embedding to two principal components for plotting.
X = PCA(n_components=2).fit_transform(embeddings)


plt.figure(figsize=(8,6))
# One scatter series per specialty (in first-seen order) so that each specialty
# gets its own colour and legend entry; previously `labels` was collected but
# never shown, which made the plot impossible to interpret.
for specialty in dict.fromkeys(labels):
    members = [j for j, current in enumerate(labels) if current == specialty]
    plt.scatter(X[members, 0], X[members, 1], label=specialty)
plt.title("Projection PCA des embeddings cliniques")
plt.legend(fontsize="small")
plt.show()