In [None]:
# Install dependencies
#!pip install -r https://raw.githubusercontent.com/HeedfulMoss/DEEP_ML_Project/main/requirements.txt

In [None]:
# NLP Multi-label Classification Training Pipeline for ICD-9 Codes
# Using BERT (or Bio_ClinicalBERT) + HuggingFace Transformers

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers import DataCollatorWithPadding
import transformers
from transformers.training_args import TrainingArguments
import joblib
import os
from pathlib import Path

# Only surface errors from the transformers library; warnings and info are hidden.
transformers.logging.set_verbosity_error()

# Select the compute device up front: CUDA when PyTorch can see a GPU, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment report: CUDA visibility, device name, and which torch build is loaded.
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device Name:", torch.cuda.get_device_name(0))
else:
    print("Device Name:", "CPU only")
print("torch version:", torch.__version__)
print("torch location:", torch.__file__)

print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU device name: {torch.cuda.get_device_name(0)}")

# 1. Load the dataset (full and reduced versions)
# The preprocessed CSV is an *input* of this notebook. Its directory is not
# created here: doing so would only hide a missing dataset behind an empty
# folder. A missing file is reported explicitly instead.
preprocessed_dir = Path("../data/preprocessed")
train_df_path = preprocessed_dir / "summary_results.csv"
if not train_df_path.is_file():
    raise FileNotFoundError(
        f"Preprocessed dataset not found at '{train_df_path}' "
        f"(resolved: '{train_df_path.resolve()}')."
    )
df = pd.read_csv(train_df_path)

# Both columns are read by the steps below; fail early with a clear message.
missing_columns = {"summary_snippet", "icd9_codes"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"'{train_df_path}' is missing required column(s): {sorted(missing_columns)}"
    )

# Reduced version for quick experiments: uncomment to train on a reproducible
# 3000-row subsample.
#df = df.sample(3000, random_state=42)

# 2. Preprocessing: clean `summary_snippet`
def clean_text(text: str) -> str:
    """Normalize the whitespace of a clinical-note snippet.

    Every run of whitespace characters (spaces, tabs, newlines, carriage
    returns) is collapsed into a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: The raw note text.

    Returns:
        The text with single spaces between tokens and no surrounding
        whitespace. An empty or whitespace-only input yields "".
    """
    # str.split() with no separator splits on any whitespace run and discards
    # empty pieces, so re-joining with one space collapses runs of any length.
    # A single `.replace("  ", " ")` pass would leave "a   b" as "a  b".
    return " ".join(text.split())

# Clean every note. Missing snippets become empty strings; a bare
# `astype(str)` would turn NaN into the literal text "nan".
df["summary_snippet"] = (
    df["summary_snippet"].fillna("").astype(str).apply(clean_text)
)

# 3. Convert `icd9_codes` to list and one-hot encode
# Codes are comma-separated. Each code is stripped so "a,b" and "a, b" yield
# the same labels, and empty pieces (including missing cells) are dropped
# rather than becoming a spurious "" / "nan" class.
labels = (
    df["icd9_codes"]
    .fillna("")
    .astype(str)
    .apply(lambda raw: [code.strip() for code in raw.split(",") if code.strip()])
)
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(labels)  # one row per note, one column per code
assert label_matrix.shape[0] == len(df), "label matrix and dataframe are misaligned"

# 4. Train/val split
# 90/10 split with a fixed seed so the validation set is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["summary_snippet"].tolist(), label_matrix, test_size=0.1, random_state=42
)

# 5. Tokenization (using Bio_ClinicalBERT or BERT)
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"  # or "bert-base-uncased"
#MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Truncate to 512 tokens but do NOT pad here. Padding the whole corpus to its
# longest sequence wastes memory and compute on short notes. Per-batch padding
# is left to the DataCollatorWithPadding that is handed to the Trainer.
train_encodings = tokenizer(train_texts, truncation=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, max_length=512)

# 6. Custom Dataset
class ICD9Dataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (it is iterated with ``.items()`` and each value is
            indexed by example position).
        labels: Indexable collection of per-example label vectors; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``.

        Each encoding field is converted to a tensor, and the label vector is
        added under ``"labels"`` as a float tensor.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

train_dataset = ICD9Dataset(train_encodings, train_labels)
val_dataset = ICD9Dataset(val_encodings, val_labels)

# 7. Load model
# One output logit per ICD-9 code known to the binarizer. `problem_type` is
# stated explicitly so the multi-label setup (independent sigmoid per code)
# is recorded in the model config instead of being inferred from the label
# dtype at the first training step.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=label_matrix.shape[1],
    problem_type="multi_label_classification",
)

results_dir = Path("../models/bert_icd9/results")
results_dir.mkdir(parents=True, exist_ok=True)
logging_dir = results_dir / "logs"

# 8. Training args (minimal args to fix compatibility issue)
# Only arguments whose names are stable across transformers versions are set;
# everything else keeps the library defaults.
training_args = TrainingArguments(
    output_dir=str(results_dir),
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir=str(logging_dir)
)

# Pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 9. Trainer
# NOTE(review): `eval_dataset` is supplied but no evaluation schedule or
# metric function is configured here, so validation is not run during
# training - confirm whether that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# 10. Train
trainer.train()

# NOTE(review): these paths are relative to the current working directory,
# whereas the training results above go to "../models/bert_icd9/results".
# Confirm which location downstream consumers expect.
model_dir = Path("models/bert_icd9/icd9_bert_model")
binarizer_path = Path("models/bert_icd9/icd9_label_binarizer.pkl")

model_dir.mkdir(parents=True, exist_ok=True)
# `binarizer_path` is a FILE. Calling mkdir on it would create a directory
# named "icd9_label_binarizer.pkl", and joblib.dump could then not write the
# pickle. Remove such a leftover (rmdir only succeeds on an empty directory)
# and create just the parent folder.
if binarizer_path.is_dir():
    binarizer_path.rmdir()
binarizer_path.parent.mkdir(parents=True, exist_ok=True)

# 11. Save model and label binarizer
# The tokenizer is stored next to the model so the directory can be reloaded
# on its own. The binarizer holds the column -> ICD-9 code mapping needed to
# decode predictions.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
joblib.dump(mlb, binarizer_path)

# 13. Streamlit App Example (optional UI)
# Save this in a separate file named app.py and run with: streamlit run app.py
# The triple-quoted string below is reference code only; nothing in this
# notebook executes it.
# Fix relative to the earlier draft: MultiLabelBinarizer.inverse_transform
# needs a 2-D array (it reads `.shape`). Wrapping the mask in a Python list
# raised "AttributeError: 'list' object has no attribute 'shape'". The
# probabilities are therefore kept as a (1, n_labels) array and passed directly.
"""
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

# Adjust these two paths to wherever the training notebook saved its artifacts.
MODEL_DIR = "./icd9_bert_model"
BINARIZER_PATH = "icd9_label_binarizer.pkl"

st.title("ICD-9 Code Predictor from Discharge Summary")

user_input = st.text_area("Paste a clinical note:", height=300)

if st.button("Predict"):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
    mlb = joblib.load(BINARIZER_PATH)

    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        # Keep the batch dimension: shape (1, n_labels).
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
    predicted = mlb.inverse_transform(probs > 0.5)[0]

    st.subheader("Predicted ICD-9 Codes:")
    st.write(predicted)
"""


CUDA Available: True
Device Name: NVIDIA GeForce GTX 1080 Ti
torch version: 2.6.0+cu126
torch location: C:\Users\Alex\Documents\GitHub\DEEP_ML_Project\venv\Lib\site-packages\torch\__init__.py
Using device: cuda
GPU device name: NVIDIA GeForce GTX 1080 Ti


  trainer = Trainer(


{'loss': 0.3871, 'grad_norm': 0.7656834721565247, 'learning_rate': 4.852148148148149e-05, 'epoch': 0.08888888888888889}
{'loss': 0.337, 'grad_norm': 0.8874655365943909, 'learning_rate': 4.7040000000000004e-05, 'epoch': 0.17777777777777778}
{'loss': 0.3256, 'grad_norm': 0.6774147152900696, 'learning_rate': 4.555851851851852e-05, 'epoch': 0.26666666666666666}
{'loss': 0.3189, 'grad_norm': 0.6823941469192505, 'learning_rate': 4.407703703703704e-05, 'epoch': 0.35555555555555557}
{'loss': 0.3223, 'grad_norm': 0.8993301391601562, 'learning_rate': 4.2595555555555554e-05, 'epoch': 0.4444444444444444}
{'loss': 0.3154, 'grad_norm': 0.8177610635757446, 'learning_rate': 4.111407407407408e-05, 'epoch': 0.5333333333333333}
{'loss': 0.3103, 'grad_norm': 0.7148011326789856, 'learning_rate': 3.9632592592592594e-05, 'epoch': 0.6222222222222222}
{'loss': 0.3039, 'grad_norm': 1.087775707244873, 'learning_rate': 3.815111111111112e-05, 'epoch': 0.7111111111111111}
{'loss': 0.2974, 'grad_norm': 0.72371256351

AttributeError: 'list' object has no attribute 'shape'

In [3]:
# 12. Inference Function (for later use)
def predict_icd9(text, threshold=0.5, max_length=512):
    """Predict the ICD-9 codes for a single clinical note.

    Relies on the notebook-level `model`, `tokenizer`, `mlb`, `device` and
    `clean_text` defined in earlier cells.

    Args:
        text: Raw note text. It is passed through `clean_text` before
            tokenization.
        threshold: A code is predicted when its sigmoid probability is
            strictly greater than this value. Defaults to 0.5.
        max_length: Token limit for truncation. Defaults to 512, the same
            limit applied when the training data was tokenized, so inference
            sees as much of the note as training did.

    Returns:
        A tuple with the ICD-9 codes whose probability exceeds `threshold`.
    """
    model.eval()      # disable dropout for deterministic predictions
    model.to(device)  # move model to GPU if available
    text = clean_text(text)
    # The returned batch is moved to the device once. It is a mapping, so it
    # can be unpacked straight into the model call.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_length
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Independent sigmoid per label; the array keeps its (1, n_labels) shape
    # because inverse_transform expects a 2-D indicator matrix.
    probs = torch.sigmoid(logits).cpu().numpy()
    predicted_labels = mlb.inverse_transform(probs > threshold)[0]
    return predicted_labels


# 14. Example Inference from Dataset
# Run the first note of the dataframe through `predict_icd9` and print the
# note, the predicted codes and the codes recorded in the dataset.
print("\nExample Inference:")
example_text = df["summary_snippet"].iloc[0]
example_true_labels = df["icd9_codes"].iloc[0]
predicted = predict_icd9(example_text)
print(
    example_text,
    f"Predicted ICD9 codes: {predicted}",
    f"True ICD9 codes: {example_true_labels}",
    sep="\n",
)


Example Inference:
Admission Date: [**2113-9-10**]    Discharge Date: [**2113-9-14**]  Service: [**Hospital Ward Name **]/MICU/SICU. HISTORY OF THE PRESENT ILLNESS: Mr. [**Known lastname 19388**] is an 81-year-old man recently discharged from [**Hospital1 188**] on [**2113-9-6**] after a 27 days hospitalization. Recent hospitalization was for a mitral valve replacement and a tricuspid valve repair, which resulted in a prolonged and PEG tube placement. Additionally, during that hospitalization, he developed a right sided loculated effusion for which he underwent a videoscopic thorascopy for removal. Of note, he was sent out on a course of Vancomycin for MRSA positive sputum culture on [**2113-8-22**]. Since the discharge from [**Hospital1 69**] four days ago, Mr. [**Known lastname 19388**] has had ongoing respiratory complained of shortness of breath with purulent sputum. He was cultured. On [**2113-9-9**], he developed leukocytosis to 14. Ceftazidime and Levofloxacin were added empiri