In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [12]:
import torch
print("Visible:", torch.cuda.device_count())

Visible: 1


In [13]:
print("Using device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Using device: 0
Device name: NVIDIA RTX A6000


In [14]:
import re
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments

In [3]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
[2K   [90

In [16]:
import json
from sklearn.model_selection import train_test_split

In [17]:
# Load the raw JSON data
with open("discharge_summaries.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [18]:
#Split: 90% train, 5% validation, 5% test
train_data, temp_data = train_test_split(data, test_size=0.10, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.50, random_state=42)

# Save each split to a seperate JSON file
with open("train.json", "w", encoding="utf-8") as file:
    json.dump(train_data, file, indent=2, ensure_ascii=False)
with open("validation.json", "w", encoding="utf-8") as file:
    json.dump(train_data, file, indent=2, ensure_ascii=False)
with open("test.json", "w", encoding="utf-8") as file:
    json.dump(train_data, file, indent=2, ensure_ascii=False)

print("Dataset split and saved as train.json, validation.json, and test.json.")

Dataset split and saved as train.json, validation.json, and test.json.


In [19]:
# Optional: convert to Hugging Face DatasetDict for use in transformers pipeline
from datasets import Dataset
from datasets import DatasetDict

dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data)
})

# Check the sizes
print("Train size:", len(dataset["train"]))
print("Validation size:", len(dataset["validation"]))
print("Test size:", len(dataset["test"]))

Train size: 1800
Validation size: 100
Test size: 100


In [11]:
pip install --upgrade transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m697.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.53.1
    Uninstalling transformers-4.53.1:
      Successfully uninstalled transformers-4.53.1
Successfully installed transformers-4.53.2
Note: you may need to restart the kernel to use updated packages.


In [20]:
print("Visible CUDA devices:", torch.cuda.device_count())

Visible CUDA devices: 1


In [21]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

torch.cuda.set_device(0)
print("Using device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

# Updated preprocessing function
def preprocess_record(example):
    # Convert input fields to a flat string
    fields = [
        f"Patient ID: {example.get('patient_id', '')}",
        f"Age: {example.get('age', '')}",
        f"Gender: {example.get('gender', '')}",
        f"Admission Date: {example.get('admission_date', '')}",
        f"Discharge Date: {example.get('discharge_date', '')}",
        f"Primary Diagnosis: {example.get('primary_diagnosis', '')}",
        f"Procedures: {', '.join(example.get('procedures', []))}",
        f"Medications: {', '.join(example.get('medications', []))}",
        f"Follow Up Instructions: {example.get('follow_up_instructions', '')}",
    ]
    
    input_text = " | ".join(fields).lower()
    target_text = example.get("summary", "").lower()

    # Optional cleanup (can keep as-is too)
    input_text = re.sub(r'[^a-z0-9\s.,:?-]', '', input_text)
    target_text = re.sub(r'[^a-z0-9\s.,:?-]', '', target_text)

    return {"input_text": input_text, "target_text": target_text}

# Apply preprocessing
processed_dataset = dataset.map(preprocess_record)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")

# Tokenization
def tokenize_function(example):
    inputs = tokenizer(
        example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    with tokenizer.as_target_tokenizer():
        targets = tokenizer(
            example["target_text"],
            truncation=True,
            padding="max_length",
            max_length=256,
            return_tensors="pt"
        )

    inputs["labels"] = targets["input_ids"].squeeze()
    return inputs

# Tokenize all splits
tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)

# Remove unused columns and set torch format
tokenized_dataset = tokenized_dataset.remove_columns(["input_text", "target_text"])
tokenized_dataset.set_format("torch")

# Split
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["validation"]

# Load BART model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large", use_safetensors=True)

# Freeze encoder
for param in model.model.encoder.parameters():
    param.requires_grad = False

# Optionally freeze decoder except last layer
for layer in model.model.decoder.layers[:-1]:
    for param in layer.parameters():
        param.requires_grad = False

# Training arguments
training_args = TrainingArguments(
    output_dir="./bart-discharge-summary",
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=100,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train
trainer.train()

Using device: 0
Device name: NVIDIA RTX A6000


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
100,8.003
200,5.2775
300,5.1188
400,5.0484
500,5.0192
600,4.9943




TrainOutput(global_step=675, training_loss=5.509608832465278, metrics={'train_runtime': 412.6271, 'train_samples_per_second': 13.087, 'train_steps_per_second': 1.636, 'total_flos': 5851182425702400.0, 'train_loss': 5.509608832465278, 'epoch': 3.0})

In [22]:
# Save the fine-tuned model
model.save_pretrained("./bart-clinical-letter-finetuned-final")
tokenizer.save_pretrained("./bart-clinical-letter-finetuned-final")

('./bart-clinical-letter-finetuned-final/tokenizer_config.json',
 './bart-clinical-letter-finetuned-final/special_tokens_map.json',
 './bart-clinical-letter-finetuned-final/vocab.json',
 './bart-clinical-letter-finetuned-final/merges.txt',
 './bart-clinical-letter-finetuned-final/added_tokens.json',
 './bart-clinical-letter-finetuned-final/tokenizer.json')

In [23]:
import torch
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./bart-clinical-letter-finetuned-final")
tokenizer = AutoTokenizer.from_pretrained("./bart-clinical-letter-finetuned-final")

# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Same preprocessing as training
def preprocess_record(example):
    fields = [
        f"Patient ID: {example.get('patient_id', '')}",
        f"Age: {example.get('age', '')}",
        f"Gender: {example.get('gender', '')}",
        f"Admission Date: {example.get('admission_date', '')}",
        f"Discharge Date: {example.get('discharge_date', '')}",
        f"Primary Diagnosis: {example.get('primary_diagnosis', '')}",
        f"Procedures: {', '.join(example.get('procedures', []))}",
        f"Medications: {', '.join(example.get('medications', []))}",
        f"Follow Up Instructions: {example.get('follow_up_instructions', '')}",
    ]
    input_text = " | ".join(fields).lower()
    input_text = re.sub(r'[^a-z0-9\s.,:?-]', '', input_text)
    return input_text

# Inference function
def generate_summary(example, model, tokenizer, device=device):
    input_text = preprocess_record(example)
    
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        inputs["input_ids"],
        max_length=256,
        num_beams=4,
        early_stopping=True
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example from test set
sample_example = processed_dataset["test"][2]
generated = generate_summary(sample_example, model, tokenizer)

print("\nGenerated Summary:", generated)
print("\nReference Summary:", sample_example['target_text'])


Generated Summary: 82-year-old female admitted for stroke. procedures performed include ct scan, thrombolysis, mri brain. treated with clopidogrel, atorvastatin, amlodipine. discharged in stable condition. follow up in 1 week with primary physician. adhere to prescribed medications.

Reference Summary: 82-year-old female admitted for stroke. procedures performed include ct scan, thrombolysis, mri brain. treated with clopidogrel, atorvastatin, amlodipine. discharged in stable condition. follow up in 1 week with primary physician. adhere to prescribed medications.


In [25]:
import pandas as pd
import torch
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./bart-clinical-letter-finetuned-final")
tokenizer = AutoTokenizer.from_pretrained("./bart-clinical-letter-finetuned-final")

# Set device explicitly
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Same preprocessing function used during training
def preprocess_record(example):
    fields = [
        f"Patient ID: {example.get('patient_id', '')}",
        f"Age: {example.get('age', '')}",
        f"Gender: {example.get('gender', '')}",
        f"Admission Date: {example.get('admission_date', '')}",
        f"Discharge Date: {example.get('discharge_date', '')}",
        f"Primary Diagnosis: {example.get('primary_diagnosis', '')}",
        f"Procedures: {', '.join(example.get('procedures', []))}",
        f"Medications: {', '.join(example.get('medications', []))}",
        f"Follow Up Instructions: {example.get('follow_up_instructions', '')}",
    ]
    input_text = " | ".join(fields).lower()
    input_text = re.sub(r'[^a-z0-9\s.,:?-]', '', input_text)
    return input_text

# Inference function
def generate_soap(example, model, tokenizer, device=device):
    input_text = preprocess_record(example)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=256,
            num_beams=4,
            early_stopping=True
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True), input_text

# Generate and collect results for 100 samples
results = []

for idx in range(100):
    sample = processed_dataset["test"][idx]
    reference_soap = sample.get("summary", "")
    generated_soap, input_text = generate_soap(sample, model, tokenizer)

    results.append({
        "Structured Input": input_text,
        "Reference Summary": reference_soap,
        "Generated Summary": generated_soap
    })

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv("transfer_learning_results.csv", index=False)

print("Results saved to 'transfer_learning_results.csv' ✅")

Results saved to 'transfer_learning_results.csv' ✅


In [26]:
import evaluate
import pandas as pd

# Load your saved CSV
df = pd.read_csv("transfer_learning_results.csv")

# Extract generated and reference texts
generated_list = df["Generated Summary"].tolist()
reference_list = df["Reference Summary"].tolist()

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# Compute ROUGE
rouge_result = rouge.compute(predictions=generated_list, references=reference_list)
print("\n🔍 ROUGE Results:")
for key, value in rouge_result.items():
    print(f"{key}: {value:.4f}")

# Compute BLEU (expects references as list of lists)
bleu_result = bleu.compute(predictions=generated_list, references=[[ref] for ref in reference_list])
print("\n🔍 BLEU Result:")
print(f"BLEU score: {bleu_result['bleu']:.4f}")

# Compute BERTScore
bertscore_result = bertscore.compute(predictions=generated_list, references=reference_list, lang="en")
bertscore_avg = {
    "precision": sum(bertscore_result["precision"]) / len(bertscore_result["precision"]),
    "recall": sum(bertscore_result["recall"]) / len(bertscore_result["recall"]),
    "f1": sum(bertscore_result["f1"]) / len(bertscore_result["f1"])
}
print("\n🔍 BERTScore Averages:")
for key, value in bertscore_avg.items():
    print(f"{key}: {value:.4f}")


🔍 ROUGE Results:
rouge1: 1.0000
rouge2: 1.0000
rougeL: 1.0000
rougeLsum: 1.0000

🔍 BLEU Result:
BLEU score: 0.4639


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔍 BERTScore Averages:
precision: 0.9588
recall: 0.9550
f1: 0.9569
