In [2]:
import pandas as pd

# Read the mtsamples CSV (the path is relative to the current working directory)
df = pd.read_csv(filepath_or_buffer="../data/mtsamples.csv")

# Quick sanity check: print the (rows, columns) tuple, then let the notebook
# render the first five rows as the cell's output
print(df.shape)
df.head(n=5)

(4999, 6)


Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [3]:
# Steps 1 and 2 in one expression: discard every row whose 'transcription'
# is missing, then narrow the frame to the relevant columns
# (extend the column list if more fields are needed later)
df = df.dropna(subset=['transcription'])[['medical_specialty', 'transcription']]

# Step 3: Clean the text (remove line breaks and extra spaces)
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Line breaks, tabs and repeated spaces each become one space, and leading
    and trailing whitespace is removed.

    Parameters
    ----------
    text : str, None or float NaN
        The raw text. ``None`` and float ``NaN`` are treated as "no text"
        instead of raising ``AttributeError``.

    Returns
    -------
    str
        The whitespace-normalised text, or ``""`` when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself, so `text != text`
    # detects it without needing any extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # str.split() with no arguments splits on any whitespace run and ignores
    # leading/trailing whitespace, so a separate strip() is unnecessary.
    return " ".join(text.split())

# Run the whitespace cleaner over each transcription, element by element
df['transcription'] = df['transcription'].map(clean_text)

# Render the first five cleaned rows as the cell's output
df.head(n=5)


Unnamed: 0,medical_specialty,transcription
0,Allergy / Immunology,"SUBJECTIVE:, This 23-year-old white female pre..."
1,Bariatrics,"PAST MEDICAL HISTORY:, He has difficulty climb..."
2,Bariatrics,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ..."
3,Cardiovascular / Pulmonary,"2-D M-MODE: , ,1. Left atrial enlargement with..."
4,Cardiovascular / Pulmonary,1. The left ventricular cavity size and wall t...


In [11]:
!pip install transformers
!pip install torch

Collecting torch
  Using cached torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached torch-2.8.0-cp312-cp312-win_amd64.whl (241.3 MB)
Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Using cached networkx-3.5-py3-none-any.whl (2.0 MB)
Installing collected packages: mpmath, sympy, networkx, torch

   ---------------------------------------- 0/4 [mpmath]
   ---------------------------------------- 0/4 [mpmath]
   ---------------------------------------- 0/4 [mpmath]
   ---------- ----------------------------- 1/4 [sympy]
   ---------- ----------------------------- 1/4 [sympy]
   ---------- ------------------

In [13]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ------------------------------- -------- 786.4/992.0 kB 6.6 MB/s eta 0:00:01
   ---------------------------------------- 992.0/992.0 kB 3.9 MB/s  0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from tqdm import tqdm

# Choose the compute device: CUDA when torch reports it as available, otherwise CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained 't5-small' tokenizer and model, then move the model
# onto the chosen device
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_

In [2]:
# Smoke-test the summarizer on a short hand-written passage; the "summarize: "
# prefix is part of the text that gets tokenized
text = (
    "summarize: The human heart is a muscular organ that pumps blood throughout the body. "
    "It has four chambers: two upper atria and two lower ventricles. "
    "The heart ensures the circulation of oxygen and nutrients."
)

# Encode the text to a "pt" tensor (truncated to at most 512 tokens) and put it
# on the same device the model was moved to
inputs = tokenizer.encode(
    text,
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(device)

# Generate with 4 beams, bounding the output between 10 and 50 tokens
summary_ids = model.generate(
    inputs,
    num_beams=4,
    min_length=10,
    max_length=50,
    length_penalty=2.0,
    early_stopping=True,
)

# Convert the first generated sequence back to text, dropping special tokens
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:", summary)


Summary: the human heart has four chambers: two upper atria and two lower ventricles. the heart ensures the circulation of oxygen and nutrients.
