In [2]:
import pandas as pd

import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration

from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import nltk

from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

from gensim.models import Word2Vec

from torch.optim import AdamW

In [4]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4dfb911f491ed603a8a789ef26ebd18794290206cb411ba64e61fb3e0f9a5429
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from datasets import Dataset

# List of medical stopwords (e.g., words that are essential in the medical field)
medical_stopwords = set([
    "hemoglobin", "RBC", "WBC", "platelet", "count", "level", "normal", "anemia", "blood", 
    "cells", "oxygen", "immune", "system", "function", "test", "range", "cells/µL", "g/dL", 
    "symptoms", "fatigue", "weakness", "indicates", "may", "suggests", "mild", "serious", "evaluation", "suggest", "levels"
])

# Additional stopwords to retain for context
additional_stopwords = set([
    "is", "are", "the", "in", "at", "of", "to", "a", "an", "and", "or", "on", "for", "with"
])

# Combine medical stopwords with general stopwords
stop_words = stopwords.words('english')
all_stopwords = set(stop_words) | medical_stopwords | additional_stopwords

# Function to preprocess medical reports
def preprocess_medical_text(text):
    # Remove newlines and extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove punctuation, but keep periods, commas, etc.
    tokens = [word for word in tokens if word not in string.punctuation]
    
    # Remove stopwords, but keep medical stopwords and numbers
    tokens = [word for word in tokens if word.lower() not in all_stopwords]
    
    # Reconstruct the text after tokenization and removal
    processed_text = ' '.join(tokens)
    
    return processed_text

# Function to preprocess the dataset
def preprocess_dataset(df):
    # Drop the 'SNO.' column if present
    if 'SNO.' in df.columns:
        df = df.drop(columns=['SNO.'])
    
    # Remove spaces from column names
    df.columns = df.columns.str.replace(' ', '')
    
    # Preprocess the 'SUMMARY' column
    df['SUMMARY'] = df['SUMMARY'].apply(preprocess_medical_text)
    
    return df

# Sample dataset (replace with your actual dataset)
data = pd.read_excel("/kaggle/input/data-set/PROJECT_DATA.xlsx")

# Create DataFrame from the sample data
df = pd.DataFrame(data)

# Preprocess the dataset
processed_df = preprocess_dataset(df)

# Format input text (lab values) and target text (summary) for T5
processed_df['input_text'] = processed_df.apply(
    lambda row: f"MCV: {row['MCV']} MCHC: {row['MCHC']} HB: {row['HB']} RBC: {row['RBC']} WBC: {row['WBC']} "
                f"PLT: {row['PLT']} RDWCV: {row['RDWCV']} NEUTRO: {row['NEUTRO']} LYMPHO: {row['LYMPHO']}", axis=1
)
processed_df['target_text'] = processed_df['SUMMARY']

# Convert the DataFrame to a Dataset format suitable for Hugging Face
dataset = Dataset.from_pandas(processed_df[['input_text', 'target_text']])

# Split into training and test datasets
train_test_split = dataset.train_test_split(test_size=0.2)
train_data = train_test_split['train']
test_data = train_test_split['test']

# Load the tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the data
def tokenize_data(example):
    inputs = tokenizer(example['input_text'], max_length=400, truncation=True, padding="max_length")
    targets = tokenizer(example['target_text'], max_length=400, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

train_data = train_data.map(tokenize_data, batched=True)
test_data = test_data.map(tokenize_data, batched=True)

# Set format for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Prepare DataLoader for training
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_data, batch_size=2, shuffle=True)

# Prepare DataLoader for evaluation
test_dataloader = DataLoader(test_data, batch_size=2)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Define optimizer (AdamW is commonly used for transformers)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Training loop on GPU
model.train()

for epoch in range(200):
    total_loss = 0

    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()

        # Move batch to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}')

# Save the model after training
model.save_pretrained("./t5_PROJECT_100SCRATCH")
tokenizer.save_pretrained("./t5_PROJECT_100SCRATCH")

print("Training complete and model saved!")


Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/57 [00:00<?, ? examples/s]



Using device: cuda


100%|██████████| 113/113 [00:14<00:00,  7.63it/s]


Epoch 1, Average Loss: 3.2215


100%|██████████| 113/113 [00:14<00:00,  7.60it/s]


Epoch 2, Average Loss: 1.4591


100%|██████████| 113/113 [00:15<00:00,  7.53it/s]


Epoch 3, Average Loss: 1.0212


100%|██████████| 113/113 [00:15<00:00,  7.47it/s]


Epoch 4, Average Loss: 0.8740


100%|██████████| 113/113 [00:15<00:00,  7.42it/s]


Epoch 5, Average Loss: 0.7759


100%|██████████| 113/113 [00:15<00:00,  7.38it/s]


Epoch 6, Average Loss: 0.7042


100%|██████████| 113/113 [00:15<00:00,  7.34it/s]


Epoch 7, Average Loss: 0.6558


100%|██████████| 113/113 [00:15<00:00,  7.30it/s]


Epoch 8, Average Loss: 0.6131


100%|██████████| 113/113 [00:15<00:00,  7.27it/s]


Epoch 9, Average Loss: 0.5821


100%|██████████| 113/113 [00:15<00:00,  7.25it/s]


Epoch 10, Average Loss: 0.5575


100%|██████████| 113/113 [00:15<00:00,  7.22it/s]


Epoch 11, Average Loss: 0.5348


100%|██████████| 113/113 [00:15<00:00,  7.20it/s]


Epoch 12, Average Loss: 0.5159


100%|██████████| 113/113 [00:15<00:00,  7.19it/s]


Epoch 13, Average Loss: 0.4998


100%|██████████| 113/113 [00:15<00:00,  7.19it/s]


Epoch 14, Average Loss: 0.4839


100%|██████████| 113/113 [00:15<00:00,  7.16it/s]


Epoch 15, Average Loss: 0.4779


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 16, Average Loss: 0.4641


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 17, Average Loss: 0.4548


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 18, Average Loss: 0.4421


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 19, Average Loss: 0.4332


100%|██████████| 113/113 [00:15<00:00,  7.11it/s]


Epoch 20, Average Loss: 0.4234


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 21, Average Loss: 0.4161


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 22, Average Loss: 0.4078


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 23, Average Loss: 0.4035


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 24, Average Loss: 0.3945


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 25, Average Loss: 0.3870


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 26, Average Loss: 0.3820


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 27, Average Loss: 0.3765


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 28, Average Loss: 0.3698


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 29, Average Loss: 0.3632


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 30, Average Loss: 0.3632


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 31, Average Loss: 0.3555


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 32, Average Loss: 0.3507


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 33, Average Loss: 0.3494


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 34, Average Loss: 0.3449


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 35, Average Loss: 0.3401


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 36, Average Loss: 0.3364


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 37, Average Loss: 0.3318


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 38, Average Loss: 0.3287


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 39, Average Loss: 0.3260


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 40, Average Loss: 0.3214


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 41, Average Loss: 0.3168


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 42, Average Loss: 0.3151


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 43, Average Loss: 0.3130


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 44, Average Loss: 0.3081


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 45, Average Loss: 0.3097


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 46, Average Loss: 0.3038


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 47, Average Loss: 0.3010


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 48, Average Loss: 0.2992


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 49, Average Loss: 0.2935


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 50, Average Loss: 0.2922


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 51, Average Loss: 0.2918


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 52, Average Loss: 0.2892


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 53, Average Loss: 0.2874


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 54, Average Loss: 0.2819


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 55, Average Loss: 0.2812


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 56, Average Loss: 0.2779


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 57, Average Loss: 0.2749


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 58, Average Loss: 0.2740


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 59, Average Loss: 0.2719


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 60, Average Loss: 0.2705


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 61, Average Loss: 0.2665


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 62, Average Loss: 0.2639


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 63, Average Loss: 0.2627


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 64, Average Loss: 0.2605


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 65, Average Loss: 0.2598


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 66, Average Loss: 0.2579


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 67, Average Loss: 0.2561


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 68, Average Loss: 0.2520


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 69, Average Loss: 0.2515


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 70, Average Loss: 0.2486


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 71, Average Loss: 0.2475


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 72, Average Loss: 0.2461


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 73, Average Loss: 0.2449


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 74, Average Loss: 0.2423


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 75, Average Loss: 0.2406


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 76, Average Loss: 0.2385


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 77, Average Loss: 0.2359


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 78, Average Loss: 0.2354


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 79, Average Loss: 0.2330


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 80, Average Loss: 0.2308


100%|██████████| 113/113 [00:15<00:00,  7.11it/s]


Epoch 81, Average Loss: 0.2307


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 82, Average Loss: 0.2304


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 83, Average Loss: 0.2281


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 84, Average Loss: 0.2246


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 85, Average Loss: 0.2226


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 86, Average Loss: 0.2236


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 87, Average Loss: 0.2223


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 88, Average Loss: 0.2205


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 89, Average Loss: 0.2193


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 90, Average Loss: 0.2159


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 91, Average Loss: 0.2171


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 92, Average Loss: 0.2151


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 93, Average Loss: 0.2133


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 94, Average Loss: 0.2106


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 95, Average Loss: 0.2103


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 96, Average Loss: 0.2080


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 97, Average Loss: 0.2091


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 98, Average Loss: 0.2053


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 99, Average Loss: 0.2058


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 100, Average Loss: 0.2025


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 101, Average Loss: 0.2021


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 102, Average Loss: 0.2002


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 103, Average Loss: 0.1992


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 104, Average Loss: 0.1970


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 105, Average Loss: 0.1963


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 106, Average Loss: 0.1976


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 107, Average Loss: 0.1960


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 108, Average Loss: 0.1942


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 109, Average Loss: 0.1939


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 110, Average Loss: 0.1923


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 111, Average Loss: 0.1908


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 112, Average Loss: 0.1890


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 113, Average Loss: 0.1874


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 114, Average Loss: 0.1867


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 115, Average Loss: 0.1854


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 116, Average Loss: 0.1844


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 117, Average Loss: 0.1838


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 118, Average Loss: 0.1815


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 119, Average Loss: 0.1802


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 120, Average Loss: 0.1806


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 121, Average Loss: 0.1788


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 122, Average Loss: 0.1778


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 123, Average Loss: 0.1746


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 124, Average Loss: 0.1769


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 125, Average Loss: 0.1740


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 126, Average Loss: 0.1729


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 127, Average Loss: 0.1725


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 128, Average Loss: 0.1706


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 129, Average Loss: 0.1714


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 130, Average Loss: 0.1700


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 131, Average Loss: 0.1666


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 132, Average Loss: 0.1656


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 133, Average Loss: 0.1671


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 134, Average Loss: 0.1618


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 135, Average Loss: 0.1621


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 136, Average Loss: 0.1628


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 137, Average Loss: 0.1614


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 138, Average Loss: 0.1608


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 139, Average Loss: 0.1607


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 140, Average Loss: 0.1590


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 141, Average Loss: 0.1580


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 142, Average Loss: 0.1559


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 143, Average Loss: 0.1549


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 144, Average Loss: 0.1547


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 145, Average Loss: 0.1542


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 146, Average Loss: 0.1504


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 147, Average Loss: 0.1548


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 148, Average Loss: 0.1505


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 149, Average Loss: 0.1514


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 150, Average Loss: 0.1491


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 151, Average Loss: 0.1487


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 152, Average Loss: 0.1495


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 153, Average Loss: 0.1460


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 154, Average Loss: 0.1472


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 155, Average Loss: 0.1440


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 156, Average Loss: 0.1435


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 157, Average Loss: 0.1420


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 158, Average Loss: 0.1420


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 159, Average Loss: 0.1407


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 160, Average Loss: 0.1398


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 161, Average Loss: 0.1401


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 162, Average Loss: 0.1399


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 163, Average Loss: 0.1367


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 164, Average Loss: 0.1398


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 165, Average Loss: 0.1363


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 166, Average Loss: 0.1357


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 167, Average Loss: 0.1365


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 168, Average Loss: 0.1324


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 169, Average Loss: 0.1338


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 170, Average Loss: 0.1324


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 171, Average Loss: 0.1319


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 172, Average Loss: 0.1289


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 173, Average Loss: 0.1290


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 174, Average Loss: 0.1303


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 175, Average Loss: 0.1281


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 176, Average Loss: 0.1276


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 177, Average Loss: 0.1267


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 178, Average Loss: 0.1271


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 179, Average Loss: 0.1272


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 180, Average Loss: 0.1238


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 181, Average Loss: 0.1238


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 182, Average Loss: 0.1241


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 183, Average Loss: 0.1226


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 184, Average Loss: 0.1212


100%|██████████| 113/113 [00:15<00:00,  7.07it/s]


Epoch 185, Average Loss: 0.1215


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 186, Average Loss: 0.1191


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 187, Average Loss: 0.1159


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 188, Average Loss: 0.1182


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 189, Average Loss: 0.1177


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 190, Average Loss: 0.1188


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 191, Average Loss: 0.1165


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 192, Average Loss: 0.1143


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 193, Average Loss: 0.1143


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 194, Average Loss: 0.1143


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 195, Average Loss: 0.1123


100%|██████████| 113/113 [00:15<00:00,  7.10it/s]


Epoch 196, Average Loss: 0.1125


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 197, Average Loss: 0.1122


100%|██████████| 113/113 [00:15<00:00,  7.08it/s]


Epoch 198, Average Loss: 0.1105


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 199, Average Loss: 0.1108


100%|██████████| 113/113 [00:15<00:00,  7.09it/s]


Epoch 200, Average Loss: 0.1116
Training complete and model saved!


In [9]:
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import torch
from tqdm import tqdm

# Function to evaluate the model
def evaluate_model(model, test_dataloader, device):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    predictions, references = [], []
    
    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

        # Decode the predicted token IDs to text
        pred_ids = outputs.logits.argmax(dim=-1)
        decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Store the predictions and references for BLEU/ROUGE
        predictions.extend(decoded_preds)
        references.extend(decoded_labels)

    # Compute average loss
    avg_loss = total_loss / len(test_dataloader)

    # Calculate BLEU score (using NLTK's corpus_bleu)
    bleu_score = corpus_bleu([[ref.split()] for ref in references], [pred.split() for pred in predictions])

    # Calculate ROUGE score (using rouge-score package)
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for ref, pred in zip(references, predictions):
        scores = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)
    
    # Calculate average ROUGE scores
    avg_rouge_scores = {key: sum(value) / len(value) for key, value in rouge_scores.items()}

    return avg_loss, bleu_score, avg_rouge_scores

# Evaluate the model on the test dataset
avg_loss, bleu_score, avg_rouge_scores = evaluate_model(model, test_dataloader, device)

# Print the evaluation metrics
print(f"Evaluation Results:")
print(f"Average Loss: {avg_loss:.4f}")
print(f"BLEU Score: {bleu_score:.4f}")
print(f"ROUGE Scores: {avg_rouge_scores}")


100%|██████████| 29/29 [00:07<00:00,  4.06it/s]


Evaluation Results:
Average Loss: 0.3406
BLEU Score: 0.2970
ROUGE Scores: {'rouge1': 0.6737388419617204, 'rouge2': 0.4781325274534715, 'rougeL': 0.6462441021264972}


In [10]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the trained model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5_PROJECT_100SCRATCH")
tokenizer = T5Tokenizer.from_pretrained("./t5_PROJECT_100SCRATCH")

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Function to generate summary for new input text
def generate_summary(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=400, truncation=True, padding="max_length")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Generate the summary (text) using the trained model
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=200)
    
    # Decode the generated tokens to get the summary
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary

# Example input text (lab values)
input_text = "MCV: 85 MCHC: 33 HB: 13.5 RBC: 4.5 WBC: 7.0 PLT: 250 RDWCV: 15 NEUTRO: 60 LYMPHO: 30"

# Generate the summary for the input text
summary = generate_summary(input_text)
print("Generated Summary:", summary)


Using device: cuda
Generated Summary: MCV 85 fL 13.5 g/dL Platelets low 250 thousand cells/L could affect clotting ability RDW-CV 15 variability red cell sizes Severe requires urgent medical address potential health risks


In [12]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the trained model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5_PROJECT_100SCRATCH")
tokenizer = T5Tokenizer.from_pretrained("./t5_PROJECT_100SCRATCH")

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Function to generate summary for new input text
def generate_summary(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=900, truncation=True, padding="max_length")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Generate the summary (text) using the trained model
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=200)
    
    # Decode the generated tokens to get the summary
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return summary

# Example input text (lab values)
input_text = "MCV: 85 MCHC: 33 HB: 9.5 RBC: 2.5 WBC: 7.0 PLT: 750 RDWCV: 15 NEUTRO: 60 LYMPHO: 30"

# Generate the summary for the input text
summary = generate_summary(input_text)
print("Generated Summary:", summary)


Using device: cuda
Generated Summary: 9.5 g/dL RBC 2.5 million cells/L indicating good transport WBC 7 thousand cells/L neutrophils 60 lymphocytes 30 750 thousand cells/L adequate clotting RDW-CV 15 variation red cell sizes Overall profile shows good health minor concerns regarding
