In [None]:
!pip install transformers datasets scikit-learn nltk --quiet


In [None]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Process-wide switches, applied in a single call:
#   TOKENIZERS_PARALLELISM -> "false"
#   WANDB_DISABLED         -> "true"
os.environ.update(
    TOKENIZERS_PARALLELISM="false",
    WANDB_DISABLED="true",
)

In [None]:
def preprocess_text(text):
    """Normalise free text into lowercase, space-separated alphanumeric words.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example a
            NaN from pandas or ``None``) is treated as missing.

    Returns:
        The cleaned string: lowercased, with every character that is neither
        alphanumeric nor whitespace replaced by a space, and all whitespace
        runs (newlines included) collapsed to single spaces with no leading
        or trailing space. Returns ``""`` for non-string input.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Newlines are whitespace, so they survive this filter and are then
        # collapsed by the split/join below together with every other gap.
        filtered = ''.join(
            ch if (ch.isalnum() or ch.isspace()) else ' '
            for ch in lowered
        )
        words = filtered.split()
        return ' '.join(words)
    return ""


In [None]:
# Read the train / test CSV files from the mounted Drive folder.
train_df = pd.read_csv('/content/drive/MyDrive/Hack-athons/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Hack-athons/test.csv')

# Clean every text column with preprocess_text. The training frame has both
# the prompt and the clinician response; for the test frame only the prompt
# is cleaned here.
for text_column in ('Prompt', 'Clinician'):
    train_df[text_column] = train_df[text_column].apply(preprocess_text)
test_df['Prompt'] = test_df['Prompt'].apply(preprocess_text)

Data Augmentation via Paraphrasing

In [None]:
from tqdm import tqdm

# Load tokenizer and model for paraphrasing
paraphrase_tokenizer = T5Tokenizer.from_pretrained('t5-base')
paraphrase_model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Beam-search generation on CPU is very slow, so run on the GPU when one is
# available. The inputs are moved to the same device inside paraphrase_text.
paraphrase_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
paraphrase_model.to(paraphrase_device)
paraphrase_model.eval()

# NOTE(review): the stock 't5-base' checkpoint is used with a "paraphrase:"
# prefix; confirm that this checkpoint actually produces useful paraphrases
# for that prefix, otherwise swap in a paraphrase-tuned checkpoint.


def paraphrase_text(text, num_return_sequences=1):
    """Generate paraphrase candidates for a single piece of text.

    Args:
        text: The text to paraphrase.
        num_return_sequences: How many candidates to return.

    Returns:
        A list of ``num_return_sequences`` decoded strings (special tokens
        removed).
    """
    # The tokenizer appends the end-of-sequence token itself, so it is not
    # written into the prompt by hand.
    input_text = f"paraphrase: {text}"
    encoding = paraphrase_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(paraphrase_model.device)

    with torch.no_grad():
        outputs = paraphrase_model.generate(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            max_length=512,
            num_return_sequences=num_return_sequences,
            # Beam search cannot return more sequences than it has beams.
            num_beams=max(5, num_return_sequences),
        )
    return paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Augment training data: keep every original (prompt, response) pair and add
# its paraphrased prompt(s) with the same response.
augmented_data = []
failed_paraphrases = 0

prompt_response_pairs = zip(train_df['Prompt'], train_df['Clinician'])
for prompt, clinician in tqdm(prompt_response_pairs, total=len(train_df)):
    augmented_data.append({'Prompt': prompt, 'Clinician': clinician})
    try:
        paraphrased = paraphrase_text(prompt, num_return_sequences=1)
    except Exception as e:
        # Best effort: a failed paraphrase only costs the extra sample, but
        # it is reported instead of being dropped silently.
        failed_paraphrases += 1
        tqdm.write(f"Paraphrasing failed ({type(e).__name__}: {e}); keeping the original prompt only.")
        continue
    # Skip blank generations; they would become empty training inputs.
    augmented_data.extend(
        {'Prompt': para, 'Clinician': clinician}
        for para in paraphrased
        if para.strip()
    )

# Explicit columns keep the frame well-formed even when there were no rows.
augmented_df = pd.DataFrame(augmented_data, columns=['Prompt', 'Clinician'])

if failed_paraphrases:
    print(f"Paraphrasing failed for {failed_paraphrases} of {len(train_df)} prompts.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

 57%|█████▋    | 229/400 [45:08<28:10,  9.88s/it]

Load Tokenizer and Model for Training

In [None]:
# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Multi-GPU: the model is intentionally NOT wrapped in torch.nn.DataParallel
# here. It is handed to the Hugging Face Trainer further down, which expects
# the bare model and takes care of multi-GPU execution on its own.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")


Tokenization Function

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        examples: Batch mapping with a ``'Prompt'`` list (source texts) and a
            ``'Clinician'`` list (target texts).

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` for the prompts
        (prefixed with ``"summarize: "``, padded/truncated to 512 tokens) and
        ``'labels'`` for the targets (padded/truncated to 128 tokens), where
        every padding position in the labels is set to ``-100``.
    """
    inputs = tokenizer(
        ["summarize: " + text for text in examples['Prompt']],
        max_length=512,
        padding='max_length',
        truncation=True
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    targets = tokenizer(
        text_target=examples['Clinician'],
        max_length=128,
        padding='max_length',
        truncation=True
    )
    # -100 is the index the loss ignores. Without this the model is trained
    # to predict the padding that fills most of each 128-token label row.
    # compute_metrics maps -100 back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    labels = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets['input_ids']
    ]
    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels
    }


Prepare Datasets

In [None]:
# Convert to HuggingFace Dataset. preserve_index=False keeps a non-default
# pandas index from turning into an extra dataset column.
train_dataset = Dataset.from_pandas(augmented_df[['Prompt', 'Clinician']], preserve_index=False)
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['Prompt', 'Clinician'])

# Split dataset. The seed makes the train/validation split reproducible
# across kernel restarts.
# NOTE(review): the split happens after augmentation, so a prompt and its
# paraphrase can land on different sides of the split; validation scores are
# therefore likely optimistic.
train_val_split = train_dataset.train_test_split(test_size=0.01, seed=42)
train_dataset = train_val_split['train']
eval_dataset = train_val_split['test']


Evaluation Metrics

In [None]:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

def compute_metrics(eval_pred):
    """Compute exact-match accuracy and mean sentence BLEU for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be
            token ids, raw logits with a trailing vocabulary axis, or a tuple
            of model outputs whose first element is the logits. ``labels``
            are token ids in which ``-100`` marks ignored positions.

    Returns:
        ``{"accuracy": ..., "bleu": ...}`` where accuracy is the fraction of
        decoded predictions that equal their decoded reference exactly and
        bleu is the mean smoothed sentence-level BLEU over whitespace tokens.
    """
    predictions, labels = eval_pred
    labels = np.asarray(labels)

    # Model outputs can arrive as a tuple; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    if predictions.ndim == 3:
        # Raw logits (batch, seq, vocab): take the most likely token id at
        # each position before decoding.
        predictions = predictions.argmax(axis=-1)
        if predictions.shape == labels.shape:
            # Positions the loss ignores carry arbitrary predictions; blank
            # them so they do not end up in the decoded text.
            predictions = np.where(labels != -100, predictions, tokenizer.pad_token_id)

    # -100 is not a valid token id and must not reach the tokenizer.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score. Smoothing keeps short outputs with no higher-order
    # n-gram overlap from collapsing to 0 and flooding the log with warnings.
    smoothing = SmoothingFunction().method1
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    avg_bleu = float(np.mean(bleu_scores)) if bleu_scores else 0.0

    # Compute accuracy (exact string match between prediction and reference)
    acc = accuracy_score(decoded_labels, decoded_preds) if decoded_labels else 0.0
    return {"accuracy": acc, "bleu": avg_bleu}


Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=6,
    # Changed evaluation_strategy to eval_strategy
    eval_strategy='epoch',
    save_strategy='epoch',
    # The recorded run had 150 optimizer steps in total (25 per epoch), so a
    # logging interval of 100 left the per-epoch training loss stale.
    logging_steps=10,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Warm up over the first 10% of training. A fixed 500 warmup steps was
    # longer than the whole run, so the learning rate never reached its
    # configured value.
    warmup_ratio=0.1,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
    report_to='none',
    logging_dir='./logs',
    logging_first_step=True,
    prediction_loss_only=False
)

Trainer Setup

In [None]:
# Trainer needs the bare model, so a torch.nn.DataParallel wrapper (if one was
# applied when the model was loaded) is peeled off here, the same way
# generate_predictions does it.
trainer = Trainer(
    model=model.module if isinstance(model, torch.nn.DataParallel) else model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)


Train the Model

In [None]:
# Run fine-tuning. trainer.train() is deliberately left as the cell's last
# expression so its return value is displayed below the progress table.
print("Starting training...")
trainer.train()


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Bleu
1,5.51,5.030037,0.0,0.109048
2,5.51,4.887677,0.0,0.109048
3,5.51,4.63896,0.0,0.109325
4,5.203,4.245702,0.0,0.102897
5,5.203,3.809929,0.0,0.107938
6,5.203,3.584705,0.0,0.103578


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

TrainOutput(global_step=150, training_loss=4.893227450052897, metrics={'train_runtime': 720.0194, 'train_samples_per_second': 6.6, 'train_steps_per_second': 0.208, 'total_flos': 2893767895941120.0, 'train_loss': 4.893227450052897, 'epoch': 6.0})

Generate Predictions

In [None]:
def generate_predictions(prompts, batch_size=8):
    """Generate one model response per prompt.

    Args:
        prompts: Iterable of already-preprocessed prompt strings.
        batch_size: Number of prompts sent through the model at once. Working
            in small batches keeps memory use bounded for large prompt lists.

    Returns:
        A list of decoded strings (special tokens removed), in the same order
        as ``prompts``. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        Switches the model to evaluation mode as a side effect.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model_to_use = model.module if isinstance(model, torch.nn.DataParallel) else model
    model_to_use.eval()  # make sure dropout is off while generating
    device = next(model_to_use.parameters()).device

    prompts = list(prompts)
    predictions = []
    for start in range(0, len(prompts), batch_size):
        batch_prompts = prompts[start:start + batch_size]
        inputs = tokenizer(
            ["summarize: " + text for text in batch_prompts],
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        ).to(device)

        with torch.no_grad():
            outputs = model_to_use.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=128
            )
        predictions.extend(
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        )
    return predictions


Prompt for Testing

In [None]:
# Smoke test on a single hand-written prompt.
custom_prompt = "The patient presents with a persistent cough and shortness of breath. What is the likely diagnosis and recommended clinical action?"

# Apply the same cleaning the training prompts went through, then generate.
custom_prompt_clean = preprocess_text(custom_prompt)
test_output = generate_predictions([custom_prompt_clean])

# Show the original (uncleaned) prompt next to the model's answer.
print("\n🧾 Prompt:", custom_prompt, sep="\n")
print("\n💡 Model's Response:", test_output[0], sep="\n")



🧾 Prompt:
The patient presents with a persistent cough and shortness of breath. What is the likely diagnosis and recommended clinical action?

💡 Model's Response:
patient presents with persistent cough and shortness of breath what is the likely diagnosis and recommended clinical action .


Save Predictions

In [None]:
# Predict a response for every test prompt and store it on the test frame.
test_df['Clinician'] = generate_predictions(list(test_df['Prompt']))

# Write only the id and prediction columns to the submission file.
test_df.loc[:, ['Master_Index', 'Clinician']].to_csv('submission.csv', index=False)
print("✅ Training complete and predictions saved to 'submission.csv'")


✅ Training complete and predictions saved to 'submission.csv'
