In [18]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m289.9 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [19]:
import numpy as np
import pandas as pd
import transformers
import torch
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score

In [2]:
df_train = pd.read_csv("preprocessed_data/train.csv")
df_test = pd.read_csv("preprocessed_data/test.csv")
df_val = pd.read_csv("preprocessed_data/val.csv")

In [3]:
def prepare_data(df):
    df['text'] = df['transcription'] + " [SEP] " + df['description']
    return df['text'].tolist()

train_texts = prepare_data(df_train)
val_texts = prepare_data(df_val)
test_texts = prepare_data(df_test)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Verify that the pad token is set correctly
print("Pad token set to:", tokenizer.pad_token)

train_encodings = tokenizer(train_texts, truncation=True, padding="longest", max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding="longest", max_length=512)



Pad token set to: <|endoftext|>


In [5]:
class Medical_dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Return input_ids as labels for model training
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

train_dataset = Medical_dataset(train_encodings)
val_dataset = Medical_dataset(val_encodings)

In [None]:
model_before = GPT2LMHeadModel.from_pretrained('FinancialSupport/gpt2-ft-medical-qa')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_before.to(device)

In [None]:
model = GPT2LMHeadModel.from_pretrained('FinancialSupport/gpt2-ft-medical-qa')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


In [None]:
model.eval()

In [20]:
def generate_description(transcription):
    input_text = transcription
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Check if input_ids exceed the model's vocab size
    if torch.max(input_ids) >= tokenizer.vocab_size:
        raise ValueError("Input IDs contain indices outside the model's vocabulary size.")
    
    # Add attention mask creation
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)

    # Adjust max_length if necessary
    max_length = min(4096, model_before.config.n_positions)

    output = model_before.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)
    description = tokenizer.decode(output[0], skip_special_tokens=True)
    description = description.split("[SEP]")[-1].strip()
    return description

# Select the first 3 rows of the validation set
df_val_sample = df_val.head(3).copy()

# Generate descriptions for the first 3 rows of the validation set
df_val_sample['generated_description'] = df_val_sample['transcription'].apply(generate_description)

# Calculate BLEU scores
def calculate_bleu_scores(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothing_function = SmoothingFunction().method1
    bleu_1 = sentence_bleu(reference, candidate, weights=(1.0, 0.0, 0.0, 0.0), smoothing_function=smoothing_function)
    bleu_2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=smoothing_function)
    bleu_3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0.0), smoothing_function=smoothing_function)
    bleu_4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)
    return bleu_1, bleu_2, bleu_3, bleu_4

# Calculate ROUGE scores
def calculate_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores['rouge1'].fmeasure, scores['rouge2'].fmeasure, scores['rougeL'].fmeasure

# Calculate BERTScore
def calculate_bertscore(reference, candidate):
    P, R, F1 = bert_score([candidate], [reference], lang='en', rescale_with_baseline=True)
    return P.mean().item(), R.mean().item(), F1.mean().item()

# Apply all calculations
df_val_sample[['bleu_1', 'bleu_2', 'bleu_3', 'bleu_4']] = df_val_sample.apply(
    lambda row: calculate_bleu_scores(row['description'], row['generated_description']),
    axis=1, result_type='expand'
)

df_val_sample[['rouge_1', 'rouge_2', 'rouge_L']] = df_val_sample.apply(
    lambda row: calculate_rouge(row['description'], row['generated_description']),
    axis=1, result_type='expand'
)

df_val_sample[['bert_precision', 'bert_recall', 'bert_f1']] = df_val_sample.apply(
    lambda row: calculate_bertscore(row['description'], row['generated_description']),
    axis=1, result_type='expand'
)

# Print the scores for the first 3 rows
print(df_val_sample[['transcription', 'description', 'generated_description', 'bleu_1', 'bleu_2', 'bleu_3', 'bleu_4', 'rouge_1', 'rouge_2', 'rouge_L', 'bert_precision', 'bert_recall', 'bert_f1']])

# Print the average scores for the first 3 rows
average_bleu_1 = df_val_sample['bleu_1'].mean()
average_bleu_2 = df_val_sample['bleu_2'].mean()
average_bleu_3 = df_val_sample['bleu_3'].mean()
average_bleu_4 = df_val_sample['bleu_4'].mean()
average_rouge_1 = df_val_sample['rouge_1'].mean()
average_rouge_2 = df_val_sample['rouge_2'].mean()
average_rouge_L = df_val_sample['rouge_L'].mean()
average_bert_precision = df_val_sample['bert_precision'].mean()
average_bert_recall = df_val_sample['bert_recall'].mean()
average_bert_f1 = df_val_sample['bert_f1'].mean()

print(f"Average BLEU-1 score: {average_bleu_1}")
print(f"Average BLEU-2 score: {average_bleu_2}")
print(f"Average BLEU-3 score: {average_bleu_3}")
print(f"Average BLEU-4 score: {average_bleu_4}")
print(f"Average ROUGE-1 score: {average_rouge_1}")
print(f"Average ROUGE-2 score: {average_rouge_2}")
print(f"Average ROUGE-L score: {average_rouge_L}")
print(f"Average BERT Precision: {average_bert_precision}")
print(f"Average BERT Recall: {average_bert_recall}")
print(f"Average BERT F1: {average_bert_f1}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                       transcription  \
0  history of present illness:  ,the patient is a...   
1  hx: ,this 46y/o rhm with htn was well until 2 ...   
2  title of operation: , placement of right new v...   

                                         description  \
0   a woman presenting to our clinic for the firs...   
1   patient with sudden onset dizziness and rue c...   
2   placement of right new ventriculoperitoneal (...   

                               generated_description    bleu_1    bleu_2  \
0  history of present illness: ,the patient is a ...  0.050000  0.048663   
1  hx:,this 46y/o rhm with htn was well until 2 w...  0.011852  0.009377   
2  title of operation:, placement of right new ve...  0.021592  0.020921   

     bleu_3    bleu_4   rouge_1   rouge_2   rouge_L  bert_precision  \
0  0.048775  0.045940  0.093434  0.088608  0.093434       -0.293104   
1  0.008453  0.006950  0.027064  0.016282  0.024357       -0.436072   
2  0.021045  0.019549  0.044213 

In [10]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=2,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy="epoch",     # evaluate each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,2.134238
2,2.771300,1.954901
3,2.129300,1.912344




TrainOutput(global_step=1491, training_loss=2.3001555066073522, metrics={'train_runtime': 909.5495, 'train_samples_per_second': 13.101, 'train_steps_per_second': 1.639, 'total_flos': 3113555853312000.0, 'train_loss': 2.3001555066073522, 'epoch': 3.0})

In [21]:
def generate_description(transcription):
    input_text = transcription
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Check if input_ids exceed the model's vocab size
    if torch.max(input_ids) >= tokenizer.vocab_size:
        raise ValueError("Input IDs contain indices outside the model's vocabulary size.")
    
    # Add attention mask creation
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)

    # Adjust max_length if necessary
    max_length = min(4096, model.config.n_positions)

    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)
    description = tokenizer.decode(output[0], skip_special_tokens=True)
    description = description.split("[SEP]")[-1].strip()
    return description

# Select the first 3 rows of the validation set
df_val_sample = df_val.head(3).copy()

# Generate descriptions for the first 3 rows of the validation set
df_val_sample['generated_description'] = df_val_sample['transcription'].apply(generate_description)

# Calculate BLEU scores
def calculate_bleu_scores(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothing_function = SmoothingFunction().method1
    bleu_1 = sentence_bleu(reference, candidate, weights=(1.0, 0.0, 0.0, 0.0), smoothing_function=smoothing_function)
    bleu_2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=smoothing_function)
    bleu_3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0.0), smoothing_function=smoothing_function)
    bleu_4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)
    return bleu_1, bleu_2, bleu_3, bleu_4

# Calculate ROUGE scores
def calculate_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores['rouge1'].fmeasure, scores['rouge2'].fmeasure, scores['rougeL'].fmeasure

# Calculate BERTScore
def calculate_bertscore(reference, candidate):
    P, R, F1 = bert_score([candidate], [reference], lang='en', rescale_with_baseline=True)
    return P.mean().item(), R.mean().item(), F1.mean().item()

# Apply all calculations
df_val_sample[['bleu_1', 'bleu_2', 'bleu_3', 'bleu_4']] = df_val_sample.apply(
    lambda row: calculate_bleu_scores(row['description'], row['generated_description']),
    axis=1, result_type='expand'
)

df_val_sample[['rouge_1', 'rouge_2', 'rouge_L']] = df_val_sample.apply(
    lambda row: calculate_rouge(row['description'], row['generated_description']),
    axis=1, result_type='expand'
)

df_val_sample[['bert_precision', 'bert_recall', 'bert_f1']] = df_val_sample.apply(
    lambda row: calculate_bertscore(row['description'], row['generated_description']),
    axis=1, result_type='expand'
)

# Print the scores for the first 3 rows
print(df_val_sample[['transcription', 'description', 'generated_description', 'bleu_1', 'bleu_2', 'bleu_3', 'bleu_4', 'rouge_1', 'rouge_2', 'rouge_L', 'bert_precision', 'bert_recall', 'bert_f1']])

# Print the average scores for the first 3 rows
average_bleu_1 = df_val_sample['bleu_1'].mean()
average_bleu_2 = df_val_sample['bleu_2'].mean()
average_bleu_3 = df_val_sample['bleu_3'].mean()
average_bleu_4 = df_val_sample['bleu_4'].mean()
average_rouge_1 = df_val_sample['rouge_1'].mean()
average_rouge_2 = df_val_sample['rouge_2'].mean()
average_rouge_L = df_val_sample['rouge_L'].mean()
average_bert_precision = df_val_sample['bert_precision'].mean()
average_bert_recall = df_val_sample['bert_recall'].mean()
average_bert_f1 = df_val_sample['bert_f1'].mean()

print(f"Average BLEU-1 score: {average_bleu_1}")
print(f"Average BLEU-2 score: {average_bleu_2}")
print(f"Average BLEU-3 score: {average_bleu_3}")
print(f"Average BLEU-4 score: {average_bleu_4}")
print(f"Average ROUGE-1 score: {average_rouge_1}")
print(f"Average ROUGE-2 score: {average_rouge_2}")
print(f"Average ROUGE-L score: {average_rouge_L}")
print(f"Average BERT Precision: {average_bert_precision}")
print(f"Average BERT Recall: {average_bert_recall}")
print(f"Average BERT F1: {average_bert_f1}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predict

                                       transcription  \
0  history of present illness:  ,the patient is a...   
1  hx: ,this 46y/o rhm with htn was well until 2 ...   
2  title of operation: , placement of right new v...   

                                         description  \
0   a woman presenting to our clinic for the firs...   
1   patient with sudden onset dizziness and rue c...   
2   placement of right new ventriculoperitoneal (...   

                               generated_description    bleu_1    bleu_2  \
0  history of present illness: ,the patient is a ...  0.048942  0.047632   
1  hx:,this 46y/o rhm with htn was well until 2 w...  0.010710  0.008472   
2  placement of right new ventriculoperitoneal (v...  1.000000  1.000000   

     bleu_3    bleu_4   rouge_1   rouge_2   rouge_L  bert_precision  \
0  0.047752  0.044965  0.091584  0.086849  0.091584       -0.293104   
1  0.007644  0.006279  0.022699  0.012642  0.022699       -0.436072   
2  1.000000  1.000000  1.000000 