In [1]:
import numpy as np
import pandas as pd
import transformers
import torch
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
df_train = pd.read_csv("preprocessed_data/train.csv")
df_test = pd.read_csv("preprocessed_data/test.csv")
df_val = pd.read_csv("preprocessed_data/val.csv")

In [3]:
def prepare_data(df):
    df['text'] = df['transcription'] + " [SEP] " + df['description']
    return df['text'].tolist()

train_texts = prepare_data(df_train)
val_texts = prepare_data(df_val)
test_texts = prepare_data(df_test)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Verify that the pad token is set correctly
print("Pad token set to:", tokenizer.pad_token)

train_encodings = tokenizer(train_texts, truncation=True, padding="longest", max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding="longest", max_length=512)



Pad token set to: <|endoftext|>


In [5]:
class Medical_dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Return input_ids as labels for model training
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

train_dataset = Medical_dataset(train_encodings)
val_dataset = Medical_dataset(val_encodings)

In [7]:
model = GPT2LMHeadModel.from_pretrained('FinancialSupport/gpt2-ft-medical-qa')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [9]:
def generate_description(transcription):
    input_text = transcription
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Check if input_ids exceed the model's vocab size
    if torch.max(input_ids) >= tokenizer.vocab_size:
        raise ValueError("Input IDs contain indices outside the model's vocabulary size.")
    
    # Add attention mask creation
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)

    # Adjust max_length if necessary
    max_length = min(4096, model.config.n_positions)

    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)
    description = tokenizer.decode(output[0], skip_special_tokens=True)
    description = description.split("[SEP]")[-1].strip()
    return description

# Select the first 3 rows of the validation set
df_val_sample = df_val.head(3).copy()

# Generate descriptions for the first 3 rows of the validation set
df_val_sample['generated_description'] = df_val_sample['transcription'].apply(generate_description)

# Calculate BLEU score
def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu(reference, candidate, smoothing_function=smoothing_function)

df_val_sample['bleu_score'] = df_val_sample.apply(lambda row: calculate_bleu(row['description'], row['generated_description']), axis=1)

# Print the BLEU scores for the first 3 rows
print(df_val_sample[['transcription', 'description', 'generated_description', 'bleu_score']])

# Print the average BLEU score for the first 3 rows
average_bleu_score = df_val_sample['bleu_score'].mean()
print(f"Average BLEU score on the first 3 rows of the validation set: {average_bleu_score}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


                                       transcription  \
0  history of present illness:  ,the patient is a...   
1  hx: ,this 46y/o rhm with htn was well until 2 ...   
2  title of operation: , placement of right new v...   

                                         description  \
0   a woman presenting to our clinic for the firs...   
1   patient with sudden onset dizziness and rue c...   
2   placement of right new ventriculoperitoneal (...   

                               generated_description  bleu_score  
0  history of present illness: ,the patient is a ...    0.045940  
1  hx:,this 46y/o rhm with htn was well until 2 w...    0.006950  
2  title of operation:, placement of right new ve...    0.019549  
Average BLEU score on the first 3 rows of the validation set: 0.024146234789758


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50260, bias=False)
)

In [10]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=2,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy="epoch",     # evaluate each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,2.134238
2,2.771300,1.954901
3,2.129300,1.912344




TrainOutput(global_step=1491, training_loss=2.3001555066073522, metrics={'train_runtime': 909.5495, 'train_samples_per_second': 13.101, 'train_steps_per_second': 1.639, 'total_flos': 3113555853312000.0, 'train_loss': 2.3001555066073522, 'epoch': 3.0})

In [12]:
def generate_description(transcription):
    input_text = transcription
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Check if input_ids exceed the model's vocab size
    if torch.max(input_ids) >= tokenizer.vocab_size:
        raise ValueError("Input IDs contain indices outside the model's vocabulary size.")
    
    # Add attention mask creation
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)

    # Adjust max_length if necessary
    max_length = min(4096, model.config.n_positions)

    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)
    description = tokenizer.decode(output[0], skip_special_tokens=True)
    description = description.split("[SEP]")[-1].strip()
    return description

# Select the first 3 rows of the validation set
df_val_sample = df_val.head(3).copy()

# Generate descriptions for the first 3 rows of the validation set
df_val_sample['generated_description'] = df_val_sample['transcription'].apply(generate_description)

# Calculate BLEU score
def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu(reference, candidate, smoothing_function=smoothing_function)

df_val_sample['bleu_score'] = df_val_sample.apply(lambda row: calculate_bleu(row['description'], row['generated_description']), axis=1)

# Print the BLEU scores for the first 3 rows
print(df_val_sample[['transcription', 'description', 'generated_description', 'bleu_score']])

# Print the average BLEU score for the first 3 rows
average_bleu_score = df_val_sample['bleu_score'].mean()
print(f"Average BLEU score on the first 3 rows of the validation set: {average_bleu_score}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


                                       transcription  \
0  history of present illness:  ,the patient is a...   
1  hx: ,this 46y/o rhm with htn was well until 2 ...   
2  title of operation: , placement of right new v...   

                                         description  \
0   a woman presenting to our clinic for the firs...   
1   patient with sudden onset dizziness and rue c...   
2   placement of right new ventriculoperitoneal (...   

                               generated_description  bleu_score  
0  history of present illness: ,the patient is a ...    0.044965  
1  hx:,this 46y/o rhm with htn was well until 2 w...    0.006279  
2  placement of right new ventriculoperitoneal (v...    1.000000  
Average BLEU score on the first 3 rows of the validation set: 0.3504146379791368
