In [7]:
import pandas as pd
import tqdm
from datasets import Dataset
from datasets import DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer, pipeline
from transformers import AutoConfig, AutoModelForSeq2SeqLM, EncoderDecoderModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, set_seed

In [8]:
def clean_text(text):
  """Normalise a piece of text for tokenization.

  Lowercases the string and then trims leading and trailing whitespace.
  Whitespace inside the text (for example the "\\r\\n" separators between
  dialogue turns) is left as it is.

  Args:
    text: The string to normalise.

  Returns:
    The lowercased string without surrounding whitespace.
  """
  lowered = text.lower()
  return lowered.strip()


In [9]:
# Load the MTS-Dialog automatic-summaries validation set (correlation study);
# the path is relative to the directory the notebook is run from.
data = pd.read_csv(
    "../MTS-Dialog-main/MTS-Dialog-main/Correlation-Study/MTS-Dialog-Automatic-Summaries-ValidationSet.csv"
)

# Preview the first rows to check the columns that were read.
data.head()

Unnamed: 0,ID,Dialogue,Reference Summary,Automatic Summary
0,0,Doctor: When did your pain begin? \r\nPatient:...,"The patient is a 26-year-old female, referred ...",The patient is a 26-year-old female who has ha...
1,1,"Doctor: Hey, bud. What brings you in today? \r...",As mentioned denies any oropharyngeal swelling...,Rash on the upper arms and torso.
2,2,Doctor: Has anything changed in your medical h...,Essentially unchanged from her visit of 04/15/...,Noncontributory.
3,3,Doctor: How've you been treating your acne? \r...,Accutane.,Accutane.
4,4,Doctor: Have you been experiencing any mental ...,Confusion and hallucinations.,Noncontributory.


In [None]:
def prepare_data(data, text_column="Dialogue", summary_column="Reference Summary",
                 tokenizer_name="bert-base-uncased", max_input_length=512,
                 max_target_length=128, seed=42):
  """Clean, tokenize and split dialogue/summary pairs for seq2seq fine-tuning.

  The dialogue is tokenized as the model input and the summary is tokenized
  separately as the target, so the target text never appears in the input.
  Padding positions in the labels are set to -100, the index that the
  Hugging Face loss functions ignore.

  Args:
    data: A pandas DataFrame, or a path to a CSV file that is read with
      `pd.read_csv`. A DataFrame passed in is not modified.
    text_column: Name of the column holding the source dialogue.
    summary_column: Name of the column holding the target summary.
    tokenizer_name: Hugging Face tokenizer to load. It should match the model
      that will be fine-tuned on the returned ids.
    max_input_length: Length the tokenized dialogues are padded/truncated to.
    max_target_length: Length the tokenized summaries are padded/truncated to.
    seed: Random state used for both splits.

  Returns:
    A 6-tuple of lists
    `(train_input_ids, train_attention_mask, train_labels,
      val_input_ids, val_attention_mask, val_labels)`,
    each holding one list of token ids per example.

  Raises:
    KeyError: If `text_column` or `summary_column` is not in the data.
    ValueError: If no row has both a dialogue and a summary.
  """
  # Accept a CSV path as well as an already-loaded frame.
  if not isinstance(data, pd.DataFrame):
    data = pd.read_csv(data)

  missing = [col for col in (text_column, summary_column) if col not in data.columns]
  if missing:
    raise KeyError(
        f"Column(s) {missing} not found; available columns: {list(data.columns)}"
    )

  # Select into a new frame so the caller's DataFrame is left untouched, and
  # drop rows where either side is missing (NaN has no .lower()).
  pairs = data[[text_column, summary_column]].dropna()
  dialogues = [clean_text(text) for text in pairs[text_column].astype(str)]
  summaries = [clean_text(text) for text in pairs[summary_column].astype(str)]
  if not dialogues:
    raise ValueError("No rows with both a dialogue and a summary to prepare.")

  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

  # Batch-tokenize each side on its own; passing both texts to a single
  # tokenizer call would encode them as one sentence pair instead.
  model_inputs = tokenizer(
      dialogues, max_length=max_input_length, padding="max_length", truncation=True
  )
  targets = tokenizer(
      summaries, max_length=max_target_length, padding="max_length", truncation=True
  )

  # Mask padding in the labels so it does not contribute to the loss.
  pad_id = tokenizer.pad_token_id
  labels = [
      [token if token != pad_id else -100 for token in sequence]
      for sequence in targets["input_ids"]
  ]

  # Split row positions rather than the encodings so the three parallel lists
  # stay aligned. 20% is held out as a test portion (not returned, the return
  # signature only carries train/validation); the remaining 80% is split
  # 75/25, giving 60% train and 20% validation overall.
  positions = list(range(len(dialogues)))
  train_val_idx, _test_idx = train_test_split(positions, test_size=0.2, random_state=seed)
  train_idx, val_idx = train_test_split(train_val_idx, test_size=0.25, random_state=seed)

  def take(rows, idx):
    return [rows[i] for i in idx]

  train_input_ids = take(model_inputs["input_ids"], train_idx)
  train_attention_mask = take(model_inputs["attention_mask"], train_idx)
  train_labels = take(labels, train_idx)

  val_input_ids = take(model_inputs["input_ids"], val_idx)
  val_attention_mask = take(model_inputs["attention_mask"], val_idx)
  val_labels = take(labels, val_idx)

  return train_input_ids, train_attention_mask, train_labels, val_input_ids, val_attention_mask, val_labels

# Fine-tuning script
def _load_seq2seq_model(model_name, tokenizer):
  """Load `model_name` as a sequence-to-sequence model.

  Checkpoints that are already encoder-decoder models are loaded directly.
  Encoder-only checkpoints such as BERT are warm-started as both encoder and
  decoder of an EncoderDecoderModel, with the special-token ids generation
  needs taken from `tokenizer`.
  """
  config = AutoConfig.from_pretrained(model_name)
  if config.is_encoder_decoder:
    return AutoModelForSeq2SeqLM.from_pretrained(model_name)

  model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)
  # An EncoderDecoderModel built this way has no generation defaults; without
  # a decoder start token it can neither shift the labels nor generate.
  model.config.decoder_start_token_id = tokenizer.cls_token_id
  model.config.eos_token_id = tokenizer.sep_token_id
  model.config.pad_token_id = tokenizer.pad_token_id
  model.config.vocab_size = model.config.decoder.vocab_size
  return model


def _build_dataset(input_ids, attention_mask, labels):
  """Pack parallel per-example sequences into a `datasets.Dataset`."""
  return Dataset.from_dict({
      "input_ids": list(input_ids),
      "attention_mask": list(attention_mask),
      "labels": list(labels),
  })


def fine_tune_bert(model_name, data_path, output_dir):
  """Fine-tune a model to summarise dialogues and save it to `output_dir`.

  Args:
    model_name: Hugging Face checkpoint to start from. Encoder-only
      checkpoints (e.g. "bert-base-uncased") are wrapped in an
      encoder-decoder model.
    data_path: Path to the CSV file with the dialogue/summary pairs.
    output_dir: Directory for checkpoints and for the final model/tokenizer.

  Returns:
    None. The trained model and tokenizer are written to `output_dir`.
  """
  # prepare_data works on a DataFrame, so read the CSV here.
  data = pd.read_csv(data_path)
  (train_input_ids, train_attention_mask, train_labels,
   val_input_ids, val_attention_mask, val_labels) = prepare_data(data)

  # Seq2SeqTrainer needs the seq2seq arguments class; predict_with_generate
  # makes evaluation hand generated token ids (not logits) to compute_metrics.
  training_args = Seq2SeqTrainingArguments(
      output_dir=output_dir,
      per_device_train_batch_size=8,  # Adjust batch size based on GPU memory
      num_train_epochs=3,  # Adjust epochs as needed
      # NOTE(review): newer transformers releases rename this to
      # `eval_strategy` - confirm against the installed version.
      evaluation_strategy="epoch",
      save_steps=500,
      save_total_limit=2,
      predict_with_generate=True,
      # Let generated summaries be as long as the prepared targets.
      generation_max_length=len(train_labels[0]),
  )

  # Seed before the model is built so freshly initialised weights are
  # reproducible as well.
  set_seed(training_args.seed)

  # NOTE(review): prepare_data tokenizes with its own tokenizer; it is assumed
  # to share a vocabulary with `model_name` - confirm when changing models.
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = _load_seq2seq_model(model_name, tokenizer)

  train_dataset = _build_dataset(train_input_ids, train_attention_mask, train_labels)
  eval_dataset = _build_dataset(val_input_ids, val_attention_mask, val_labels)

  # NOTE(review): `load_metric` is not available in recent `datasets`
  # releases, where the `evaluate` package replaces it - confirm against the
  # installed version. Loaded once here instead of on every evaluation.
  from datasets import load_metric
  rouge = load_metric("rouge")

  def decode(batch):
    # -100 marks ignored label positions and is not a valid token id, so map
    # it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    token_ids = [
        [int(token) if token >= 0 else pad_id for token in sequence]
        for sequence in batch
    ]
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

  def compute_metrics(pred):
    predictions = pred.predictions
    if isinstance(predictions, tuple):
      predictions = predictions[0]
    # ROUGE compares text, so decode ids to strings first.
    scores = rouge.compute(
        predictions=decode(predictions),
        references=decode(pred.label_ids),
        use_stemmer=True,
    )
    # Report one float per ROUGE variant so the Trainer can log it.
    return {
        name: (score.mid.fmeasure if hasattr(score, "mid") else float(score))
        for name, score in scores.items()
    }

  trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      compute_metrics=compute_metrics,
  )

  # Train the model
  trainer.train()

  # Persist the final weights and the tokenizer; periodic checkpoints alone
  # may never be written when the run has fewer than `save_steps` steps.
  trainer.save_model(output_dir)
  tokenizer.save_pretrained(output_dir)

# Example usage
# Point at the MTS-Dialog CSV that this notebook loads in an earlier cell; a
# placeholder path would make this cell fail with FileNotFoundError.
data_path = "../MTS-Dialog-main/MTS-Dialog-main/Correlation-Study/MTS-Dialog-Automatic-Summaries-ValidationSet.csv"
output_dir = "fine_tuned_bert_model"
model_name = "bert-base-uncased"  # Consider BioBERT or BioMedBERT for medical text

fine_tune_bert(model_name, data_path, output_dir)

print("Fine-tuning completed. Model saved in", output_dir)