# Kirsten Mayland - Final Project

---

Kirsten Mayland (kirsten.r.mayland.25@dartmouth.edu) <br>
Dartmouth College, CS72, Winter 2025

Purpose: To train an LLM on the r/AskDocs database I generated, and ask it to produce releavant followup questions

In [1]:
# overview
model_name = "google/flan-t5-small"
results_csv = "CS72_FinalProject_flan_t5_small_Results.csv"

## Set Up

---



In [2]:
%pip install datasets torch transformers
# %pip install -U bitsandbytes

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from huggingface_hub import login
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# login(token="")

## Prepare LLM

---



In [5]:
# # Load the model
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype="auto", # quantization_config=bnb_config,
#     low_cpu_mem_usage = True
#     )
# and tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Prepare model for QLoRA training
# model = prepare_model_for_kbit_training(model)

# # Define LoRA configuration
# lora_config = LoraConfig(
#     r=8,  # Rank of low-rank update matrices
#     lora_alpha=32,  # Scaling factor
#     target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
#     lora_dropout=0.05,  # Dropout for regularization
#     bias="none"
# )

# # Wrap model with QLoRA adapters
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()  # Verify trainable parameters

## Prepare Data for LLM

---



In [7]:
# load your CSV file (loaded into colab in sidebar)
df = pd.read_csv("/content/CS72_FinalProject_Reddit_AskDocs_Dataset.csv", on_bad_lines='skip')

# Create a list of formatted dictionaries
data = []
for _, row in df.iterrows():
  # messages = [
  #   {"role": "system", "content": "You are a medical professional who asks follow-up questions to get more relevant information before answering a client’s medical questions. You are concise. You do not provide medical advice. You do not answer medical questions. You only ask questions."},
  #   {"role": "user", "content": f"Please ask follow up questions to these posts. Do not answer questions or provide advice. Only ask questions. Post Title: {row['Title']}. Post: {row['Post']}"},
  #   {"role": "assistant", "content": row["Follow-Up Questions"]}  # Expected model output
  # ]
  # text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

  input_text = f"Please ask follow up questions to these posts. Your goal is to prompt them to provide more relevant medical information that they might have forgotten to add. Post Title: {row['Title']}. Post: {row['Post']}"
  response = row["Follow-Up Questions"]  # Expected model output
  data.append({"prompt": input_text, "response": response})

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(pd.DataFrame(data))

# Split the dataset into 80% tuning and 20% testing
split_dataset = dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [8]:
# tokenize the dataset so it can be used for training
def preprocess_function(examples):
  # assistant_marker = "<|im_start|>assistant\n"
  # new_inputs = []
  # new_targets = []

  # for prompt in examples["prompt"]:
  #   # Check if the assistant marker is in the prompt
  #   if assistant_marker in prompt:
  #     # Split the prompt at the assistant marker
  #     input_part, assistant_part = prompt.split(assistant_marker, 1)
  #     # Optionally, include the assistant marker in the input as a cue
  #     new_input = input_part.strip()
  #     new_target = assistant_marker + assistant_part
  #   else:
  #     new_input = prompt
  #     new_target = prompt

  #   new_inputs.append(new_input)
  #   new_targets.append(new_target)

  # # Tokenize the input and target separately
  # model_inputs = tokenizer(new_inputs, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
  # labels = tokenizer(new_targets, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

  # # Replace pad tokens in labels with -100 so that they are ignored in the loss computation.
  # labels["input_ids"] = [
  #     [token if token != tokenizer.pad_token_id else -100 for token in label]
  #     for label in labels["input_ids"]
  # ]

  inputs = examples["prompt"]
  targets = examples["response"]
  model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
  labels = tokenizer(targets, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

  # input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")


# Apply tokenization
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["prompt", "response"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["prompt", "response"])

Map:   0%|          | 0/44709 [00:00<?, ? examples/s]

Map:   0%|          | 0/11178 [00:00<?, ? examples/s]

## Pre-Training Testing

---



In [9]:
from google.colab import files

def generate_follow_up(title, post):
    # Combine the title and post as input for the model
    # messages = [
    # {"role": "system", "content": "You are a medical professional who asks follow-up questions to get more relevant information before answering a client’s medical questions. You are concise. You do not provide medical advice. You do not answer medical questions. You only ask questions."},
    # {"role": "user", "content": f"Please ask follow up questions to these posts. Do not answer questions or provide advice. Only ask questions. Post Title: {title}. Post: {post}"}
    # ]
    # text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    input_text = f"Please ask follow up questions to these posts. Your goal is to prompt them to provide more relevant medical information that they might have forgotten to add. Post Title: {row['Title']}. Post: {row['Post']}"

    # tokenize and generate response
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    #(Optional) Move inputs to GPU if available:
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate follow-up questions
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, top_k=50, top_p=0.9, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # If the response starts with 'assistant:', remove it and stuff before
    # if "assistant" in response:
    #   response = response.split("assistant")[1].strip()
    #   if not response:
    #     response = "No follow-up questions generated."
    # else:
    #   response = "No follow-up questions generated."

    # print("Title:", title)
    # print("Post:", post)
    print("Generated:", response)
    print("-" * 40)
    return response


In [10]:
# Load CSV
eval_df = pd.read_csv("/content/CS72_FinalProject_EvalDataset.csv", on_bad_lines='skip')
eval_df = eval_df.dropna().reset_index(drop=True)
eval_df['Pre-training Results'] = eval_df.apply(lambda row: generate_follow_up(row['Title'], row['Post']), axis=1)

Generated: What are the results of the retest?
----------------------------------------
Generated: Is there anything else I can do for you?
----------------------------------------
Generated: AST and ALT blood tests
----------------------------------------
Generated: Is this normal?
----------------------------------------
Generated: Is there anything else I can do for you?
----------------------------------------
Generated: How can I help?
----------------------------------------
Generated: Are my AST and ALT still elevated?
----------------------------------------
Generated: Is this normal?
----------------------------------------
Generated: What is the reason for the elevated LFT's?
----------------------------------------
Generated: What are the symptoms of elevated LFT’s?
----------------------------------------
Generated: What is your name?
----------------------------------------
Generated: How long have you been having elevated LFT’s?
----------------------------------------
Ge

## Fine-Tune LLM

---



In [11]:
# fine-tune the model
training_args = TrainingArguments(
  output_dir="./results",
  learning_rate=2e-5,
  per_device_train_batch_size=8,  # Adjust batch size as needed based on RAM
  gradient_accumulation_steps=2, # Accumulate more steps  # max_steps=500, # Fewer training steps  # optim="adamw_32bit",
  num_train_epochs=1,
  weight_decay=0.01,
  report_to="none",  # Avoids unnecessary logging issues
  fp16=False,  # Disable fp16 for CPU
  bf16=True,  # Disable bfloat16 on CPU
  eval_strategy="no",  # Avoid storing evaluation results in memory
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
    )

# Fine-tune the model
trainer.train()

# After training, save your model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 12.88 MiB is free. Process 9629 has 39.54 GiB memory in use. Of the allocated memory 39.01 GiB is allocated by PyTorch, and 33.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Post-Training Testing

---



In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure the model is on the right device

model.generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
# model = AutoModelForCausalLM.from_pretrained("./fine_tuned_Qwen", torch_dtype="auto", device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_Qwen")

# Assuming test_dataset is already created from your CSV split and contains a "prompt" field.
# for example in test_dataset.select(range(5)):  # Test on 5 examples; adjust as needed
#   input_text = example["prompt"]

#   inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

#   #(Optional) Move inputs to GPU if available:
#   inputs = {k: v.to(model.device) for k, v in inputs.items()}

#   # Generate follow-up questions
#   outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, top_k=50, top_p=0.9, temperature=0.9)
#   response = tokenizer.decode(outputs[0], skip_special_tokens=True)

#   # If the response starts with 'assistant:', remove it and stuff before
#   if "assistant" in response:
#     response = response.split("assistant")[1].strip()

#   print("Generated:", response)
#   print("Expected Response:", example["response"])
#   print("-" * 40)

In [None]:
# Generate follow-up questions and add them to a new column
eval_df['Post Training Results'] = eval_df.apply(lambda row: generate_follow_up(row['Title'], row['Post']), axis=1)

# Save the updated DataFrame to a new CSV file and save
eval_df.to_csv("/content/" + results_csv, index=False, encoding="utf-8-sig")

In [None]:
files.download(results_csv)
print(f"Generated follow-up questions and saved to '{results_csv}'")