In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import pandas as pd

# ----------------- 1. Configuration -----------------
MODEL_NAME = "gpt2"
DATASET_NAME = "LinhDuong/chatdoctor-200k"
OUTPUT_DIR = "./chatdoctor_gpt2_finetuned"
MAX_LENGTH = 512  # Maximum sequence length for the model
TRAIN_RATIO = 0.9  # 90% for training, 10% for evaluation

# ----------------- 2. Load Dataset and Tokenizer -----------------
print(f"Loading dataset: {DATASET_NAME}")
# The dataset is a JSON file and might require specifying the 'train' split
dataset = load_dataset(DATASET_NAME, split='train')

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-2 does not have a padding token by default, which is needed for batch training
tokenizer.pad_token = tokenizer.eos_token

# ----------------- 3. Data Preprocessing and Formatting -----------------

def format_conversation(examples):
    """
    Formats the 'instruction' and 'output' columns into a single continuous text string
    suitable for Causal Language Modeling (CLM).
    We use a special token (like tokenizer.eos_token) to separate conversations.
    """
    # Define conversational separators
    BOS = tokenizer.bos_token if tokenizer.bos_token else "<|startoftext|>"
    EOS = tokenizer.eos_token

    full_texts = []
    for instruction, output in zip(examples['instruction'], examples['output']):
        # Format the conversation in a continuous stream:
        # [BOS] Instruction: <user instruction> Assistant: <model response> [EOS]
        text = f"{BOS}Instruction: {instruction}\nAssistant: {output}{EOS}"
        full_texts.append(text)
    return {"text": full_texts}

# Apply the formatting function
dataset = dataset.map(format_conversation, batched=True, remove_columns=['instruction', 'output'])

# Tokenize the formatted text
def tokenize_function(examples):
    # This tokenization handles truncation and padding
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

# ----------------- 4. Split Data for Training and Evaluation -----------------
# Determine the number of training samples
train_size = int(TRAIN_RATIO * len(tokenized_dataset))
eval_size = len(tokenized_dataset) - train_size

# Split the dataset
tokenized_dataset = tokenized_dataset.shuffle(seed=42)
train_dataset = tokenized_dataset.select(range(train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

print(f"Training on {len(train_dataset)} samples, Evaluating on {len(eval_dataset)} samples.")

# ----------------- 5. Load Model and Data Collator -----------------
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Data collator prepares batches of data for the model (specifically for CLM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # mlm=False is crucial for Causal Language Modeling (GPT-style training)
)

# ----------------- 6. Define Training Arguments and Trainer -----------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,                  # Number of epochs to run
    per_device_train_batch_size=4,       # Batch size per GPU/CPU for training
    per_device_eval_batch_size=4,
    max_steps=1000,# Batch size per GPU/CPU for evaluation
    warmup_steps=500,                    # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # Strength of weight decay
    logging_dir='./logs',
    logging_steps=500,
    eval_strategy="epoch",         # Evaluate at the end of each epoch
    save_strategy="epoch",               # Save checkpoint at the end of each epoch
    fp16=torch.cuda.is_available()       # Use 16-bit precision if a CUDA GPU is available
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# ----------------- 7. Start Training -----------------
print("\n" + "="*50)
print("             STARTING FINE-TUNING")
print("="*50 + "\n")

trainer.train()

# ----------------- 8. Save the Final Model -----------------
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\nModel saved to: {OUTPUT_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset: LinhDuong/chatdoctor-200k
Loading tokenizer: gpt2


Map: 100%|██████████| 207408/207408 [00:01<00:00, 160525.21 examples/s]
Map: 100%|██████████| 207408/207408 [00:26<00:00, 7843.99 examples/s]


Training on 186667 samples, Evaluating on 20741 samples.

             STARTING FINE-TUNING



`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
0,2.6875,2.55492



Model saved to: ./chatdoctor_gpt2_finetuned


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Directory where your model and tokenizer were saved
MODEL_PATH = "./chatdoctor_gpt2_finetuned"

# 1. Load the fine-tuned model and tokenizer
print(f"Loading model and tokenizer from: {MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Set model to evaluation mode
model.eval()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2. Define the Test Prompt
# IMPORTANT: The prompt must follow the exact structure used during training.
# Training Format: [BOS]Instruction: <user instruction>\nAssistant: <model response>[EOS]
# Test Prompt: [BOS]Instruction: <user instruction>\nAssistant:

TEST_INSTRUCTION = "I have a sore throat and a fever of 101 degrees Fahrenheit. What should I do?"
PROMPT = f"{tokenizer.bos_token}Instruction: {TEST_INSTRUCTION}\nAssistant: "

# 3. Tokenize the input prompt
input_ids = tokenizer.encode(PROMPT, return_tensors='pt').to(device)

# 4. Generate the response
print("\n--- Generating Response ---")
print(f"PROMPT: {TEST_INSTRUCTION}")

with torch.no_grad():
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=200,          # Max length of the *total* sequence (prompt + response)
        temperature=0.7,         # Controls randomness (lower = more deterministic)
        top_k=50,                # Filters out low-probability words
        top_p=0.95,              # Filters based on cumulative probability
        repetition_penalty=1.2,  # Discourages repeating words
        do_sample=True,          # Enables sampling (creative generation)
        pad_token_id=tokenizer.eos_token_id,
        # Stop generation when the End-of-Sequence token is produced
        eos_token_id=tokenizer.eos_token_id,
    )

# 5. Decode and format the result
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

# The model will generate the full prompt plus the answer.
# We clean the output to show only the model's generated response.
response = generated_text.replace(PROMPT.replace(tokenizer.bos_token, ""), "").strip()

print("\n--- Generated Answer ---")
print(response)

Loading model and tokenizer from: ./chatdoctor_gpt2_finetuned


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generating Response ---
PROMPT: I have a sore throat and a fever of 101 degrees Fahrenheit. What should I do?

--- Generated Answer ---
Go for an ultrasound to rule out any infection or other possible cause that could be causing your pain, then consult with the doctor regarding surgery if necessary as this will help you in resolving it further so get well soon! Take care Chat Doctor . Regards Thank You ! Hope my answer was helpful please feel free take Care!! Best wishes!!! Wishing good health from yourself & family Thanks...Regards - Tania Mankiwa-Jaye/Treatment Specialist at B&H Medical Center In Chennai.... Get Well Soon Asap Consultant after consulting physician again ASAP If needed go ahead With Surgery Now It is advisable That This Is All Natural To Avoid Infection Once Its resolved Please Do Not Let Your Child Bleed Or Grown Any Further So Don't Give Up ... Keep Giving Kindly The Advice / Reviewing Questions Dear Drs.,I am JayE Gy


In [3]:
import os

# Define the folder to download and the name for the zip file
FOLDER_TO_DOWNLOAD = "chatdoctor_gpt2_finetuned"
ZIP_FILE_NAME = f"{FOLDER_TO_DOWNLOAD}.zip"

# Create the zip file using the 'zip' command line utility
# The '!' prefix executes a shell command
print(f"Compressing folder: {FOLDER_TO_DOWNLOAD}...")
!zip -r {ZIP_FILE_NAME} {FOLDER_TO_DOWNLOAD}
print(f"Compression complete. Zip file created: {ZIP_FILE_NAME}")

Compressing folder: chatdoctor_gpt2_finetuned...
Compression complete. Zip file created: chatdoctor_gpt2_finetuned.zip


'zip' is not recognized as an internal or external command,
operable program or batch file.
