# Model Development

## Stage 1

### Dataset Preparation
Download the original data.

In [2]:
! pip install datasets

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Using cached pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl (30.7 MB)
Using cached xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl (30 kB)
Installing collected packages: xxhash, pyarrow, dill, multiprocess, datasets
Successfully installed datasets-3.5.0 dill-0.3.8 multiprocess-0.70.1

In [2]:
from datasets import load_dataset, load_from_disk

In [7]:
dataset = load_dataset(path="UCSD26/medical_dialog", name='processed.en',trust_remote_code=True)

Downloading data: 100%|██████████| 414k/414k [00:00<00:00, 7.62MB/s]
Downloading data: 100%|██████████| 57.7k/57.7k [00:00<00:00, 13.2MB/s]
Downloading data: 100%|██████████| 52.0k/52.0k [00:00<00:00, 14.0MB/s]
Generating train split: 100%|██████████| 482/482 [00:00<00:00, 34522.20 examples/s]
Generating validation split: 100%|██████████| 60/60 [00:00<00:00, 16233.92 examples/s]
Generating test split: 100%|██████████| 61/61 [00:00<00:00, 26070.16 examples/s]


Check the dataset.

In [8]:
# Overview: the DatasetDict repr followed by the names of its splits
print("Dataset structure:", dataset)
print("\nSplits available:", dataset.keys())

# Row count per split
for split in dataset.keys():
    print(f"\nNumber of examples in {split} split:", len(dataset[split]))

# The first split listed stands in for the whole dataset when showing the schema and samples
main_split = next(iter(dataset.keys()))
print("\nFeatures:", dataset[main_split].features)

# Preview the first three rows, cutting string fields off at 100 characters
print(f"\nExample dialogues from the {main_split} split:")
for i in range(3):
    print(f"\nExample {i+1}:")
    example = dataset[main_split][i]
    for key, value in example.items():
        needs_truncation = isinstance(value, str) and len(value) > 100
        print(f"{key}: {value[:100]}..." if needs_truncation else f"{key}: {value}")
    print("-" * 50)

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['description', 'utterances'],
        num_rows: 482
    })
    validation: Dataset({
        features: ['description', 'utterances'],
        num_rows: 60
    })
    test: Dataset({
        features: ['description', 'utterances'],
        num_rows: 61
    })
})

Splits available: dict_keys(['train', 'validation', 'test'])

Number of examples in train split: 482

Number of examples in validation split: 60

Number of examples in test split: 61

Features: {'description': Value(dtype='string', id=None), 'utterances': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

Example dialogues from the train split:

Example 1:
description: throat a bit sore and want to get a good imune booster, especially in light of the virus. please adv...
utterances: ['patient: throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with

Create three JSONL files for the dataset, one per split:
- train
- test
- validation

In [10]:
import json
import os

In [16]:
# Load the dataset with all splits
dataset = load_dataset(path="UCSD26/medical_dialog", name='processed.en',trust_remote_code=True)

# Format function for one split
def format_split(split_data):
    """Turn one dataset split into single-turn prompt/response records.

    For every item, the first utterance starting with ``patient:`` and the first
    starting with ``doctor:`` are paired (the prefix match is case-insensitive).
    Items with fewer than two utterances, or whose patient or doctor text is
    missing or empty, are skipped.

    Args:
        split_data: Iterable of mappings, each holding an ``'utterances'`` list of strings.

    Returns:
        list[dict]: One ``{"prompt": ..., "response": ...}`` record per usable item.
    """

    def first_turn(utterances, prefix):
        # Cut the speaker prefix off by length instead of str.split(prefix): split is
        # case-sensitive (so "Patient: ..." raised IndexError) and also threw away
        # everything after a second occurrence of the prefix inside the text.
        for utterance in utterances:
            if utterance[:len(prefix)].lower() == prefix:
                return utterance[len(prefix):].strip()
        return None

    formatted = []
    for item in split_data:
        utterances = item['utterances']
        if len(utterances) < 2:
            continue
        patient = first_turn(utterances, "patient:")
        doctor = first_turn(utterances, "doctor:")
        # Empty strings are treated like missing turns
        if patient and doctor:
            formatted.append({
                "prompt": f"### Instruction:\n{patient}\n\n### Response:",
                "response": doctor
            })
    return formatted

# Run the formatter over the three splits in one pass (train, validation, test)
train_data, val_data, test_data = (
    format_split(dataset["train"]),
    format_split(dataset["validation"]),
    format_split(dataset["test"]),
)

# Save to JSONL
def save_jsonl(data, filename):
    """Write ``data`` to ``filename`` as JSON Lines (one JSON document per line).

    Any missing parent directory of ``filename`` is created first, so the call
    also works on a fresh checkout where the output folder does not exist yet.
    An existing file is overwritten.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Destination path of the ``.jsonl`` file.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when filename has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the file independent of the platform default codec
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")


# Write every formatted split to its own JSONL file under datasets/stage1_data/
for split_rows, split_path in (
    (train_data, "datasets/stage1_data/stage1_train.jsonl"),
    (val_data, "datasets/stage1_data/stage1_validation.jsonl"),
    (test_data, "datasets/stage1_data/stage1_test.jsonl"),
):
    save_jsonl(split_rows, split_path)

print("✅ Saved: stage1_train.jsonl, stage1_validation.jsonl, stage1_test.jsonl")

✅ Saved: stage1_train.jsonl, stage1_validation.jsonl, stage1_test.jsonl


Unsloth uses the ChatML format (like OpenAI’s messages format)

### Upload to Huggingface

In [28]:
from datasets import load_dataset, DatasetDict

# Read each local ChatML JSONL file. The "json" builder files a lone data file under
# the "train" split, so that is the split requested for all three.
train = load_dataset("json", data_files="datasets/stage1_data_chatml/stage1_train.jsonl", split="train")
val = load_dataset("json", data_files="datasets/stage1_data_chatml/stage1_validation.jsonl", split="train")
test = load_dataset("json", data_files="datasets/stage1_data_chatml/stage1_test.jsonl", split="train")

# Re-attach the real split names
dataset = DatasetDict([
    ("train", train),
    ("validation", val),
    ("test", test),
])

# Persist the combined dataset in Arrow format
dataset.save_to_disk("datasets/stage1_data/medical_dialog_dataset_unsloth")

print("Dataset saved to disk at datasets/stage1_data/medical_dialog_dataset_unsloth")


Generating train split: 482 examples [00:00, 83158.02 examples/s]
Generating train split: 60 examples [00:00, 21245.95 examples/s]
Generating train split: 61 examples [00:00, 32497.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 482/482 [00:00<00:00, 145390.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 60/60 [00:00<00:00, 24847.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 61/61 [00:00<00:00, 26136.74 examples/s]

Dataset saved to disk at datasets/stage1_data/medical_dialog_dataset_unsloth





In [30]:
from datasets import load_dataset, Dataset, DatasetDict
import re
import os

# Load the original dataset
dataset = load_dataset(path="UCSD26/medical_dialog", name="processed.en", trust_remote_code=True)

def convert_to_chatml(example):
    """Render one dialogue as a single ChatML-formatted string.

    The text opens with a fixed system message. Each utterance beginning with
    "patient:" becomes a ``user`` turn and each beginning with "doctor:" becomes
    an ``assistant`` turn (prefix match is case-insensitive); the prefix and
    surrounding whitespace are removed. Utterances with neither prefix are dropped.

    Args:
        example: Mapping with an ``"utterances"`` list of strings.

    Returns:
        dict: ``{"text": <ChatML string>}``.
    """
    # (lower-case speaker prefix, ChatML role, regex that strips the prefix)
    speakers = (
        ("patient:", "user", r'^patient:\s*'),
        ("doctor:", "assistant", r'^doctor:\s*'),
    )

    pieces = [
        "<|im_start|>system\nYou are a helpful medical assistant that provides accurate and ethical medical information.<|im_end|>\n"
    ]

    for utterance in example["utterances"]:
        lowered = utterance.lower()
        speaker = next((entry for entry in speakers if lowered.startswith(entry[0])), None)
        if speaker is None:
            # No recognizable speaker prefix: the utterance is left out of the transcript
            continue

        _, role, prefix_pattern = speaker
        content = re.sub(prefix_pattern, '', utterance, flags=re.IGNORECASE).strip()
        pieces.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

    return {"text": "".join(pieces)}

# Run the ChatML converter over every split (adds a "text" column to each)
chatml_datasets = {split: dataset[split].map(convert_to_chatml) for split in dataset.keys()}

# Wrap the converted splits back into a DatasetDict
chatml_dataset = DatasetDict(chatml_datasets)

# Make sure the plain-text output folder is there
os.makedirs("datasets/medical_dialog_chatml", exist_ok=True)

# One .txt file per split; every conversation is followed by two newlines
for split in chatml_dataset.keys():
    output_file = os.path.join("datasets/medical_dialog_chatml", f"{split}.txt")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(example["text"] + "\n\n" for example in chatml_dataset[split])

    print(f"Saved {split} split to {output_file} with {len(chatml_dataset[split])} examples")

# Keep an Arrow copy of the whole DatasetDict for later reloading with load_from_disk
chatml_dataset.save_to_disk("datasets/medical_dialog_chatml_hf")
print("Saved complete dataset to medical_dialog_chatml_hf")

Saved train split to datasets/medical_dialog_chatml/train.txt with 482 examples
Saved validation split to datasets/medical_dialog_chatml/validation.txt with 60 examples
Saved test split to datasets/medical_dialog_chatml/test.txt with 61 examples


Saving the dataset (1/1 shards): 100%|██████████| 482/482 [00:00<00:00, 163273.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 60/60 [00:00<00:00, 21126.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 61/61 [00:00<00:00, 27728.68 examples/s]

Saved complete dataset to medical_dialog_chatml_hf





### Stage 2 Dataset

In [32]:
from datasets import load_dataset, DatasetDict

# Build a DatasetDict from the labeled stage-2 JSONL files. The "json" builder puts a
# lone data file under its "train" split, hence split="train" for every file.
dataset = DatasetDict({
    "train": load_dataset("json", data_files="datasets/stage2_data/straumann_stage2_train_labeled.jsonl", split="train"),
    "validation": load_dataset("json", data_files="datasets/stage2_data/straumann_stage2_validation_labeled.jsonl", split="train"),
    "test": load_dataset("json", data_files="datasets/stage2_data/straumann_stage2_test_labeled.jsonl", split="train"),
})

# Save into a subfolder next to the source files and report the path actually written
# (the previous message omitted the "stage2_data/" component).
stage2_dataset_dir = "datasets/stage2_data/dental_implant_straumann"
dataset.save_to_disk(stage2_dataset_dir)
print(f"Dataset saved to disk at {stage2_dataset_dir}")

Saving the dataset (1/1 shards): 100%|██████████| 250/250 [00:00<00:00, 84067.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25/25 [00:00<00:00, 10561.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25/25 [00:00<00:00, 10446.06 examples/s]

Dataset saved to disk at datasets/dental_implant_straumann





### Combine the two datasets

In [39]:
import json

# --- Load Stage 1 ---
# The encoding is explicit so reading does not depend on the platform default codec
# (the other conversion cells already open their files as utf-8). Blank lines are
# skipped because json.loads("") raises.
with open("datasets/stage1_data/stage1_train.jsonl", "r", encoding="utf-8") as f1:
    stage1 = [json.loads(line) for line in f1 if line.strip()]

# Convert to unified format: drop the "### Instruction:" / "### Response:" markers
# from the prompt so only the patient text remains.
stage1_converted = [
    {
        "instruction": item["prompt"].replace("### Instruction:\n", "").replace("### Response:", "").strip(),
        "response": item["response"].strip()
    }
    for item in stage1
]

# --- Load Stage 2 ---
with open("datasets/stage2_data/straumann_stage2_train.jsonl", "r", encoding="utf-8") as f2:
    stage2 = [json.loads(line) for line in f2 if line.strip()]

# Convert to unified format. Each record keeps instruction and response in a single
# "text" field; records without that field, or whose text does not contain the
# "### Response:\n" marker exactly once, are left out.
stage2_converted = []
for item in stage2:
    if "text" in item:
        parts = item["text"].split("### Response:\n")
        if len(parts) == 2:
            instruction = parts[0].replace("### Instruction:\n", "").strip()
            response = parts[1].strip()
            stage2_converted.append({
                "instruction": instruction,
                "response": response
            })

# --- Merge ---
# Stage-1 records first, then stage-2
merged = stage1_converted + stage2_converted

# --- Save ---
with open("datasets/stage3_data/merged_train.jsonl", "w", encoding="utf-8") as f_out:
    for item in merged:
        json.dump(item, f_out)
        f_out.write("\n")

print(f"✅ Merged {len(merged)} entries into 'datasets/stage3_data/merged_train.jsonl'")

✅ Merged 732 entries into 'datasets/stage3_data/merged_train.jsonl'


In [41]:
from datasets import load_dataset, DatasetDict

# Build a DatasetDict from the merged stage-3 JSONL files. The "json" builder puts a
# lone data file under its "train" split, hence split="train" for every file.
# NOTE(review): only merged_train.jsonl is written by a cell shown in this notebook;
# the validation and test files are presumably produced the same way — confirm.
dataset = DatasetDict({
    "train": load_dataset("json", data_files="datasets/stage3_data/merged_train.jsonl", split="train"),
    "validation": load_dataset("json", data_files="datasets/stage3_data/merged_validation.jsonl", split="train"),
    "test": load_dataset("json", data_files="datasets/stage3_data/merged_test.jsonl", split="train"),
})

# Save into a subfolder and report the path actually written
# (the previous message misspelled it as "atasets/...").
stage3_dataset_dir = "datasets/stage3_data/doctor_chat_dental_implants_qa"
dataset.save_to_disk(stage3_dataset_dir)
print(f"Dataset saved to disk at {stage3_dataset_dir}")

Generating train split: 86 examples [00:00, 28958.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 732/732 [00:00<00:00, 236371.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85/85 [00:00<00:00, 27913.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 86/86 [00:00<00:00, 29371.40 examples/s]

Dataset saved to disk at atasets/stage3_data/doctor_chat_dental_implants_qa





### Reformat the data to match the Unsloth Alpaca prompt style

In [None]:
from transformers import AutoTokenizer
from datasets import load_from_disk

# Load the tokenizer only to read its end-of-sequence token, which is appended to every
# formatted example below. For this checkpoint the token is '<|im_end|>' (see the
# recorded output of the eos_token check further down).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
EOS_TOKEN = tokenizer.eos_token

# Load the merged stage-3 dataset saved earlier with save_to_disk
dataset = load_from_disk("datasets/stage3_data/doctor_chat_dental_implants_qa")

# Alpaca prompt template: the first {} receives the instruction, the second the response
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# Format function
def formatting_prompts_func(examples):
    """Format a batch of instruction/response pairs as Alpaca prompts.

    Args:
        examples: Batch mapping with parallel ``"instruction"`` and ``"response"`` lists.

    Returns:
        dict: ``{"text": [...]}`` with one filled-in template per pair, each
        ending with ``EOS_TOKEN``.
    """
    texts = []
    for instr, resp in zip(examples["instruction"], examples["response"]):
        # The EOS token goes directly after the response text
        texts.append(alpaca_prompt.format(instr, resp) + EOS_TOKEN)
    return {"text": texts}

# Add the "text" column to every split, processing rows in batches
formatted_dataset = dataset.map(function=formatting_prompts_func, batched=True)

# Persist the formatted splits in Arrow format
formatted_dataset.save_to_disk("datasets/stage3_data/doctor_dental_alpaca_format")
print("✅ Saved to disk: datasets/stage3_data/doctor_dental_alpaca_format")

Map: 100%|██████████| 732/732 [00:00<00:00, 63408.31 examples/s]
Map: 100%|██████████| 85/85 [00:00<00:00, 28439.36 examples/s]
Map: 100%|██████████| 86/86 [00:00<00:00, 24501.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 732/732 [00:00<00:00, 315218.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85/85 [00:00<00:00, 44497.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 86/86 [00:00<00:00, 41087.84 examples/s]

✅ Saved to disk: datasets/stage3_data/doctor_dental_alpaca_format





In [44]:
formatted_dataset["train"][0]["text"]

"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nthroat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. have not been in contact with nyone with the virus.\n\n### Response:\nduring this pandemic. throat pain can be from a strep throat infection (antibiotics needed), a cold or influenza or other virus, or from some other cause such as allergies or irritants. usually, a person sees the doctor (call first) if the sore throat is bothersome, recurrent, or doesn't go away quickly. covid-19 infections tend to have cough, whereas strep throat usually lacks cough but has more throat pain. (3/21/20)<|im_end|>"

In [45]:
print(repr(tokenizer.eos_token))  # prints '<|im_end|>' for the tokenizer loaded above (not '</s>')

'<|im_end|>'


### Convert Alpaca-style JSONL to Llama 3.2 chat-style JSONL

Stage 1

In [49]:
import json

# Input/output paths
input_file = "datasets/stage1_data/stage1_validation.jsonl"
output_file = "datasets/stage1_data/stage1_converted_validation.jsonl"

def convert_line(line):
    """Convert one stage-1 JSONL line into a two-turn conversation record.

    Args:
        line: JSON text of an object with ``"prompt"`` and ``"response"`` string
            fields; a missing field is treated as an empty string.

    Returns:
        dict: ``{"conversation": [patient_turn, doctor_turn]}`` where each turn is
        a ``{"from": ..., "value": ...}`` mapping.

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
    """
    data = json.loads(line)

    # Stage-1 prompts are stored as "### Instruction:\n<text>\n\n### Response:".
    # Both markers have to go; previously the trailing "### Response:" stayed
    # inside the patient turn.
    prompt = (
        data.get("prompt", "")
        .replace("### Instruction:\n", "")
        .replace("### Response:", "")
        .strip()
    )
    # Kept from the original: remove a stray marker from the response as well
    response = data.get("response", "").replace("### Response:", "").strip()

    return {
        "conversation": [
            { "from": "patient", "value": prompt },
            { "from": "doctor", "value": response }
        ]
    }

# Stream the input file through convert_line, writing one JSON document per output line
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    fout.writelines(
        json.dumps(convert_line(line), ensure_ascii=False) + '\n'
        for line in fin
    )

print(f"Conversion complete. Saved to {output_file}")

Conversion complete. Saved to datasets/stage1_data/stage1_converted_validation.jsonl


Stage 2

In [52]:
import json
import re

# Input and output paths
input_file = 'datasets/stage2_data/straumann_stage2_train.jsonl'
output_file = 'datasets/stage2_data/straumann_converted_train.jsonl'

def convert_entry(entry):
    """Split an Alpaca-style ``"text"`` field into a two-turn conversation.

    Args:
        entry: Mapping whose ``"text"`` value holds "### Instruction:" and
            "### Response:" sections; a missing key is treated as empty text.

    Returns:
        dict | None: ``{"conversation": [human_turn, expert_turn]}``, or ``None``
        when either section marker cannot be found.
    """
    raw = entry.get("text", "")

    # Instruction: everything between the two markers. Response: everything after
    # the first "### Response:" marker to the end of the text.
    found_instruction = re.search(r"### Instruction:\s*(.*?)\s*### Response:", raw, re.DOTALL)
    found_response = re.search(r"### Response:\s*(.*)", raw, re.DOTALL)

    if found_instruction and found_response:
        turns = [
            { "from": "human", "value": found_instruction.group(1).strip() },
            { "from": "expert", "value": found_response.group(1).strip() }
        ]
        return {"conversation": turns}

    # Malformed entry: the caller skips it
    return None

# Parse each input line, convert it, and write only the entries that converted successfully
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    conversions = (convert_entry(json.loads(line)) for line in fin)
    fout.writelines(
        json.dumps(converted, ensure_ascii=False) + '\n'
        for converted in conversions
        if converted
    )

print(f"Conversion complete. Output saved to {output_file}")

Conversion complete. Output saved to datasets/stage2_data/straumann_converted_train.jsonl


In [55]:
# Input file paths: the converted stage-1 and stage-2 training sets
file1 = 'datasets/stage1_data/stage1_converted_train.jsonl'
file2 = 'datasets/stage2_data/straumann_converted_train.jsonl'

# Output file path
output_file = 'datasets/stage3_data/llama_merged_train.jsonl'

# Concatenate the inputs in order, copying line by line
with open(output_file, 'w', encoding='utf-8') as fout:
    for fname in [file1, file2]:
        with open(fname, 'r', encoding='utf-8') as fin:
            for line in fin:
                # A file whose last line has no trailing newline would otherwise fuse
                # its final record with the first record of the next file.
                fout.write(line if line.endswith('\n') else line + '\n')

print(f"Files combined and saved to {output_file}")

Files combined and saved to datasets/stage3_data/llama_merged_train.jsonl


Create the dataset for Llama fine-tuning.

In [59]:
from datasets import load_dataset, DatasetDict

# Build a DatasetDict from the merged chat-style JSONL files. The "json" builder puts a
# lone data file under its "train" split, hence split="train" for every file.
# NOTE(review): only llama_merged_train.jsonl is written by a cell shown in this notebook;
# the validation and test files are presumably produced the same way — confirm.
dataset = DatasetDict({
    "train": load_dataset("json", data_files="datasets/stage3_data/llama_merged_train.jsonl", split="train"),
    "validation": load_dataset("json", data_files="datasets/stage3_data/llama_merged_validation.jsonl", split="train"),
    "test": load_dataset("json", data_files="datasets/stage3_data/llama_merged_test.jsonl", split="train"),
})

# Save into a subfolder and include the destination in the message
# (the previous message stopped after "at" without naming a path).
llama_dataset_dir = "datasets/stage3_data/doctor_dental_llama_qa_new"
dataset.save_to_disk(llama_dataset_dir)
print(f"Dataset saved to disk at {llama_dataset_dir}")

Generating train split: 732 examples [00:00, 124684.48 examples/s]
Generating train split: 85 examples [00:00, 41363.94 examples/s]
Generating train split: 86 examples [00:00, 29646.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 732/732 [00:00<00:00, 183417.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85/85 [00:00<00:00, 27951.07 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 86/86 [00:00<00:00, 28362.18 examples/s]

Dataset saved to disk at





### Build a custom dataset for training

In [60]:
from datasets import load_dataset

ds = load_dataset("Malikeh1375/medical-question-answering-datasets", "all-processed")

Generating train split: 100%|██████████| 246678/246678 [00:00<00:00, 374453.56 examples/s]


In [61]:
# save to disk
ds.save_to_disk("datasets/medical-question-answering-datasets")

Saving the dataset (1/1 shards): 100%|██████████| 246678/246678 [00:00<00:00, 247901.88 examples/s]


In [63]:
print(ds["train"][0])

{'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.", 'input': 'Hey Just wondering.  I am a 39 year old female, pretty smallMy heart rate is around 97 to 106 at rest, and my BP is 140/90 and twice I get 175/118I did visit a doctor because I  didnt feel well past month or twoThen the doctor gave me a heart medicine to take the pulse down and BP  (its still in further examination.)But I wondering what it can be? Do I need the medicine really?  Is that bad ?', 'output': "hello and thank you for using chatbot. i carefully read your question and i understand your concern. i will try to explain you something and give you my opinion. we talk about hypertension if we have mean value that exceeds 140 / 90 mmhg. a person might have high value during emotional and physicals trees so it's mandatory to judge on mean values. usaly hypertension does not give any symptoms but left untreated he slowly modifies the heart. according to heart rhyth

In [65]:
print(ds["train"].column_names)

['instruction', 'input', 'output', '__index_level_0__']


In [71]:
import json
from datasets import load_dataset

# Work on the dataset downloaded above
dataset = ds  # or local path if using load_from_disk

# Destination JSONL file
output_path = "datasets/medical_qa/medical_qa.jsonl"

# End-of-sequence token appended to every assistant turn.
# NOTE(review): `tokenizer` is not created in this cell; it must already exist in the kernel.
eot = tokenizer.eos_token

# Emit one three-turn conversation (system / human / assistant) per training row
with open(output_path, "w", encoding="utf-8") as f:
    for example in dataset["train"]:
        instruction, input_text, output_text = (
            example[field].strip() for field in ("instruction", "input", "output")
        )

        chat = {
            "conversations": [
                { "from": speaker, "value": content }
                for speaker, content in (
                    ("system", instruction),
                    ("human", input_text),
                    ("assistant", output_text + eot),
                )
            ]
        }

        f.write(json.dumps(chat, ensure_ascii=False) + "\n")

print(f"✅ Saved to: {output_path}")

✅ Saved to: datasets/medical_qa/medical_qa.jsonl


In [72]:
from datasets import load_dataset, DatasetDict

# Re-read the exported conversations; the dataset has a single "train" split
dataset = DatasetDict([
    ("train", load_dataset("json", data_files="datasets/medical_qa/medical_qa.jsonl", split="train")),
])

# Persist in Arrow format in a subfolder next to the JSONL file
dataset.save_to_disk("datasets/medical_qa/medical_qa_dataset_new")
print("Dataset saved to disk")

Generating train split: 246678 examples [00:00, 735946.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 246678/246678 [00:00<00:00, 1528070.11 examples/s]

Dataset saved to disk



