In [1]:
from datasets import load_dataset

In [2]:
pip install datasets huggingface_hub transformers gradio



In [3]:
ds = load_dataset("ruslanmv/ai-medical-chatbot")
print(ds)                    # shows splits & sizes
print(ds["train"][0])        # preview first example
# If no 'train' split, print keys: print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 256916
    })
})
{'Description': 'Q. What does abutment of the nerve root mean?', 'Patient': 'Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?', 'Doctor': 'Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->'}


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch

# Title and description
title = "🤖 AI ChatBot"
description = "Building open-domain chatbots is a challenging area for machine learning research."
examples = [["How are you?"]]

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

def predict(input_text, history=None):
    if history is None:
        history = []

    # Tokenize new user input
    new_user_input_ids = tokenizer.encode(input_text + tokenizer.eos_token, return_tensors="pt")

    # Prepare chat history
    if history:
        past_ids = torch.LongTensor(history)
        bot_input_ids = torch.cat([past_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Generate response
    output_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    history = output_ids.tolist()

    # Decode and extract bot reply
    decoded_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    user_reply = input_text
    bot_reply = decoded_text.split(input_text)[-1].strip()

    # Format chatbot UI output
    chatbot_messages = []
    if len(history) > 0:
        chatbot_messages = [(user_reply, bot_reply)]

    return chatbot_messages, history

# Gradio interface
gr.Interface(
    fn=predict,
    title=title,
    description=description,
    examples=examples,
    inputs=["text", "state"],
    outputs=["chatbot", "state"],
    theme="finlaymacklon/boxy_violet"
).launch()

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  obj = utils.component_or_layout_class(cls_name)(render=render)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://003a8bcc9ee5e9e838.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Prepare the dataset for training

In [5]:
from datasets import load_dataset

ds = load_dataset("ruslanmv/ai-medical-chatbot")["train"]

formatted_data = []
for item in ds:
    patient = item["Patient"]
    doctor = item["Doctor"]
    conversation = f"Patient: {patient}\nDoctor: {doctor}"
    formatted_data.append(conversation)

len(formatted_data), formatted_data[0]



(256916,
 'Patient: Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?\nDoctor: Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->')

Save processed dataset to a text file

In [6]:
with open("medical_chatbot_data.txt", "w", encoding="utf-8") as f:
    for line in formatted_data:
        f.write(line + "\n")


Tokenizer & dataset preparation for training

In [7]:
def format_and_tokenize(batch):
    conversations = [
        f"Patient: {p}\nDoctor: {d}{tokenizer.eos_token}"
        for p, d in zip(batch["Patient"], batch["Doctor"])
    ]
    return tokenizer(conversations, truncation=True)

tokenized_ds = ds.map(format_and_tokenize, batched=True, batch_size=512)


Map:   0%|          | 0/256916 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoTokenizer

model_name = "gpt2"   # or gemma, or any causal LM you want to fine-tune
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT2 has no pad token → set pad_token = eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

tokenized_ds = formatted_ds.map(tokenize_function, batched=True)


Map:   0%|          | 0/256916 [00:00<?, ? examples/s]

In [13]:
tokenized_ds = tokenized_ds.remove_columns(
    ["Patient", "Doctor", "Description", "text"]
)

tokenized_ds.set_format("torch")


Load DialoGPT model + Set Training Arguments

In [14]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


Add pad token (required)

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="finetuned_medical_model",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

RE-TOKENIZE the dataset WITH padding + truncation

In [None]:
def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,   # safe length for DialoGPT
    )

tokenized_dataset = formatted_ds.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text", "Patient", "Doctor", "Description"])
tokenized_dataset.set_format("torch")


Map:   0%|          | 0/256916 [00:00<?, ? examples/s]

Use correct collator

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


Create Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)


Start training

In [None]:
trainer.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
