<a href="https://colab.research.google.com/github/MAI3003-Data-Witches/MedicalChatbot/blob/main/medical_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This base chatbot can have a medical conversation with a patient.

# Installation requirements

Run this Jupyther notebook in your favorite IDE, or here in this Google Collab environment.

If you run this locally, install the required libraries:
- pip install openai
- pip install ipywidgets


In [37]:
# Import necessary libraries
from openai import OpenAI
import os
import ipywidgets as widgets
from IPython.display import display, clear_output
from IPython.display import HTML


# Configure OpenAI API
api_key = ''

client = OpenAI(api_key=api_key)


def initialize_chat():
    """
    Initialize the chat with a system message that sets the context for the medical chatbot.
    """
    # Concatenate all extracted texts into a single string to be used as context
    contextual_knowledge = "\n\n".join(all_extracted_texts)

    return [{
        "role": "system",
        "content": f"""You are a helpful medical information assistant. You can recommend some drugs. Do not give very long answers.
        Your knowledge base also includes the following information about medications:
        {contextual_knowledge}"""
    }]


def chat_with_medical_bot():
    """
    Main function to run the medical chatbot interaction.
    """
    messages = initialize_chat()

    print("Medical Information Assistant: Hello! I can help you with general medical information.")
    print("Type 'quit' to end the conversation.\n")

    while True:
        user_input = input("You: ")

        if user_input.lower() == 'quit':
            break

        # Add user message to conversation history
        messages.append({"role": "user", "content": user_input})

        # Get bot's response
        bot_response = get_chatbot_response(messages)

        # Add bot's response to conversation history
        messages.append({"role": "assistant", "content": bot_response})

        print(f"\nMedical Assistant: {bot_response}\n")



# Create widgets for the chat interface
output = widgets.Output()
text_input = widgets.Text(
    value='',
    placeholder='Type your medical question here...',
    description='',
    layout=widgets.Layout(width='80%')
)
send_button = widgets.Button(
    description='Send',
    button_style='primary',
    layout=widgets.Layout(width='19%')
)

# Create a container for chat history display
chat_history = widgets.HTML(
    value='<div style="height: 400px; overflow-y: auto; background-color: #f9f9f9; padding: 15px; border-radius: 5px;"></div>'
)

def update_chat_history(message, sender):
    """Update the chat history display with new messages"""
    current = chat_history.value[:-6]  # Remove closing div tag
    style = "background-color: #DCF8C6; margin: 5px; padding: 10px; border-radius: 10px;" if sender == "You" else "background-color: #E8E8E8; margin: 5px; padding: 10px; border-radius: 10px;"
    new_message = f'<div style="{style}"><strong>{sender}:</strong> {message}</div>'
    chat_history.value = current + new_message + "</div>"

def on_send_button_clicked(b):
    """Handle send button clicks"""
    user_message = text_input.value
    if user_message.strip() == '':
        return

    # Clear input
    text_input.value = ''

    # Update chat history with user message
    update_chat_history(user_message, "You")

    # Add user message to OpenAI messages
    messages.append({"role": "user", "content": user_message})

    # Get and display bot response
    bot_response = get_chatbot_response(messages)
    messages.append({"role": "assistant", "content": bot_response})
    update_chat_history(bot_response, "Medical Assistant")

# Connect button click to handler
send_button.on_click(on_send_button_clicked)

# Handle Enter key in text input
def on_enter(widget):
    if text_input.value.strip() != '':
        on_send_button_clicked(None)
text_input.on_submit(on_enter)

# Create main chat container
chat_container = widgets.VBox([
    widgets.HTML(value='<h2>Medical Information Assistant</h2>'),
    widgets.HTML(value='<p style="color: #666;">Ask me any medical questions, but remember I\'m not a replacement for professional medical advice.</p>'),
    chat_history,
    widgets.HBox([text_input, send_button])
])

In [35]:
def get_chatbot_response(messages, max_completion_tokens=1500):
   """
    Get a response from the GPT model based on the conversation history.

    Args:
        messages (list): List of conversation messages
        max_completion_tokens (int): Maximum length of the response

    Returns:
        str: The chatbot's response
   """
   try:
       response = client.chat.completions.create(
           model="gpt-5-mini",
           messages=messages,
           max_completion_tokens=max_completion_tokens
       )
       return response.choices[0].message.content
   except Exception as e:
       return f"An error occurred: {str(e)}"

In [38]:
# Initialize chat history
messages = initialize_chat()

# Display the chat interface
display(chat_container)

VBox(children=(HTML(value='<h2>Medical Information Assistant</h2>'), HTML(value='<p style="color: #666;">Ask m…

# Finetuning
## Retrieve PDF links



In [13]:
import requests
import re

# The URL for the data_list.md file is available in `source_information`
url = source_information

try:
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
    content = response.text

    # Use regular expression to find all URLs ending with '.pdf'
    # This regex looks for URLs starting with http/https and ending with .pdf, capturing the full URL
    pdf_links = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.pdf', content)

    print(f"Successfully downloaded content from {url}")
    print(f"Found {len(pdf_links)} PDF links.")
    print("PDF links:")
    for link in pdf_links:
        print(link)

except requests.exceptions.RequestException as e:
    print(f"Error downloading the file: {e}")
    pdf_links = [] # Initialize as empty list on error



Successfully downloaded content from https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/refs/heads/main/data_list.md
Found 5 PDF links.
PDF links:
https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/Paracetamol.pdf
https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/cetirizine.pdf
https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/diclofenac.pdf
https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/ibuprofen.pdf
https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/xylometazoline.pdf


## Download and extract text from PDFs

In [14]:
try:
    import PyPDF2
    print("PyPDF2 is already installed.")
except ImportError:
    print("PyPDF2 not found. Installing...")
    !pip install PyPDF2
    import PyPDF2
    print("PyPDF2 installed successfully.")

PyPDF2 not found. Installing...
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
PyPDF2 installed successfully.


In [15]:
import io
from PyPDF2 import PdfReader

all_extracted_texts = []

print(f"Starting PDF text extraction for {len(pdf_links)} PDFs...")

for i, link in enumerate(pdf_links):
    try:
        response = requests.get(link)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Use io.BytesIO to treat the content as a file
        with io.BytesIO(response.content) as open_pdf_file:
            reader = PdfReader(open_pdf_file)
            current_pdf_text = ""
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                current_pdf_text += page.extract_text() or "" # Add an empty string if text extraction returns None

            all_extracted_texts.append(current_pdf_text)
            print(f"  Successfully extracted text from PDF {i+1}/{len(pdf_links)}: {link.split('/')[-1]}")

    except requests.exceptions.RequestException as e:
        print(f"  Error downloading PDF {i+1}/{len(pdf_links)} from {link}: {e}")
    except Exception as e:
        print(f"  Error processing PDF {i+1}/{len(pdf_links)} from {link}: {e}")

print(f"\nFinished PDF text extraction.")
print(f"Total PDFs processed: {len(all_extracted_texts)}")
# Calculate total length of all extracted texts to verify content
total_chars = sum(len(text) for text in all_extracted_texts)
print(f"Total length of all extracted texts: {total_chars} characters")

Starting PDF text extraction for 5 PDFs...
  Successfully extracted text from PDF 1/5: Paracetamol.pdf
  Successfully extracted text from PDF 2/5: cetirizine.pdf
  Successfully extracted text from PDF 3/5: diclofenac.pdf
  Error downloading PDF 4/5 from https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/ibuprofen.pdf: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/MAI3003-Data-Witches/MedicalChatbot/b07133842f18d7fab2bfb6da46cd6ecebd633f1f/data/ibuprofen.pdf
  Successfully extracted text from PDF 5/5: xylometazoline.pdf

Finished PDF text extraction.
Total PDFs processed: 4
Total length of all extracted texts: 92913 characters


## Prepare fine-tuning data

In [16]:
import re

# 1. Identify the drug names for the successfully extracted PDF texts
drug_names = ['Paracetamol', 'cetirizine', 'diclofenac', 'xylometazoline']

# 2. Initialize an empty list called fine_tuning_data
fine_tuning_data = []

# 3. Iterate through all_extracted_texts
for idx, extracted_text in enumerate(all_extracted_texts):
    if idx >= len(drug_names): # Skip if there's no corresponding drug name due to previous errors
        continue

    drug_name = drug_names[idx]

    # a. Clean Text
    cleaned_text = extracted_text.replace('\n\n\n', '\n\n') # Replace triple newlines with double
    cleaned_text = cleaned_text.strip() # Remove leading/trailing whitespace
    # Remove common irrelevant phrases (case-insensitive and with potential surrounding spaces)
    cleaned_text = re.sub(r'Klik hier voor een uitgebreide samenvatting als PDF', '', cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\s*ing van Geneesmiddelen op: www\.cbg-meb\.nl\.?\s*', '', cleaned_text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\s*[​-‍﻿]', '', cleaned_text) # Remove zero-width spaces
    cleaned_text = re.sub(r'\s{2,}', ' ', cleaned_text) # Replace multiple spaces with a single space
    cleaned_text = cleaned_text.strip()

    # b. Chunk Text
    # Split by two or more newline characters to get paragraphs
    chunks = re.split(r'\n{2,}', cleaned_text)

    # Further process chunks: ensure minimum length and remove empty ones
    processed_chunks = []
    for chunk in chunks:
        chunk = chunk.strip()
        if len(chunk) > 50: # Minimum character length for chunks
            processed_chunks.append(chunk)

    # c. Generate messages Format
    for chunk in processed_chunks:
        user_prompt = f"Tell me about {drug_name} and its uses."
        fine_tuning_example = {
            "messages": [
                {"role": "system", "content": "You are a helpful medical information assistant that provides factual information about medications."},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": chunk}
            ]
        }
        fine_tuning_data.append(fine_tuning_example)

# 4. Print the total number of fine-tuning examples generated and display the first five examples
print(f"Total fine-tuning examples generated: {len(fine_tuning_data)}")
print("\nFirst 5 fine-tuning examples:")
for i, example in enumerate(fine_tuning_data[:5]):
    print(f"\nExample {i+1}:")
    print(example)

Total fine-tuning examples generated: 4

First 5 fine-tuning examples:

Example 1:
{'messages': [{'role': 'system', 'content': 'You are a helpful medical information assistant that provides factual information about medications.'}, {'role': 'user', 'content': 'Tell me about Paracetamol and its uses.'}, {'role': 'assistant', 'content': "Paracetamol werkt pijnstillend en koortsverlagend.\nHet is te gebruiken bij verschillende soorten pijn zoals, hoofdpijn, migraine, koorts, griep,\nverkoudheid, keelpijn, bijholteontsteking, middenoorontsteking, oorpijn door\ngehoorgangontsteking, artrose, spierpijn, gewrichtspijn en menstruatieklachten.\nPijn bestaat in verschillende vormen. Pijn kan komen door een beschadiging in uw lichaam of als\niets niet werkt zoals het hoort. De zenuwen in het beschadigde lichaamsdeel geven een seintje\nnaar uw hersenen en u voelt pijn. Soms heeft pijn geen duidelijke oorzaak en weten we niet\nwaar de pijn vandaan komt.Belangrijk om te weten over paracetamol\nParac

## Upload fine-tuning data to OpenAI

In [17]:
import json

# 1. Define a filename for the fine-tuning data
file_name = "fine_tuning_data.jsonl"

# 2. Write the fine-tuning data to a JSONL file
print(f"Saving fine-tuning data to {file_name}...")
with open(file_name, "w") as f:
    for entry in fine_tuning_data:
        f.write(json.dumps(entry) + "\n")
print("Fine-tuning data saved successfully.")

# 3. Upload the file to OpenAI
print("Uploading fine-tuning data to OpenAI...")
try:
    with open(file_name, "rb") as f:
        response = client.files.create(
            file=f,
            purpose="fine-tune"
        )
    file_id = response.id
    print(f"File uploaded successfully. File ID: {file_id}")
except Exception as e:
    print(f"Error uploading file to OpenAI: {e}")
    file_id = None

Saving fine-tuning data to fine_tuning_data.jsonl...
Fine-tuning data saved successfully.
Uploading fine-tuning data to OpenAI...
File uploaded successfully. File ID: file-UtXYVsaJ1XCtsRsMU6dENo


## Initiate OpenAI fine-tuning job