In [1]:
!pip install -q transformers accelerate bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Initializing Gradio interface for Prompt-Tuned Model

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub repository that provides both the tokenizer and the weights for this section.
PROMPT_TUNED_REPO = "gabbar427/mediguide"

tokenizer = AutoTokenizer.from_pretrained(PROMPT_TUNED_REPO)
model = AutoModelForCausalLM.from_pretrained(PROMPT_TUNED_REPO)

# Bundle model and tokenizer into a text-generation pipeline; `chat_with_bot` calls it.
chatbot = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device_map="auto")

def chat_with_bot(message, max_new_tokens=2000):
    """Ask the `chatbot` pipeline to answer a single user message.

    The message is wrapped in the ``<|user|>`` / ``<|assistant|>`` chat format,
    sampled with fixed decoding settings, and only the assistant's reply is
    returned.

    Args:
        message: The user's question as plain text.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The assistant's reply with the prompt and surrounding whitespace removed.
        If the model starts another assistant turn, the reply is cut before it.
    """
    assistant_tag = "<|assistant|>"
    prompt = f"<|user|>\n{message}\n{assistant_tag}\n"

    # Generate response
    outputs = chatbot(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id
    )
    generated = outputs[0]['generated_text']

    if generated.startswith(prompt):
        # Usual case: the pipeline echoes the prompt, so drop exactly that prefix.
        # Splitting on the tag instead would return part of the user's own text
        # whenever the message itself contains "<|assistant|>".
        reply = generated[len(prompt):]
    else:
        # The decoded text does not reproduce the prompt verbatim. Skip as many
        # tags as the prompt contained; if none survived decoding, keep the
        # whole text rather than raising IndexError.
        reply = generated.split(assistant_tag, prompt.count(assistant_tag))[-1]

    # Keep only the first assistant turn.
    return reply.split(assistant_tag)[0].strip()

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [4]:
import gradio as gr

# Widgets for the first chatbot UI: a three-line question box and a reply box.
prompt_question_box = gr.Textbox(lines=3, label="Your Medical Question")
prompt_answer_box = gr.Textbox(label="MediGuide Bot")

iface = gr.Interface(
    fn=chat_with_bot,
    inputs=prompt_question_box,
    outputs=prompt_answer_box,
    title="🩺 MediGuide Medical Chatbot",
    description="Ask health-related questions. Powered by gabbar427/mediguide.",
)

# Start serving; share=True additionally exposes the app on a public URL.
iface.launch(share=True)

# Add the following URL as the endpoint for mistralai_prompt

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d9b6ec1eb15e39f318.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Initializing Gradio interface for Prefix-Tuned Model

In [8]:
# Hub repository ID used below for both the PEFT adapter and its tokenizer.
path = "ankraj/mediguide"

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Base checkpoint that the adapter stored at `path` is applied on top of.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model, letting accelerate place (and, if needed, offload) layers.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    offload_folder="./offload",
    offload_state_dict=True,
)

# Wrap the base model with the adapter weights from `path`.
model = PeftModel.from_pretrained(base_model, path)

# The tokenizer ships with the adapter repository; EOS is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(path)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/391 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/5.24M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [10]:
# Put the adapter-wrapped model into evaluation mode before serving requests.
model.eval()
# NOTE(review): `model` is rebound to the compiled wrapper, so re-running this cell
# wraps an already-compiled module again. It also looks like `model.generate(...)`
# is resolved on the wrapped module and may therefore run its uncompiled forward —
# TODO confirm that compilation actually affects generation here.
model = torch.compile(model)

In [11]:
from transformers import StoppingCriteria, StoppingCriteriaList
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires once the sequence ends with a stop phrase.

    Each stop phrase is tokenized once at construction time. On every
    generation step the tail of the first sequence in the batch is compared,
    token by token, with each tokenized phrase.

    Args:
        stop_phrases: Iterable of strings that should terminate generation.
        tokenizer: Tokenizer used to turn the phrases into token ids.
    """

    def __init__(self, stop_phrases, tokenizer):
        self.tokenizer = tokenizer
        self.stop_ids_list = []
        for phrase in stop_phrases:
            if not phrase:
                # An empty phrase has no tokens to match; skip it.
                continue
            # Ask the tokenizer not to add special tokens instead of slicing off
            # position 0, which silently drops a real token when the tokenizer
            # does not prepend BOS.
            ids = tokenizer(
                phrase, add_special_tokens=False, return_tensors="pt"
            ).input_ids[0]
            if len(ids) > 0:
                self.stop_ids_list.append(ids)
        # Device -> stop-id tensors already moved there, so the copies are not
        # repeated on every generation step.
        self._stop_ids_by_device = {}

    def _stop_ids_on(self, device):
        """Return the stop-id tensors on `device`, moving them on first use."""
        cached = self._stop_ids_by_device.get(device)
        if cached is None:
            cached = [ids.to(device) for ids in self.stop_ids_list]
            self._stop_ids_by_device[device] = cached
        return cached

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when the first sequence ends with any stop phrase."""
        sequence = input_ids[0]  # only the first sequence of the batch is inspected
        for stop_ids in self._stop_ids_on(input_ids.device):
            width = len(stop_ids)
            if len(sequence) >= width and torch.equal(sequence[-width:], stop_ids):
                return True
        return False

In [15]:
import re
def preprocess_input(input_text):
    """Build the "[MED]" instruction prompt for one patient message.

    Args:
        input_text: The patient's description or question.

    Returns:
        A three-line prompt: the instruction header, the patient line, and an
        open "Doctor:" line for the model to complete.
    """
    instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
    prompt_lines = (
        f"[MED] {instruction}",
        f"Patient: {input_text} ",
        "Doctor:",
    )
    return "\n".join(prompt_lines)

def clean_output(text):
    """Extract the doctor's reply from generated text and trim it at a sign-off.

    Everything up to and including the first "Doctor:" marker (case-insensitive)
    is discarded. The remainder is cut right after the earliest stop phrase, if
    one occurs. Text without a "Doctor:" marker is returned stripped and is not
    trimmed at stop phrases.

    Args:
        text: Decoded model output, normally containing a "Doctor:" marker.

    Returns:
        The cleaned reply, or an empty string for empty input.
    """
    if not text:
        return ""

    stop_patterns = [
        r"Take care Chat Doctor\.",
        r"Regards, Chat Doctor\.",
        r"Regards\. Chat Doctor\.",  # the dot after "Regards" is literal
        r"Wishing you good health\.",
        r"Wishing you a good health\.",
        r"Thanks for using Chat Doctor\.",
        r"Goodbye\.",
        r"Take care\.",
        r"\.com"
    ]

    doc_match = re.search(r"Doctor:\s*", text, re.IGNORECASE)
    if doc_match is None:
        return text.strip()

    after_doctor = text[doc_match.end():]

    # Earliest position at which any stop phrase occurs.
    stop_match = re.search("|".join(stop_patterns), after_doctor, re.IGNORECASE)
    if stop_match is None:
        return after_doctor.strip()

    before = after_doctor[:stop_match.start()]
    phrase = stop_match.group(0)
    body = before.strip()
    if not body:
        # The reply starts with the stop phrase: no leading space.
        return phrase
    # Re-insert a single space only if the phrase was separated by whitespace in
    # the generated text; ".com" stays attached to the preceding host name.
    separator = " " if before[-1].isspace() else ""
    return body + separator + phrase

In [20]:
def run_medical_bot(message, max_new_tokens=500):
    """Generate the adapter model's reply to one patient message.

    The message is wrapped by `preprocess_input`, sampled with `model.generate`
    until a sign-off phrase or the token budget is reached, and post-processed
    by `clean_output`.

    Args:
        message: The patient's description or question.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The cleaned reply text.
    """
    prompt = preprocess_input(message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # List of phrases that should stop generation
    stop_phrases = [
        "Take care Chat Doctor.",
        "Regards, Chat Doctor.",
        "Regards. Chat Doctor.",
        "Wishing you good health.",
        "Wishing you a good health.",
        "Thanks for using Chat Doctor.",
        "Goodbye.",
        "Take care."
    ]

    stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_phrases, tokenizer)])

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            stopping_criteria=stopping_criteria
        )

    # Decode only the newly generated tokens. Decoding the prompt as well lets a
    # "doctor:" inside the patient's own message be taken for the start of the answer.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # `clean_output` trims sign-offs only in text that follows a "Doctor:" marker,
    # so re-attach the marker that ended the prompt.
    return clean_output("Doctor: " + completion)

In [21]:
import gradio as gr

# Widgets for the second chatbot UI: a three-line question box and a reply box.
prefix_question_box = gr.Textbox(lines=3, label="Your Medical Question")
prefix_answer_box = gr.Textbox(label="MediGuide Bot")

iface2 = gr.Interface(
    fn=run_medical_bot,
    inputs=prefix_question_box,
    outputs=prefix_answer_box,
    title="🩺 MediGuide Medical Chatbot",
    description="Ask health-related questions. Powered by ankraj/mediguide.",
)

# Start serving; share=True additionally exposes the app on a public URL.
iface2.launch(share=True)
# Add the following URL as the endpoint for mistralai_prefix

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://13b6b5feadc45ec9eb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Initializing Gradio interface for QLoRA Model

In [2]:
# Hub repository that provides both the tokenizer and the weights for this section.
QLORA_REPO = "Greyitis/mediguide_new"

tokenizer1 = AutoTokenizer.from_pretrained(QLORA_REPO)
# NOTE(review): the recorded output of this cell lists q_proj/v_proj `base_layer`
# and `lora_A`/`lora_B` tensors as "not used" when initializing MistralForCausalLM.
# That suggests the fine-tuned attention weights may not be applied by this
# loading path — TODO confirm and, if so, load the checkpoint through PEFT.
model1 = AutoModelForCausalLM.from_pretrained(QLORA_REPO)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at Greyitis/mediguide_new were not used when initializing MistralForCausalLM: ['model.layers.0.self_attn.q_proj.base_layer.weight', 'model.layers.0.self_attn.q_proj.base_layer.weight.absmax', 'model.layers.0.self_attn.q_proj.base_layer.weight.quant_map', 'model.layers.0.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.base_layer.weight', 'model.layers.0.self_attn.v_proj.base_layer.weight.absmax', 'model.layers.0.self_attn.v_proj.base_layer.weight.quant_map', 'model.layers.0.self_attn.v_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.layers.1.self_attn.q_proj.base_layer.weight', 'model.layers.1.self_attn.q_proj.base_layer.weight.absmax', 'model.layers.1.self_attn

In [3]:
# Text-generation pipeline around `model1` / `tokenizer1`; `test_questions` calls it.
chatbot1 = pipeline(task="text-generation", model=model1, tokenizer=tokenizer1, device_map="auto")

def format_prompt(question: str) -> str:
    """Wrap a question in the ``[|Human|]`` / ``[|AI|]`` chat template.

    Args:
        question: The user's question.

    Returns:
        The prompt text, ending with an open ``[|AI|]`` turn and a newline.
    """
    template = "[|Human|]\n{}\n[|AI|]\n"
    return template.format(question)

def test_questions(question, max_length: int = 256, **gen_kwargs):
    prompt = format_prompt(question)
    output = chatbot1(
        prompt,
        max_length=max_length,
        **gen_kwargs
    )
    full_text = output[0]["generated_text"]
    response = full_text.split("[|AI|]")[-1].strip()
    return response

def chat_with_bot1(message):
    """Answer `message` via `test_questions` with fixed sampling settings.

    Uses a total length limit of 200 and nucleus sampling (top_p 0.9,
    temperature 0.8).
    """
    sampling_options = {"do_sample": True, "top_p": 0.9, "temperature": 0.8}
    return test_questions(message, max_length=200, **sampling_options)

Device set to use cuda:0


In [4]:
import gradio as gr

# Gradio UI for the QLoRA chatbot. `chat_with_bot1` answers through `chatbot1`,
# which wraps the model loaded from "Greyitis/mediguide_new", so the description
# credits that repository (it previously named gabbar427/mediguide).
iface1 = gr.Interface(
    fn=chat_with_bot1,
    inputs=gr.Textbox(lines=3, label="Your Medical Question"),
    outputs=gr.Textbox(label="MediGuide Bot"),
    title="🩺 MediGuide Medical Chatbot",
    description="Ask health-related questions. Powered by Greyitis/mediguide_new."
)

# Start serving; share=True additionally exposes the app on a public URL.
iface1.launch(share=True)
# Add the following URL as the endpoint for mistralai_qlora

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://58d9d13f78440deab1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


