In [1]:
!pip install -q transformers peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import gc

def clear_gpu_memory():
    torch.cuda.empty_cache()
    gc.collect()

def load_model(base_model_name, peft_model_name):
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="auto"
    )
    model = PeftModel.from_pretrained(
        base_model,
        peft_model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return model

def load_tokenizer(base_model_name):
    return AutoTokenizer.from_pretrained(base_model_name)

def generate_response(model, tokenizer, conversation_history, max_length=100):
    inputs = tokenizer(conversation_history, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=len(inputs["input_ids"][0]) + max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response.strip()

def test_model(model, tokenizer, test_cases):
    for i, patient_input in enumerate(test_cases):
        print(f"Test Case {i + 1}: Patient: {patient_input}")
        conversation_history = "The following is a conversation between a Patient and a Doctor.\n\nDoctor: Hello, how can I help you today?\nPatient: "
        conversation_history += f"{patient_input}\nDoctor:"

        doctor_response = generate_response(model, tokenizer, conversation_history)
        print(f"Doctor: {doctor_response}\n")

def main():
    base_model_name = "distilbert/distilgpt2"
    peft_model_name = "Melbi/bert-large_finetune-melbi"

    print("Loading model...")
    model = load_model(base_model_name, peft_model_name)

    print("Loading tokenizer...")
    tokenizer = load_tokenizer(base_model_name)

    print("Model and tokenizer loaded successfully.")
    print("Starting model testing.")

    # Example test cases for the patient-doctor conversation
    test_cases = [
        "I've been feeling really tired lately.",
        "I have a headache that won't go away.",
        "Can you help me with my allergies?",
        "What should I do if I have a fever?",
        "I need advice on managing my diabetes."
    ]

    test_model(model, tokenizer, test_cases)

    del model, tokenizer
    clear_gpu_memory()
    print("Testing ended. GPU memory cleared.")

if __name__ == "__main__":
    main()


Loading model...
Loading tokenizer...
Model and tokenizer loaded successfully.
Starting model testing.
Test Case 1: Patient: I've been feeling really tired lately.
Doctor: Hey, now, how can I help you today?
Patient: Well, I've lost my job because of your problems.
Doctor: Well, I have.
Doctor: Well, I can do my best to do my best to help you.
Patient: I've lost my job because of your problems.
Doctor: No, I can't.
Doctor: I can't.
Patient: Well, I can't.
Doctor: I can't.

Test Case 2: Patient: I have a headache that won't go away.
Doctor: You have to be patient first.
Doctor: Well, it's true, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true.
Doctor: Well, it's true

Test Case 3: Patient: Can you help me with my allergies?
Doctor: It works. I can see that I have a problem with myself. My allergies are not as great as I think. I