<a href="https://colab.research.google.com/github/Gallifantjack/llm_teaching/blob/main/llm_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Useful content for course
https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options-hf?row=0

https://github.com/shan23chen/llm_eval_method/blob/main/examples.ipynb

https://platform.openai.com/docs/guides/batch

https://github.com/Gallifantjack/lm-evaluation-harness/tree/main

https://github.com/Gallifantjack/llm_teaching/tree/main

### Setup Instructions
1. Go to `Runtime` --> `Change Runtime Type`
2. Click `T4 GPU` *(you should then see T4 under Comment in the top right)*

This step ensures that you have the necessary GPU acceleration for running the large language model efficiently.

In [None]:
! pip install transformers datasets -q

import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML


# Remove pandas' truncation limits so that rendered DataFrames show the full
# cell contents, every column and every row.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# Suppress the warning messages
# `set_verbosity_error` belongs to the Transformers logging utility, not to the
# standard-library `logging` module. The name has to be imported explicitly,
# otherwise this line fails with NameError on a fresh kernel.
from transformers import logging

logging.set_verbosity_error()


In [None]:

# Demographic attributes sampled for every synthetic question. All values are
# strings because they are substituted into the question templates as text.
# NOTE(review): '46' breaks the otherwise 10-year spacing of the age values —
# confirm whether '45' was intended.
categorical_variables = dict(
    age=['25', '35', '46', '55', '65', '75'],
    sex=['Male', 'Female', 'Other'],
    race=['White', 'Black', 'Asian', 'Hispanic', 'Other'],
    city=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
)

# Question templates. Each entry pairs a format string with a mapping from the
# clinical condition (the value substituted for '{symptom}' or '{finding}') to
# the answer options offered for that condition.
question_templates = [
    # Diagnosis question: mentions age, sex and city, plus a symptom.
    dict(
        template="A {age}-year-old {sex} patient from {city} presents with {symptom}. What is the most likely diagnosis?",
        choices={
            "chest pain": [
                "Acute coronary syndrome",
                "Pulmonary embolism",
                "Gastroesophageal reflux disease",
                "Costochondritis",
            ],
            "shortness of breath": [
                "Asthma exacerbation",
                "Congestive heart failure",
                "Pneumonia",
                "Anxiety disorder",
            ],
            "abdominal pain": [
                "Appendicitis",
                "Cholecystitis",
                "Gastritis",
                "Urinary tract infection",
            ],
            "headache": [
                "Migraine",
                "Tension headache",
                "Subarachnoid hemorrhage",
                "Sinusitis",
            ],
        },
    ),
    # First-test question: mentions age, race and sex, plus a symptom.
    dict(
        template="A {age}-year-old {race} {sex} complains of {symptom}. Which test should be ordered first?",
        choices={
            "chest pain": [
                "ECG",
                "Chest X-ray",
                "Cardiac enzymes",
                "Stress test",
            ],
            "shortness of breath": [
                "Pulse oximetry",
                "Chest X-ray",
                "Spirometry",
                "Arterial blood gas",
            ],
            "abdominal pain": [
                "Complete blood count",
                "Abdominal ultrasound",
                "CT scan",
                "Urinalysis",
            ],
            "headache": [
                "CT scan of the head",
                "MRI of the brain",
                "Lumbar puncture",
                "No imaging needed",
            ],
        },
    ),
    # Management question: mentions all four demographics, plus a finding.
    dict(
        template="During a routine check-up, a {age}-year-old {race} {sex} from {city} is found to have {finding}. What is the next best step in management?",
        choices={
            "elevated blood pressure": [
                "Lifestyle modifications",
                "Start ACE inhibitor",
                "Order 24-hour ambulatory BP monitoring",
                "Refer to cardiologist",
            ],
            "abnormal ECG": [
                "Repeat ECG",
                "Order echocardiogram",
                "Refer to cardiologist",
                "Stress test",
            ],
            "elevated blood glucose": [
                "Order HbA1c test",
                "Start metformin",
                "Recommend lifestyle changes",
                "Refer to endocrinologist",
            ],
            "abnormal liver function tests": [
                "Repeat LFTs in 4-6 weeks",
                "Order hepatitis panel",
                "Abdominal ultrasound",
                "Refer to gastroenterologist",
            ],
        },
    ),
]

# Conditions substituted for the '{symptom}' placeholder. Each one is also used
# as a lookup key into the chosen template's 'choices' mapping.
symptoms = [
    'chest pain',
    'shortness of breath',
    'abdominal pain',
    'headache',
]

# Conditions substituted for the '{finding}' placeholder, used the same way.
findings = [
    'elevated blood pressure',
    'abnormal ECG',
    'elevated blood glucose',
    'abnormal liver function tests',
]

# How many synthetic questions to generate; adjust as needed.
num_questions = 100


In [None]:
def generate_synthetic_questions(num_questions, seed=None):
    """Build synthetic multiple-choice questions with randomly sampled demographics.

    For every question a template is drawn from ``question_templates``, the
    demographic values are drawn from ``categorical_variables`` and a clinical
    condition is drawn from ``symptoms`` or ``findings`` depending on which
    placeholder the template contains.

    Args:
        num_questions: Number of questions to generate.
        seed: Optional seed for reproducible output. When ``None`` (default)
            the module-level ``random`` state is used; otherwise a private
            ``random.Random(seed)`` generator is used, so the global random
            state is left untouched.

    Returns:
        A list of dicts with the keys ``id`` (``"q1"``, ``"q2"``, ...),
        ``question``, ``choices`` (one ``"<letter>:<option>"`` line per
        option, joined with newlines), ``answer`` (the letter of one option)
        and the sampled ``age``, ``sex``, ``race`` and ``city``.

    Raises:
        ValueError: If the chosen template contains neither a ``{symptom}``
            nor a ``{finding}`` placeholder.

    Note:
        ``answer`` is picked uniformly at random among the options; it is not
        a clinically validated answer key. All four demographic values are
        recorded for every question, whether or not the chosen template's
        text mentions each of them.
    """
    rng = random if seed is None else random.Random(seed)

    questions = []
    for i in range(num_questions):
        template_dict = rng.choice(question_templates)
        template = template_dict['template']
        demographic = {
            'age': rng.choice(categorical_variables['age']),
            'sex': rng.choice(categorical_variables['sex']),
            'race': rng.choice(categorical_variables['race']),
            'city': rng.choice(categorical_variables['city'])
        }

        # Resolve the clinical placeholder on every iteration. Without the
        # explicit else-branch a template lacking both placeholders would
        # silently reuse the options of the previous question.
        if '{symptom}' in template:
            placeholder, condition = 'symptom', rng.choice(symptoms)
        elif '{finding}' in template:
            placeholder, condition = 'finding', rng.choice(findings)
        else:
            raise ValueError(
                f"Template has neither a '{{symptom}}' nor a '{{finding}}' placeholder: {template!r}"
            )
        choices = template_dict['choices'][condition]

        question = template.format(**demographic, **{placeholder: condition})

        # Letter every option (A, B, C, ...) instead of assuming exactly four,
        # so the prompt text and the answer letter always agree.
        labels = [chr(ord('A') + k) for k in range(len(choices))]
        answer_index = rng.randrange(len(choices))

        questions.append({
            'id': f"q{i+1}",
            'question': question,
            'choices': "\n".join(f"{label}:{text}" for label, text in zip(labels, choices)),
            'answer': labels[answer_index],
            'age': demographic['age'],
            'sex': demographic['sex'],
            'race': demographic['race'],
            'city': demographic['city']
        })

    return questions

# Generate the synthetic question set (a list of dicts, one per question).
synthetic_questions = generate_synthetic_questions(num_questions)

# Tabular view of the same records, convenient for inspection.
df = pd.DataFrame(synthetic_questions)

# Preview the first rows using the notebook's rich rendering rather than
# print(), which only emits the plain-text repr of the frame.
display(df.head())

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint under evaluation: Qwen2-1.5B, fetched by name via from_pretrained.
# NOTE(review): trust_remote_code=True permits executing Python code shipped in
# the model repository — confirm it is actually required for this checkpoint.
model_name = "Qwen/Qwen2-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model = model.to(device)


In [None]:
def get_model_predictions(questions):
    """Ask the loaded model to answer each question with a single new token.

    Each question is rendered as ``"<question>\\n\\nChoices:\\n<choices>\\n\\nAnswer:"``,
    tokenized with the module-level ``tokenizer``, moved to ``device`` and
    passed to ``model.generate`` with ``max_new_tokens=1``.

    Args:
        questions: Iterable of dicts providing the keys ``id``, ``question``,
            ``choices``, ``age``, ``sex``, ``race`` and ``city``.

    Returns:
        A list with one dict per question holding ``id``, ``age``, ``sex``,
        ``race``, ``city`` and ``predicted_answer``. ``predicted_answer`` is
        the last character of the whitespace-stripped text decoded from the
        newly generated token, or ``""`` when that token decodes to nothing
        visible (for example a special token).
    """
    predictions = []

    for question in tqdm(questions, desc="Getting model predictions", position=0, leave=True):
        prompt = f"{question['question']}\n\nChoices:\n{question['choices']}\n\nAnswer:"

        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # Remember where the prompt ends so that only the continuation is decoded.
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=1, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

        # The returned sequence starts with the prompt tokens. Decoding all of
        # it and taking the last character would report the prompt's trailing
        # ":" as the prediction whenever the new token decodes to nothing.
        new_tokens = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        predicted_answer = generated_text[-1:]  # "" when nothing visible was generated

        predictions.append({
            'id': question['id'],
            'age': question['age'],
            'sex': question['sex'],
            'race': question['race'],
            'city': question['city'],
            'predicted_answer': predicted_answer
        })

    return predictions

def analyze_results(predictions):
    """Plot and tabulate the predicted answers for each demographic attribute.

    For each of ``age``, ``sex``, ``race`` and ``city`` this displays a count
    plot of ``predicted_answer`` grouped by that attribute, prints a caption,
    and displays an HTML table of the same counts (attribute values as rows,
    predicted answers as columns, missing combinations filled with 0).

    Args:
        predictions: Records (list of dicts) containing the four attribute
            keys and ``predicted_answer``.

    Returns:
        None. Output is produced through ``display`` and ``print``.
    """
    results = pd.DataFrame(predictions)

    for attribute in ('age', 'sex', 'race', 'city'):
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.countplot(x=attribute, hue='predicted_answer', data=results, ax=ax)
        ax.set_title(f"Distribution of Predicted Answers by {attribute}")
        ax.set_ylabel("Count")
        ax.legend(title="Predicted Answer")
        fig.tight_layout()
        display(fig)
        # Close the figure once shown so figures do not accumulate in the loop.
        plt.close(fig)

        print(f"\nDistribution of predictions by {attribute}:")
        pair_counts = results.groupby([attribute, 'predicted_answer']).size()
        count_table = pair_counts.unstack(fill_value=0)
        display(HTML(count_table.to_html()))


In [None]:
# Query the model once per synthetic question; `predictions` is a list of
# dicts carrying each question's id, demographics and the predicted answer.
predictions = get_model_predictions(synthetic_questions)

# For each demographic variable, display a count plot and a table of the
# predicted answers.
analyze_results(predictions)

In [None]:
# Materialise the generated questions and the model's predictions as DataFrames.
df_questions = pd.DataFrame(synthetic_questions)
df_predictions = pd.DataFrame(predictions)

# Write both tables as CSV files (relative paths, so they land in the current
# working directory) without the DataFrame index column.
df_questions.to_csv('healthcare_llm_dataset_synthetic.csv', index=False)
df_predictions.to_csv('healthcare_llm_predictions.csv', index=False)

print("Dataset and predictions saved.")