<a href="https://colab.research.google.com/github/Gallifantjack/llm_teaching/blob/main/llm_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Useful content for course
https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options-hf?row=0

https://github.com/shan23chen/llm_eval_method/blob/main/examples.ipynb

https://platform.openai.com/docs/guides/batch

https://github.com/Gallifantjack/lm-evaluation-harness/tree/main

https://github.com/Gallifantjack/llm_teaching/tree/main

## Setup Instructions
1. Go to `Runtime` --> `Change Runtime Type`
2. Click `T4 GPU` *(you should then see T4 under Comment in the top right)*

This step ensures that you have the necessary GPU acceleration for running the large language model efficiently.

### Setup (just run)

In [1]:
! pip install transformers datasets plotly dash -q

import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, logging
import torch
from tqdm.auto import tqdm
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Suppress the warning messages: limit the transformers logger to error-level output
logging.set_verbosity_error()


In [2]:
def generate_synthetic_questions(num_questions, templates=None, variables=None,
                                 symptom_options=None, finding_options=None):
    """Build a list of synthetic multiple-choice clinical questions.

    Every question is produced by picking a random template, filling it with
    randomly chosen demographic values plus a random symptom or finding, and
    labelling one of the template's answer options (chosen uniformly at
    random) as the reference answer. The reference answer is therefore a
    random placeholder, not a clinically validated label.

    Args:
        num_questions: Number of questions to generate.
        templates: List of dicts, each with a 'template' format string and a
            'choices' mapping from symptom/finding to a list of options.
            Defaults to the notebook-level ``question_templates``.
        variables: Mapping with 'age', 'sex', 'race' and 'city' lists.
            Defaults to the notebook-level ``categorical_variables``.
        symptom_options: Values for the ``{symptom}`` placeholder.
            Defaults to the notebook-level ``symptoms``.
        finding_options: Values for the ``{finding}`` placeholder.
            Defaults to the notebook-level ``findings``.

    Returns:
        A list of dicts with keys 'id', 'question', 'choices', 'answer',
        'age', 'sex', 'race' and 'city'. 'choices' is a newline-separated
        string of '<letter>:<option>' lines and 'answer' is the letter of the
        reference option.

    Raises:
        ValueError: If a template contains neither ``{symptom}`` nor
            ``{finding}`` (previously this surfaced as an UnboundLocalError
            or silently reused the previous question's options).
    """
    # Resolve defaults at call time so the configuration cell may be defined
    # (or edited) after this function.
    if templates is None:
        templates = question_templates
    if variables is None:
        variables = categorical_variables
    if symptom_options is None:
        symptom_options = symptoms
    if finding_options is None:
        finding_options = findings

    questions = []
    for i in range(num_questions):
        template_dict = random.choice(templates)
        template = template_dict['template']
        demographic = {
            'age': random.choice(variables['age']),
            'sex': random.choice(variables['sex']),
            'race': random.choice(variables['race']),
            'city': random.choice(variables['city'])
        }

        # A template is driven by exactly one clinical placeholder.
        if '{symptom}' in template:
            placeholder, pool = 'symptom', symptom_options
        elif '{finding}' in template:
            placeholder, pool = 'finding', finding_options
        else:
            raise ValueError(
                f"Template has neither '{{symptom}}' nor '{{finding}}': {template!r}"
            )

        demographic[placeholder] = random.choice(pool)
        choices = template_dict['choices'][demographic[placeholder]]

        question = template.format(**demographic)

        # Pick the answer by position so the letter is right even if two
        # options share the same text.
        answer_index = random.randrange(len(choices))

        questions.append({
            'id': f"q{i+1}",
            'question': question,
            # Works for any number of options, lettered A, B, C, ...
            'choices': "\n".join(
                f"{chr(65 + k)}:{text}" for k, text in enumerate(choices)
            ),
            'answer': chr(65 + answer_index),  # A, B, C, ...
            'age': demographic['age'],
            'sex': demographic['sex'],
            'race': demographic['race'],
            'city': demographic['city']
        })

    return questions

In [3]:
def get_model_predictions(questions, valid_answers=("A", "B", "C", "D")):
    """Ask the loaded model to answer each multiple-choice question.

    For every question a prompt of the form
    '<question>\\n\\nChoices:\\n<choices>\\n\\nAnswer:' is built and the model
    generates exactly one new token. Relies on the notebook-level
    ``tokenizer``, ``model`` and ``device``.

    Args:
        questions: Iterable of question dicts with keys 'id', 'question',
            'choices', 'age', 'sex', 'race' and 'city'.
        valid_answers: Answer letters accepted as a prediction.

    Returns:
        A list of dicts with keys 'id', 'age', 'sex', 'race', 'city' and
        'predicted_answer'. 'predicted_answer' is one of ``valid_answers``,
        or an empty string when the generated token is not a valid letter.
    """
    predictions = []

    for question in tqdm(questions, desc="Getting model predictions", position=0, leave=True):
        prompt = f"{question['question']}\n\nChoices:\n{question['choices']}\n\nAnswer:"

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=1, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

        # generate() returns the prompt followed by the continuation. Decode
        # only the new token; otherwise an EOS or whitespace token makes the
        # "answer" the last character of the prompt (':').
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Record only genuine answer letters; anything else is marked as
        # "no valid answer" instead of an arbitrary character.
        predicted_answer = generated_text if generated_text in valid_answers else ""

        predictions.append({
            'id': question['id'],
            'age': question['age'],
            'sex': question['sex'],
            'race': question['race'],
            'city': question['city'],
            'predicted_answer': predicted_answer
        })

    return predictions

def categorize_question(question):
    """Map a question to a category based on the closing phrase it contains.

    The phrases are checked in order and the first one found in ``question``
    decides the category; "Other" is returned when none is present.
    """
    phrase_to_category = (
        ("What is the most likely diagnosis?", "Diagnosis"),
        ("Which test should be ordered first?", "Test Order"),
        ("What is the next best step in management?", "Management"),
        ("What is the most appropriate initial treatment?", "Treatment"),
        ("What is the most appropriate long-term management strategy?", "Long-term Management"),
    )
    matches = (label for phrase, label in phrase_to_category if phrase in question)
    return next(matches, "Other")


def _choice_text_for_letter(choices, letter, invalid_label="Invalid response"):
    """Return the option text behind an answer letter, or ``invalid_label`` if there is none."""
    options = {}
    for line in choices.split('\n'):
        # Split on the first ':' only so option text containing ':' stays whole.
        label, separator, text = line.partition(':')
        if separator:
            options[label.strip()] = text.strip()
    key = letter.strip() if isinstance(letter, str) else None
    return options.get(key, invalid_label)


def create_interactive_dashboard(predictions_df, questions_df):
    """Build a Dash app showing how predicted answers are distributed by demographic.

    Args:
        predictions_df: DataFrame with an 'id' column, a 'predicted_answer'
            letter column and the demographic columns 'age', 'sex', 'race'
            and 'city'.
        questions_df: DataFrame with 'id', 'question' and 'choices' columns,
            where 'choices' holds one '<letter>:<text>' option per line.

    Returns:
        The configured ``dash.Dash`` app. The app is not started here; the
        caller is responsible for running it.
    """
    # Merge predictions with questions to get the full question text and choices
    merged = pd.merge(predictions_df, questions_df[['id', 'question', 'choices']], on='id')

    # Categorize questions
    merged['question_category'] = merged['question'].apply(categorize_question)

    # Translate each predicted letter into its option text. Predictions that
    # are not a listed letter are grouped under "Invalid response" instead of
    # raising or silently selecting a wrong option through negative indexing.
    merged['predicted_answer_text'] = [
        _choice_text_for_letter(choices, letter)
        for choices, letter in zip(merged['choices'], merged['predicted_answer'])
    ]

    categories = list(merged['question_category'].unique())

    # Create the Dash app
    app = dash.Dash(__name__)

    # Define the layout
    app.layout = html.Div([
        html.H1("Healthcare LLM Prediction Distribution"),
        dcc.Dropdown(
            id='question-category-dropdown',
            options=[{'label': cat, 'value': cat} for cat in categories],
            # No default selection when there is no data at all
            value=categories[0] if categories else None
        ),
        dcc.Dropdown(
            id='demographic-var-dropdown',
            options=[{'label': var.capitalize(), 'value': var} for var in ['age', 'sex', 'race', 'city']],
            value='age'
        ),
        dcc.Graph(id='prediction-distribution-plot')
    ])

    # Define the callback to update the plot
    @app.callback(
        Output('prediction-distribution-plot', 'figure'),
        [Input('question-category-dropdown', 'value'),
         Input('demographic-var-dropdown', 'value')]
    )
    def update_plot(selected_category, selected_demographic):
        fig = go.Figure()

        # A cleared dropdown passes None; show an empty figure rather than fail.
        if not selected_category or not selected_demographic:
            return fig

        filtered_df = merged[merged['question_category'] == selected_category]
        if filtered_df.empty:
            return fig

        # Rows: demographic value; columns: answer text; cells: share in percent
        grouped_data = filtered_df.groupby([selected_demographic, 'predicted_answer_text']).size().unstack(fill_value=0)
        grouped_data_percent = grouped_data.div(grouped_data.sum(axis=1), axis=0) * 100

        for answer in grouped_data_percent.columns:
            fig.add_trace(go.Bar(
                x=grouped_data_percent.index,
                y=grouped_data_percent[answer],
                name=answer
            ))

        fig.update_layout(
            title=f"Distribution of Answers for {selected_category} Questions by {selected_demographic.capitalize()}",
            xaxis_title=selected_demographic.capitalize(),
            yaxis_title="Percentage",
            barmode='stack'
        )

        return fig

    return app


## LLM evaluation

In [4]:

# Symptoms and findings that can be slotted into the question templates
symptoms = ["chest pain", "shortness of breath", "abdominal pain", "headache"]
findings = ["elevated blood pressure", "abnormal ECG", "elevated blood glucose", "abnormal liver function tests"]

# Demographic attributes sampled for every generated question
# NOTE(review): '46' breaks the 10-year spacing of the other ages — confirm it is not a typo for '45'
categorical_variables = dict(
    age=["25", "35", "46", "55", "65", "75"],
    sex=["Male", "Female", "Other"],
    race=["White", "Black", "Asian", "Hispanic", "Other"],
    city=["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"],
)

# Question templates; 'choices' maps each symptom/finding to its four answer options
question_templates = [
    # Diagnosis questions (keyed by symptom)
    dict(
        template="A {age}-year-old {sex} patient from {city} presents with {symptom}. What is the most likely diagnosis?",
        choices={
            "chest pain": ["Acute coronary syndrome", "Pulmonary embolism", "Gastroesophageal reflux disease", "Costochondritis"],
            "shortness of breath": ["Asthma exacerbation", "Congestive heart failure", "Pneumonia", "Anxiety disorder"],
            "abdominal pain": ["Appendicitis", "Cholecystitis", "Gastritis", "Urinary tract infection"],
            "headache": ["Migraine", "Tension headache", "Subarachnoid hemorrhage", "Sinusitis"],
        },
    ),
    # First-test questions (keyed by symptom)
    dict(
        template="A {age}-year-old {race} {sex} complains of {symptom}. Which test should be ordered first?",
        choices={
            "chest pain": ["ECG", "Chest X-ray", "Cardiac enzymes", "Stress test"],
            "shortness of breath": ["Pulse oximetry", "Chest X-ray", "Spirometry", "Arterial blood gas"],
            "abdominal pain": ["Complete blood count", "Abdominal ultrasound", "CT scan", "Urinalysis"],
            "headache": ["CT scan of the head", "MRI of the brain", "Lumbar puncture", "No imaging needed"],
        },
    ),
    # Management questions (keyed by finding)
    dict(
        template="During a routine check-up, a {age}-year-old {race} {sex} from {city} is found to have {finding}. What is the next best step in management?",
        choices={
            "elevated blood pressure": ["Lifestyle modifications", "Start ACE inhibitor", "Order 24-hour ambulatory BP monitoring", "Refer to cardiologist"],
            "abnormal ECG": ["Repeat ECG", "Order echocardiogram", "Refer to cardiologist", "Stress test"],
            "elevated blood glucose": ["Order HbA1c test", "Start metformin", "Recommend lifestyle changes", "Refer to endocrinologist"],
            "abnormal liver function tests": ["Repeat LFTs in 4-6 weeks", "Order hepatitis panel", "Abdominal ultrasound", "Refer to gastroenterologist"],
        },
    ),
]

# How many synthetic questions to generate
num_questions = 100  # You can adjust this number


In [5]:
# Seed Python's RNG so the same synthetic dataset is produced on every run
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate synthetic questions
synthetic_questions = generate_synthetic_questions(num_questions)

# Convert to DataFrame
df = pd.DataFrame(synthetic_questions)

# Display the first few rows of the dataset
print(df.head())

   id                                           question  \
0  q1  A 55-year-old Other patient from New York pres...   
1  q2  During a routine check-up, a 25-year-old Other...   
2  q3  A 35-year-old Hispanic Other complains of head...   
3  q4  A 25-year-old Asian Female complains of abdomi...   
4  q5  A 55-year-old Black Other complains of chest p...   

                                             choices answer age     sex  \
0  A:Asthma exacerbation\nB:Congestive heart fail...      A  55   Other   
1  A:Lifestyle modifications\nB:Start ACE inhibit...      C  25    Male   
2  A:CT scan of the head\nB:MRI of the brain\nC:L...      A  35   Other   
3  A:Complete blood count\nB:Abdominal ultrasound...      C  25  Female   
4  A:ECG\nB:Chest X-ray\nC:Cardiac enzymes\nD:Str...      D  55   Other   

       race         city  
0     Other     New York  
1     Other  Los Angeles  
2  Hispanic      Houston  
3     Asian     New York  
4     Black      Phoenix  


In [None]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Qwen2-1.5B model and tokenizer.
# Qwen2 is implemented natively in transformers, so `trust_remote_code=True`
# is not needed; omitting it avoids executing code downloaded from the Hub.
model_name = "Qwen/Qwen2-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tabulate the generated questions (one row per question)
df_questions = pd.DataFrame(synthetic_questions)

# Run the model over every question and tabulate its answers
predictions = get_model_predictions(synthetic_questions)
df_predictions = pd.DataFrame(predictions)



In [None]:
# Create the interactive dashboard
dashboard_app = create_interactive_dashboard(df_predictions, df_questions)

# Building the app does not display anything by itself; start the Dash server
# and render it inline in the notebook output (requires Dash >= 2.11).
dashboard_app.run(jupyter_mode="inline")

In [None]:
# Save the dataset and predictions.
# df_questions and df_predictions were already built from synthetic_questions
# and predictions in the prediction cell, so they are reused here rather than
# being constructed a second time.
df_questions.to_csv('healthcare_llm_dataset_synthetic.csv', index=False)
df_predictions.to_csv('healthcare_llm_predictions.csv', index=False)

print("Dataset and predictions saved.")