In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from vllm import LLM, SamplingParams

In [None]:
# Read the semicolon-separated CSV export and preview its first three rows.
df = pd.read_csv(
    'Augeninnendruck.csv',
    delimiter=';',  # `delimiter` is pandas' alias for `sep`
)
df.head(n=3)

In [None]:
# Load the instruct model with half of its 32k context window.
# `max_model_len` is a token count and must be an integer: `32768/2` is true
# division and would pass the float 16384.0, so floor division is used instead.
llm = LLM("mistralai/Mistral-7B-Instruct-v0.1", max_model_len=32768 // 2)

In [None]:
# Fetch the tokenizer from the loaded vLLM engine. It is handed to
# `query_template`, which calls its `apply_chat_template` method to render the
# few-shot conversation into a single prompt string.
tokenizer = llm.get_tokenizer()

In [None]:
def query_template(df: pd.DataFrame, query: str, tokenizer) -> str:
    """Render a few-shot chat prompt asking the model for pandas code.

    The conversation consists of three worked user/assistant examples followed
    by ``query`` as the final user turn. The first user message embeds
    ``df.head(5)`` and ``df.columns`` so the model can see the data layout.

    Args:
        df: Dataframe whose first rows and column index are embedded in the
            prompt.
        query: Natural-language question placed in the final user message.
        tokenizer: Object exposing ``apply_chat_template(messages,
            add_generation_prompt=..., tokenize=...)``.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns when called with
        ``tokenize=False`` and ``add_generation_prompt=True`` (the rendered
        prompt text).
    """
    llm_chat = [
        {
            "role": "user",
            "content": f"""Your task is to write python code to return a list of all unique patients for the given dataframe using the pandas and matplotlib libraries.
The dataset has an event-based structure, every line has the same number of columns, so each patient has multiple rows.
Here are the first dataframe rows:
{df.head(5)}

The dataframe has the following columns:
{df.columns}
You can allocate records to patients with the `MPINumber` column.
The values for findings are found in the `Value` column. 
"""
        },
        {
            "role": "assistant",
            # The example answer must use the same patient identifier the
            # instruction above names (`MPINumber`); otherwise the few-shot
            # examples contradict each other.
            "content": """```python
df['MPINumber'].unique()
```
"""
        },
        {
            "role": "user",
            "content": "Select all records with the diagnosis N18"
        },
        {
            "role": "assistant",
            "content": """```python
df[(df['Finding'] == 'Diagnose') & (df['Value'].str.contains('N18'))]
```
"""
        },
        {
            "role": "user",
            "content": "Build a new dataframe with patients which were diagnosed with N18"
        },
        {
            "role": "assistant",
            "content": """```python
diagnosed_patients = df[(df['Finding'] == 'Diagnose') & (df['Value'].str.contains('N18'))]['MPINumber'].unique()
df_n18 = df[df['MPINumber'].isin(diagnosed_patients)]
```
"""
        },
        {
            "role": "user",
            "content": query
        }
    ]

    # tokenize=False asks for the rendered text rather than token ids;
    # add_generation_prompt=True appends the marker that opens the assistant turn.
    return tokenizer.apply_chat_template(llm_chat, add_generation_prompt=True, tokenize=False)

## RQ - A
How many people got diagnosed with H36 over time?

In [None]:
# RQ-A: build the few-shot prompt for the H36 patient count question.
query = query_template(
    df,
    "How many patients got diagnosed with H36? Consider that each patient can have multiple findings.",
    tokenizer,
)

# Near-deterministic sampling, capped at 400 generated tokens.
llm_outputs = llm.generate(query, SamplingParams(temperature=0.1, max_tokens=400))
for completion in llm_outputs:
    # One print call, newline-separated: same three output lines as before.
    print('Output:', completion.outputs[0].text, '#########', sep='\n')

## RQ - B
What was the age and gender distribution of patients with the disease?

In [None]:
# RQ-B: ask for a pie chart of the gender distribution.
# The previous version appended `data_discription_str`, which is never defined
# in this notebook and raised NameError on a fresh kernel. `query_template`
# already embeds the dataframe preview and column list, so nothing is appended.
query = query_template(df, "Given the dataframe above. Create a pie chart for the Gender distribution.", tokenizer)

# Near-deterministic sampling, capped at 400 generated tokens.
llm_outputs = llm.generate(query, SamplingParams(temperature=0.1, max_tokens=400))
for out in llm_outputs:
    print('Output:')
    print(out.outputs[0].text)
    print('#########')

## RQ - C
What were the five most common secondary diagnoses?

In [None]:
# RQ-C: ask for a plot of the five most common secondary diagnoses.
# Uses `query_template`, the prompt builder defined in this notebook; the
# previously called `query_template_chat` does not exist and raised NameError.
query = query_template(df, "Plot the five most common secondary diagnosis", tokenizer)

# Near-deterministic sampling, capped at 400 generated tokens.
llm_outputs = llm.generate(query, SamplingParams(temperature=0.1, max_tokens=400))
for out in llm_outputs:
    print('Output:')
    print(out.outputs[0].text)
    print('#########')

## RQ - D
How many HBA1C values are there for each patient?

In [None]:
# RQ-D: ask how many HBA1C values exist per diagnosed patient.
# Uses `query_template`, the prompt builder defined in this notebook; the
# previously called `query_template_chat` does not exist and raised NameError.
query = query_template(df, "How many the HBA1C values has each diagnosed patient?", tokenizer)

# Near-deterministic sampling, capped at 400 generated tokens.
llm_outputs = llm.generate(query, SamplingParams(temperature=0.1, max_tokens=400))
for out in llm_outputs:
    print('Output:')
    print(out.outputs[0].text)
    print('#########')