You said we didn't have to include everything so I only did Address, Demographics, Medication, Immunization. I didn't use the langchain way to output using JSON mode because it wouldn't work for me. But I did convert to JSON using pydantic and JSON to Parquet using PyArrow

I did make a generic faiss which could work with all 5 but I only did two (immunization, medication) just to show you it works and made it more efficient because I didn't run langchain twice, I just changed the code to what it should be and then printed. One thing that could be inefficient is the turning to Pandas but that was in the spark overflow so I just used that.

In [23]:
!pip install -U sentence-transformers
!pip install faiss-cpu
!pip install PyMuPDF Pillow numpy
!pip install pillow pymupdf numpy
!pip install -qU langchain-openai



In [24]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, DoubleType, StructType, StructField, StringType
from pyspark.sql import SparkSession
from sentence_transformers import SentenceTransformer
from pydantic import BaseModel, Field
from typing import List, Optional
from langchain_openai import ChatOpenAI
import faiss
import numpy as np
import os
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path
from pyarrow import json

In [25]:
class Address(BaseModel):
    city: str = Field(description="The city where the patient lives. Should be under DEMOGRAPHICS header")
    state: str = Field(description="The state where the patient lives. Should be under DEMOGRAPHICS header")
    street_address: str = Field(default=None, description="The street address of the patient. Should be under DEMOGRAPHICS header")
    zip_code: str = Field(default=None, description="The zip code of the patient. Should be under DEMOGRAPHICS header")
    apt_number: Optional[str] = Field(None, description="The apartment number of the patient. Should be under DEMOGRAPHICS header.")

class Demographics(BaseModel):
    name: str = Field(description="The full name of the patient with no numbers and honorifics")
    date_of_birth: str = Field(description="Month/Day/Year Format, don't include the age")
    age: int = Field(description="Just the number of the age of the patient")
    gender: str = Field(description="Either Male, Female, or N/A if there is none")
    address: Address = Field(description="The full address of the patient including street, city, state, zip code, and apt number (if applicable)")
    insurance: str = Field(description="The insurance of the patient. Should be under DEMOGRAPHICS header")
    mrn: str = Field(description="The medical record number (MRN) of the patient. Should be under DEMOGRAPHICS header")

class Medication(BaseModel):
    code: Optional[str] = Field(default=None, description="Closest number from the medical code.")
    description: Optional[str] = Field(default=None, description="Closest description from the medicine name. There could be multiple")

class Immunization(BaseModel):
    code: Optional[str] = Field(default=None, description="Closest number from the medical code.")
    description: Optional[str] = Field(default=None, description="Closest description from the immunization name. There could be multiple. This is NOT COVID PCR since that is a test for COVID. This is a vaccine for some type of disease")
    date: Optional[str] = Field(default=None, description="Month/Day/Year Format")

class PatientRecord(BaseModel):
    demographics: Demographics
    medications: List[Medication]
    immunizations: List[Immunization]

llm = ChatOpenAI(model="gpt-4o-mini", api_key="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")
structured_llm = llm.with_structured_output(PatientRecord)
model = SentenceTransformer("all-MiniLM-L6-v2")

medication_file = "d22592ac-552f-4ecd-a63d-7663d77ce9ba.txt"
immunization_file = "f0f3bc8d-ef38-49ce-a2bd-dfdda982b271.txt"
both_file = "df6b563d-1ff4-4833-9af8-84431e641e9c.txt"

In [26]:
def embed(file_path):
  spark = SparkSession.builder.appName("AAAAAAAAAAAAa").getOrCreate()
  schema = StructType([
      StructField("code", StringType(), True),
      StructField("name", StringType(), True),
      ])

  df = spark.read.csv(file_path, header=True, schema=schema)

  @F.pandas_udf(returnType=ArrayType(DoubleType()))
  def encode(x: pd.Series) -> pd.Series:
      return pd.Series(model.encode(x).tolist())

  return df.withColumn("embedding", encode("name"))

This is still from the spark overflow you gave us but to make it general you  use getattr.

In [27]:
def search_and_match(file_path, data_type, structured_output):
    pain = embed(file_path).toPandas()
    names = pain['name']
    vectors = model.encode(names)

    vector_dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(vector_dimension)
    faiss.normalize_L2(vectors)
    index.add(vectors)

    data_items = getattr(structured_output, data_type)

    number_of_descriptions = len([item for item in data_items if item.description])

    for i in range(number_of_descriptions):
        search_text = data_items[i].description
        search_vector = model.encode(search_text)
        _vector = np.array([search_vector])
        faiss.normalize_L2(_vector)

        k = index.ntotal
        distances, ann = index.search(_vector, k=k)

        results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})

        merge = pd.merge(results, pain, left_on='ann', right_index=True)
        data_items[i].code = (merge['code'].iloc[0])

a little if statement to make it more efficient and also converting to JSON using pydantic and saving it in a folder after the patient's name because the random numbers are too confusing.

In [28]:
def get_structured_output(file):
    with open(file, "r") as f:
        patient_note = f.read()
    return structured_llm.invoke(patient_note)

def record(file):
    structured_output = get_structured_output(file)

    if structured_output.immunizations and structured_output.immunizations[0].description:
        search_and_match("immunizations_assignment_1.csv", "immunizations", structured_output)

    if structured_output.medications and structured_output.medications[0].description:
        search_and_match("medications_assignment_1.csv", "medications", structured_output)

    print(structured_output)

    folder_name = structured_output.demographics.name
    os.makedirs(folder_name, exist_ok=True)
    json_data = structured_output.model_dump_json()
    json_path = os.path.join(folder_name, "JSON.json")
    with open(json_path, "w") as json_file:
        json_file.write(json_data)

    table = json.read_json(json_path)
    parquet_path = os.path.join(folder_name, "PARQUET.parquet")
    pq.write_table(table, parquet_path)

print of a file with only immunization with the correct code (these are before the json/parquet transformation)

In [29]:
record(immunization_file)

demographics=Demographics(name='Jacinto Kris', date_of_birth='08/24/2017', age=2, gender='Male', address=Address(city='Springfield', state='MA', street_address=None, zip_code=None, apt_number=None), insurance='Self-Pay', mrn='f0f3bc8d-ef38-49ce-a2bd-dfdda982b271') medications=[] immunizations=[Immunization(code='140', description='Influenza vaccine', date='08/01/2019'), Immunization(code='33', description='Hepatitis A vaccine', date='01/30/2020')]


print of a file with only medication with the correct code

In [30]:
record(medication_file)

demographics=Demographics(name='José Eduardo Gómez', date_of_birth='6/22/1989', age=30, gender='Male', address=Address(city='Chicopee', state='Massachusetts', street_address='427 Balistreri Way Unit 19', zip_code='01013', apt_number=None), insurance='Guardian', mrn='d22592ac-552f-4ecd-a63d-7663d77ce9ba') medications=[] immunizations=[]


print of a file with both with the correct code

In [31]:
record(both_file)

demographics=Demographics(name='Ms. Brown', date_of_birth='9/29/1982', age=37, gender='Female', address=Address(city='Boston', state='MA', street_address=None, zip_code=None, apt_number=None), insurance='Medicare/Medicaid', mrn='df6b563d-1ff4-4833-9af8-84431e641e9c') medications=[Medication(code='429503', description='Hydrochlorothiazide 12.5 MG daily'), Medication(code='896209', description='Fluticasone/Salmeterol 250/50 mcg inhaler BID')] immunizations=[Immunization(code='140', description='Influenza vaccine', date='3/11/2020')]


In [22]:
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col, explode, array_contains, min, max
from pyspark.sql.types import StructType, StructField, FloatType
from pyspark.sql.window import Window
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("MedicalRecordsAnalysis").getOrCreate()

df = spark.read.parquet("medical_records.parquet")

I don't think there's a way to do this without repeating yourself because the when doesn't allow multiple parameters besides the value and name

In [6]:
filtered_df = df.filter(col("laboratory.covid19.result").contains("Detected"))

df_with_age_groups = filtered_df.withColumn(
    "age_group",
    when((col("demographics.age") >= 71), "71+")
    .when((col("demographics.age") >= 51) & (col("demographics.age") <= 70), "51-70")
    .when((col("demographics.age") >= 31) & (col("demographics.age") <= 50), "31-50")
    .when((col("demographics.age") >= 18) & (col("demographics.age") <= 30), "18-30")
    .when((col("demographics.age") >= 11) & (col("demographics.age") <= 17), "11-17")
    .when((col("demographics.age") >= 6) & (col("demographics.age") <= 10), "6-10")
    .when((col("demographics.age") >= 0) & (col("demographics.age") <= 5), "0-5")
    .otherwise(None)
)

custom_order = when(col("age_group") == "71+", 1) \
    .when(col("age_group") == "51-70", 2) \
    .when(col("age_group") == "31-50", 3) \
    .when(col("age_group") == "18-30", 4) \
    .when(col("age_group") == "11-17", 5) \
    .when(col("age_group") == "6-10", 6) \
    .when(col("age_group") == "0-5", 7)

df_with_age_groups.groupBy("age_group").count().orderBy(custom_order).show()

+---------+-----+
|age_group|count|
+---------+-----+
|      71+|  151|
|    51-70|  291|
|    31-50|  311|
|    18-30|  235|
|    11-17|   91|
|     6-10|   74|
|      0-5|   19|
+---------+-----+



Doesn't this mean that you don't even need the earliest case and last case because all you want is cases where covid is detected or what does cumulative mean?

In [7]:
cumulative_count = filtered_df.count()
print(f"Cumulative case count: {cumulative_count}")

Cumulative case count: 1172


I'm not sure what ICU exactly entails as there wasn't a clear column but I'm just going with Controlled Ventilation because that must mean that you're in the ICU.

In [8]:
icu_df = df.filter(array_contains(col("procedures.code"), "26763009"))

cumulative_count = icu_df.count()
print(f"Cumulative case count: {cumulative_count}")

Cumulative case count: 36


In [9]:
reg_symptoms = filtered_df.select(explode(col("conditions")).alias("condition"))
reg_symptoms_filtered = reg_symptoms.filter(col("condition.description").contains("(finding)"))
reg_count = reg_symptoms_filtered.select(col("condition.description").alias("symptom")) \
   .groupBy("symptom").count().orderBy("count", ascending=False).limit(5)

icu_df = df.filter(array_contains(col("procedures.code"), "26763009"))
icu_symptoms = icu_df.select(explode(col("conditions")).alias("condition"))
icu_symptoms_filtered = icu_symptoms.filter(col("condition.description").contains("(finding)"))
icu_count = icu_symptoms_filtered.select(col("condition.description").alias("symptom")) \
   .groupBy("symptom").count().orderBy("count", ascending=False).limit(5)

print("Top 5 Regular Symptoms:")
reg_count.show(truncate=False)

print("Top 5 ICU Symptoms:")
icu_count.show(truncate=False)

Top 5 Regular Symptoms:
+------------------------+-----+
|symptom                 |count|
+------------------------+-----+
|Fever (finding)         |1040 |
|Cough (finding)         |804  |
|Loss of taste (finding) |586  |
|Fatigue (finding)       |447  |
|Sputum finding (finding)|373  |
+------------------------+-----+

Top 5 ICU Symptoms:
+------------------------------+-----+
|symptom                       |count|
+------------------------------+-----+
|Respiratory distress (finding)|36   |
|Fever (finding)               |31   |
|Cough (finding)               |24   |
|Loss of taste (finding)       |22   |
|Sputum finding (finding)      |17   |
+------------------------------+-----+



In [27]:
medications_df = filtered_df.select(explode(col("medications")).alias("medication"))

medications_descriptions = medications_df.select(col("medication.description").alias("medication_description"))

medications_count = medications_descriptions.groupBy("medication_description").count()

ranked_medications = medications_count.orderBy(col("count").desc())

ranked_medications.show()

+----------------------+-----+
|medication_description|count|
+----------------------+-----+
|  1 ML Epoetin Alfa...|  119|
|  insulin human  is...|   66|
|  Hydrochlorothiazi...|   51|
|  NDA020503 200 ACT...|   51|
|  24 HR Metformin h...|   45|
|  amLODIPine 5 MG /...|   41|
|  Acetaminophen 500...|   39|
|  Simvastatin 10 MG...|   38|
|  Acetaminophen 325...|   34|
|  120 ACTUAT Flutic...|   34|
|  1 ML Enoxaparin s...|   33|
|  0.4 ML Enoxaparin...|   29|
|  Atenolol 50 MG / ...|   26|
|  Digoxin 0.125 MG ...|   24|
|  Verapamil Hydroch...|   24|
|  Warfarin Sodium 5...|   24|
|  vancomycin 1000 M...|   22|
|  piperacillin 4000...|   22|
|  4 ML Norepinephri...|   21|
|  1 ML Vasopressin ...|   21|
+----------------------+-----+
only showing top 20 rows



In [28]:
medications_df = df_with_age_groups.select(explode(col("medications")).alias("medication"), "age_group")

medications_descriptions = medications_df.select(col("medication.description").alias("medication_description"), "age_group")

medications_count = medications_descriptions.groupBy("age_group", "medication_description").count()

window_spec = Window.partitionBy("age_group").orderBy(F.col("count").desc())

ranked_medications = medications_count.withColumn("rank", F.row_number().over(window_spec))

top_medications = ranked_medications.filter(col("rank") <= 3).drop("rank")

top_medications.show(truncate=False)

+---------+-----------------------------------------------------------------------------------------------------------------+-----+
|age_group|medication_description                                                                                           |count|
+---------+-----------------------------------------------------------------------------------------------------------------+-----+
|0-5      |Ibuprofen 100 MG Oral Tablet                                                                                     |3    |
|0-5      |Acetaminophen 21.7 MG/ML / Dextromethorphan Hydrobromide 1 MG/ML / doxylamine succinate 0.417 MG/ML Oral Solution|1    |
|0-5      |Amoxicillin 250 MG Oral Capsule                                                                                  |1    |
|11-17    |NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT Metered Dose Inhaler                                               |7    |
|11-17    |120 ACTUAT Fluticasone propionate 0.044 MG/ACTUAT Metered Dose In

In [50]:
df_with_blood_pressure = filtered_df.withColumn(
    "systolic_value", F.col("vitals.current.blood_pressure.systolic.value")
).withColumn(
    "diastolic_value", F.col("vitals.current.blood_pressure.diastolic.value")
)

df_with_blood_pressure = df_with_blood_pressure.withColumn(
    "hypertension", F.when(
        (F.col("systolic_value") >= 140) | (F.col("diastolic_value") >= 90), True
    ).otherwise(False)
)

df_with_blood_pressure = filtered_df.withColumn(
    "systolic_value", F.col("vitals.current.blood_pressure.systolic.value")
).withColumn(
    "diastolic_value", F.col("vitals.current.blood_pressure.diastolic.value")
).withColumn(
    "hypertension", F.when(
        (F.col("systolic_value") >= 140) | (F.col("diastolic_value") >= 90), True
    ).otherwise(False)
).withColumn(
    "blood_pressure_category", F.when(
        (F.col("systolic_value") < 90) & (F.col("diastolic_value") < 60), "Low"
    ).when(
        (F.col("systolic_value") >= 90) & (F.col("systolic_value") < 120) &
        (F.col("diastolic_value") >= 60) & (F.col("diastolic_value") < 80), "Normal"
    ).when(
        (F.col("systolic_value") >= 120) & (F.col("systolic_value") < 140) &
        (F.col("diastolic_value") >= 80) & (F.col("diastolic_value") < 90), "High"
    ).otherwise("Hypertension")
)

result = df_with_blood_pressure.groupBy("blood_pressure_category").count()

result.show()

+-----------------------+-----+
|blood_pressure_category|count|
+-----------------------+-----+
|                   High|  285|
|           Hypertension|  622|
|                 Normal|  265|
+-----------------------+-----+



In [18]:
df_with_bmi = filtered_df.withColumn(
    "height_in_meters", F.col("vitals.baseline.height.value") / 100
).withColumn(
    "bmi_value", F.col("vitals.current.weight.value") / (F.col("height_in_meters") ** 2)
).withColumn(
    "obesity_category", F.when(
        F.col("bmi_value") < 18.5, "Underweight"
    ).when(
        (F.col("bmi_value") >= 18.5) & (F.col("bmi_value") <= 24.9), "Normal weight"
    ).when(
        (F.col("bmi_value") >= 25) & (F.col("bmi_value") <= 29.9), "Overweight"
    ).otherwise("Obesity")
)

obesity_result = df_with_bmi.groupBy("obesity_category").count()

obesity_result.show()

+----------------+-----+
|obesity_category|count|
+----------------+-----+
|      Overweight|  183|
|         Obesity|  834|
|   Normal weight|   69|
|     Underweight|   86|
+----------------+-----+



1st Analysis: Are you more likely to get COVID-19 if you are overweight/obese? This is important because if you are at risk to get COVID-19, then you might consider getting to a normal weight if you are more likely to get COVID.

 It doesn't seem like weight plays a role, if anything it may play a slight role because all the percentages are essentially the same except for maybe 15 and 22.

In [21]:
covid_not_detected_df = df.filter(~F.col("laboratory.covid19.result").contains("Detected"))

def classify_bmi(df):
    return df.withColumn(
        "height_in_meters", F.col("vitals.baseline.height.value") / 100
    ).withColumn(
        "bmi_value", F.col("vitals.current.weight.value") / (F.col("height_in_meters") ** 2)
    ).withColumn(
        "obesity_category", F.when(
            F.col("bmi_value") < 18.5, "Underweight"
        ).when(
            (F.col("bmi_value") >= 18.5) & (F.col("bmi_value") <= 24.9), "Normal weight"
        ).when(
            (F.col("bmi_value") >= 25) & (F.col("bmi_value") <= 29.9), "Overweight"
        ).otherwise("Obesity")
    )

df_with_bmi_covid = classify_bmi(filtered_df)
df_with_bmi_no_covid = classify_bmi(covid_not_detected_df)

df_with_bmi_covid = df_with_bmi_covid.withColumn("covid_status", F.lit("Detected"))
df_with_bmi_no_covid = df_with_bmi_no_covid.withColumn("covid_status", F.lit("Not Detected"))

obesity_comparison_covid = df_with_bmi_covid.groupBy("covid_status", "obesity_category").count()
obesity_comparison_no_covid = df_with_bmi_no_covid.groupBy("covid_status", "obesity_category").count()

total_covid = df_with_bmi_covid.count()
total_no_covid = df_with_bmi_no_covid.count()

obesity_percentage_covid = obesity_comparison_covid.withColumn(
    "percentage", (F.col("count") / total_covid) * 100
)

obesity_percentage_no_covid = obesity_comparison_no_covid.withColumn(
    "percentage", (F.col("count") / total_no_covid) * 100
)

print("Obesity Percentage for People with COVID Detected:")
obesity_percentage_covid.orderBy("obesity_category").show()

print("Obesity Percentage for People with COVID Not Detected:")
obesity_percentage_no_covid.orderBy("obesity_category").show()

Obesity Percentage for People with COVID Detected:
+------------+----------------+-----+-----------------+
|covid_status|obesity_category|count|       percentage|
+------------+----------------+-----+-----------------+
|    Detected|   Normal weight|   69|5.887372013651877|
|    Detected|         Obesity|  834|  71.160409556314|
|    Detected|      Overweight|  183|15.61433447098976|
|    Detected|     Underweight|   86|7.337883959044368|
+------------+----------------+-----+-----------------+

Obesity Percentage for People with COVID Not Detected:
+------------+----------------+-----+------------------+
|covid_status|obesity_category|count|        percentage|
+------------+----------------+-----+------------------+
|Not Detected|   Normal weight|   12| 3.821656050955414|
|Not Detected|         Obesity|  219|  69.7452229299363|
|Not Detected|      Overweight|   71|22.611464968152866|
|Not Detected|     Underweight|   12| 3.821656050955414|
+------------+----------------+-----+---------

2nd Analysis: Are old people more likely to have COVID? This is important because this is recognizing a risk factor. It does seem to have some truth even though 71+ is lower, 51-70 is a 58% increase from 24 to 38.

In [57]:
def add_age_groups(df):
    return df.withColumn(
        "age_group",
        when((col("demographics.age") >= 71), "71+")
        .when((col("demographics.age") >= 51) & (col("demographics.age") <= 70), "51-70")
        .when((col("demographics.age") >= 31) & (col("demographics.age") <= 50), "31-50")
        .when((col("demographics.age") >= 18) & (col("demographics.age") <= 30), "18-30")
        .when((col("demographics.age") >= 11) & (col("demographics.age") <= 17), "11-17")
        .when((col("demographics.age") >= 6) & (col("demographics.age") <= 10), "6-10")
        .when((col("demographics.age") >= 0) & (col("demographics.age") <= 5), "0-5")
        .otherwise(None)
    )

df_with_age_groups_covid = add_age_groups(filtered_df)
df_with_age_groups_no_covid = add_age_groups(covid_not_detected_df)

df_with_age_groups_covid = df_with_age_groups_covid.withColumn("covid_status", F.lit("Detected"))
df_with_age_groups_no_covid = df_with_age_groups_no_covid.withColumn("covid_status", F.lit("Not Detected"))

combined_df = df_with_age_groups_covid.union(df_with_age_groups_no_covid)

age_group_covid_counts = combined_df.groupBy("age_group", "covid_status").count()

total_counts_by_covid_status = combined_df.groupBy("covid_status").count()

total_counts_by_covid_status = total_counts_by_covid_status.withColumnRenamed("count", "total_count")

age_group_covid_percentage = age_group_covid_counts.join(
    total_counts_by_covid_status, on="covid_status", how="inner"
).withColumn(
    "percentage", (col("count") / col("total_count")) * 100
)

custom_order = when(col("age_group") == "71+", 1) \
    .when(col("age_group") == "51-70", 2) \
    .when(col("age_group") == "31-50", 3) \
    .when(col("age_group") == "18-30", 4) \
    .when(col("age_group") == "11-17", 5) \
    .when(col("age_group") == "6-10", 6) \
    .when(col("age_group") == "0-5", 7)

age_group_covid_percentage_sorted = age_group_covid_percentage.orderBy(custom_order)

age_group_covid_percentage_sorted.show()

+------------+---------+-----+-----------+------------------+
|covid_status|age_group|count|total_count|        percentage|
+------------+---------+-----+-----------+------------------+
|    Detected|      71+|  151|       1172|  12.8839590443686|
|Not Detected|      71+|   50|        314| 15.92356687898089|
|    Detected|    51-70|  291|       1172|24.829351535836178|
|Not Detected|    51-70|  121|        314| 38.53503184713376|
|    Detected|    31-50|  311|       1172|26.535836177474405|
|Not Detected|    31-50|   73|        314|23.248407643312103|
|    Detected|    18-30|  235|       1172| 20.05119453924915|
|Not Detected|    18-30|   44|        314|14.012738853503185|
|    Detected|    11-17|   91|       1172| 7.764505119453926|
|Not Detected|    11-17|   14|        314|  4.45859872611465|
|    Detected|     6-10|   74|       1172| 6.313993174061433|
|Not Detected|     6-10|    9|        314|2.8662420382165608|
|    Detected|      0-5|   19|       1172| 1.621160409556314|
|Not Det

3rd Analysis: If having immunizations changed anything with having a lower rate of ICU which is important because it promotes vaccination but it doesn't seem to have changed anything.

In [14]:
people_with_immunizations = filtered_df.filter(
    F.size(F.col("immunizations")) > 0
).count()

people_without_immunizations = filtered_df.filter(
    F.size(F.col("immunizations")) == 0
).count()

total_people = filtered_df.count()

icu_with_immunizations = icu_df.filter(
    F.size(F.col("immunizations")) > 0
).count()

icu_without_immunizations = icu_df.filter(
    F.size(F.col("immunizations")) == 0
).count()

percentage_icu_with_immunizations = (icu_with_immunizations / people_with_immunizations) * 100 if people_with_immunizations > 0 else 0
percentage_icu_without_immunizations = (icu_without_immunizations / people_without_immunizations) * 100 if people_without_immunizations > 0 else 0

print(f"Total People: {total_people}")
print(f"People with Immunizations: {people_with_immunizations}")
print(f"People without Immunizations: {people_without_immunizations}")
print(f"ICU Rate for People with Immunizations: {percentage_icu_with_immunizations:.2f}%")
print(f"ICU Rate for People without Immunizations: {percentage_icu_without_immunizations:.2f}%")

36
Total People: 1172
People with Immunizations: 394
People without Immunizations: 778
ICU Rate for People with Immunizations: 2.28%
ICU Rate for People without Immunizations: 3.47%
