In [1]:
import polars as pl
from polars import selectors as cs

In [2]:
# Load the raw dataset once; every later cell derives its result from this
# frame without mutating it. The path is relative to the working directory.
df_datos = pl.read_csv(source="../../data/dataset.csv")

In [3]:
# 1
# Number of rows per institute, most frequent first.
# Rows with a null institute name and the "Not applicable" placeholder are
# excluded from the ranking.
# NOTE(review): the rendered output lists both "VA Hospital" and "Va Hospital"
# as separate groups -- confirm whether these should be merged upstream.
(
    df_datos
    .group_by("Institute Name")
    .len("n")
    .drop_nulls()
    .filter(pl.col("Institute Name") != "Not applicable")
    # group_by does not guarantee output order, so sorting on "n" alone leaves
    # tied counts in arbitrary order; the name breaks ties deterministically.
    .sort(["n", "Institute Name"], descending=[True, False])
)

Institute Name,n
str,u32
"""Franciscan Children's Hospita…",363
"""Carney Hospital""",357
"""New England Medical Center""",350
"""Hebrew Rehabilitation Center""",349
"""VA Hospital""",344
…,…
"""Jewish Memorial Hospital""",315
"""Beth Israel Deaconess Medical …",315
"""Lemuel Shattuck Hospital""",313
"""Va Hospital""",312


In [4]:
# 2 (option A)
# Percentage of rows per disorder subclass whose heart rate equals "Normal",
# computed in a single aggregation. The denominator is the full group size
# (pl.len()), so rows with a null heart rate still count towards the total.
(
    df_datos
    .group_by("Disorder Subclass")
    .agg(
        (pl.col("Heart Rate (rates/min").eq("Normal").sum() / pl.len() * 100)
        .round(2)
        .alias("percent_normal")
    )
    # Removes the group formed by rows with a null subclass.
    .drop_nulls()
    .sort(by="Disorder Subclass")
)

Disorder Subclass,percent_normal
str,f64
"""Alzheimer's""",47.37
"""Cancer""",49.48
"""Cystic fibrosis""",45.48
"""Diabetes""",46.78
"""Hemochromatosis""",45.76
"""Leber's hereditary optic neuro…",42.59
"""Leigh syndrome""",46.34
"""Mitochondrial myopathy""",47.33
"""Tay-Sachs""",46.38


In [5]:
# 2 (option B)
# Same percentage as option A, built from two explicit count tables:
#   df_counts_total  -> rows per disorder subclass
#   df_counts_normal -> rows per disorder subclass with a "Normal" heart rate
df_counts_total = (
    df_datos
    .group_by("Disorder Subclass")
    .len("n_total")
    .drop_nulls()
    .sort("Disorder Subclass")
)
df_counts_normal = (
    df_datos
    .filter(pl.col("Heart Rate (rates/min") == "Normal")
    .group_by("Disorder Subclass")
    .len("n_normal")
    .drop_nulls()
    .sort("Disorder Subclass")
)

(
    # Match the two tables on the subclass key rather than by row position:
    # a subclass with no "Normal" rows is absent from df_counts_normal, and a
    # positional (horizontal) concat would then pair counts with the wrong
    # subclass.
    df_counts_total
    .join(df_counts_normal, on="Disorder Subclass", how="left")
    # A subclass missing from df_counts_normal has zero "Normal" rows.
    .with_columns(pl.col("n_normal").fill_null(0))
    .with_columns(
        percent_normal = (pl.col("n_normal") / pl.col("n_total") * 100).round(2)
    )
    .sort("Disorder Subclass")
)

Disorder Subclass,n_total,n_normal,percent_normal
str,u32,u32,f64
"""Alzheimer's""",152,72,47.37
"""Cancer""",97,48,49.48
"""Cystic fibrosis""",3448,1568,45.48
"""Diabetes""",1817,850,46.78
"""Hemochromatosis""",1355,620,45.76
"""Leber's hereditary optic neuro…",648,276,42.59
"""Leigh syndrome""",5160,2391,46.34
"""Mitochondrial myopathy""",4405,2085,47.33
"""Tay-Sachs""",2833,1314,46.38


In [6]:
# 3
# Mean patient age per gender, restricted to rows where "Paternal gene" is "Yes".
(
    df_datos
    .filter(pl.col("Paternal gene") == "Yes")
    .group_by("Gender")
    .agg(
        mean_age = pl.col("Patient Age").mean(),
    )
    # Drops the null-gender group and any group whose mean age is null.
    .drop_nulls()
    # group_by does not guarantee output order; sort so the table is
    # reproducible across runs.
    .sort("Gender")
)

Gender,mean_age
str,f64
"""Male""",6.992573
"""Ambiguous""",7.005852
"""Female""",6.895551
