In [1]:
import polars as pl
from polars import selectors as cs

In [8]:
pl.Config.set_tbl_rows(30)

polars.config.Config

In [3]:
df_datos = pl.read_csv("../data/dataset.csv")

In [9]:
df_subclass_stats = (
    df_datos.group_by("Disorder Subclass", "Status")
    .agg(
        patients = pl.len(),
        mean_age = pl.col("Patient Age").mean(),
    )
    .drop_nulls()
    .filter(pl.col("mean_age") < 7)
)
df_subclass_stats

Disorder Subclass,Status,patients,mean_age
str,str,u32,f64
"""Alzheimer's""","""Alive""",77,6.506667
"""Diabetes""","""Deceased""",921,6.760465
"""Tay-Sachs""","""Deceased""",1433,6.988848
"""Leigh syndrome""","""Alive""",2625,6.902399
"""Leigh syndrome""","""Deceased""",2535,6.971332
"""Hemochromatosis""","""Alive""",668,6.863422
"""Hemochromatosis""","""Deceased""",687,6.88748
"""Mitochondrial myopathy""","""Alive""",2182,6.989816
"""Diabetes""","""Alive""",896,6.986779
"""Alzheimer's""","""Deceased""",75,6.60274


In [7]:
df_subclass_symptoms = (
    df_datos.select(cs.by_name("Disorder Subclass", "Status"), cs.starts_with("Symptom"))
    .group_by("Disorder Subclass", "Status")
    .agg(cs.starts_with("Symptom").mean())
    .filter(
        ~pl.col("Disorder Subclass").is_null(),
        pl.sum_horizontal(cs.starts_with("Symptom")) < 2.5
    )
)
df_subclass_symptoms

Disorder Subclass,Status,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
str,str,f64,f64,f64,f64,f64
"""Mitochondrial myopathy""","""Deceased""",0.5295,0.459929,0.428571,0.404423,0.314115
"""Cancer""","""Deceased""",0.1875,0.045455,0.071429,0.022222,0.0
"""Cancer""","""Alive""",0.222222,0.139535,0.125,0.02381,0.023256
"""Hemochromatosis""","""Alive""",0.403941,0.319865,0.245066,0.153213,0.110922
"""Mitochondrial myopathy""","""Alive""",0.549669,0.485485,0.420181,0.382915,0.328463
"""Tay-Sachs""","""Deceased""",0.47288,0.39429,0.341258,0.273006,0.203019
"""Tay-Sachs""","""Alive""",0.461298,0.360695,0.367888,0.276899,0.189507
"""Hemochromatosis""","""Deceased""",0.381476,0.280702,0.235484,0.177496,0.112721


In [12]:
df_subclass_stats.join(df_subclass_symptoms, on=["Disorder Subclass", "Status"])

Disorder Subclass,Status,patients,mean_age,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
str,str,u32,f64,f64,f64,f64,f64,f64
"""Tay-Sachs""","""Deceased""",1433,6.988848,0.47288,0.39429,0.341258,0.273006,0.203019
"""Hemochromatosis""","""Alive""",668,6.863422,0.403941,0.319865,0.245066,0.153213,0.110922
"""Hemochromatosis""","""Deceased""",687,6.88748,0.381476,0.280702,0.235484,0.177496,0.112721
"""Mitochondrial myopathy""","""Alive""",2182,6.989816,0.549669,0.485485,0.420181,0.382915,0.328463
"""Mitochondrial myopathy""","""Deceased""",2223,6.915385,0.5295,0.459929,0.428571,0.404423,0.314115


In [13]:
df_subclass_stats.join(
    df_subclass_symptoms,
    on=["Disorder Subclass", "Status"],
    how="left",
)

Disorder Subclass,Status,patients,mean_age,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
str,str,u32,f64,f64,f64,f64,f64,f64
"""Alzheimer's""","""Alive""",77,6.506667,,,,,
"""Diabetes""","""Deceased""",921,6.760465,,,,,
"""Tay-Sachs""","""Deceased""",1433,6.988848,0.47288,0.39429,0.341258,0.273006,0.203019
"""Leigh syndrome""","""Alive""",2625,6.902399,,,,,
"""Leigh syndrome""","""Deceased""",2535,6.971332,,,,,
"""Hemochromatosis""","""Alive""",668,6.863422,0.403941,0.319865,0.245066,0.153213,0.110922
"""Hemochromatosis""","""Deceased""",687,6.88748,0.381476,0.280702,0.235484,0.177496,0.112721
"""Mitochondrial myopathy""","""Alive""",2182,6.989816,0.549669,0.485485,0.420181,0.382915,0.328463
"""Diabetes""","""Alive""",896,6.986779,,,,,
"""Alzheimer's""","""Deceased""",75,6.60274,,,,,


In [22]:
df_subclass_stats.join(
    df_subclass_symptoms,
    on=["Disorder Subclass", "Status"],
    how="right",
)

patients,mean_age,Disorder Subclass,Status,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
u32,f64,str,str,f64,f64,f64,f64,f64
2223.0,6.915385,"""Mitochondrial myopathy""","""Deceased""",0.5295,0.459929,0.428571,0.404423,0.314115
,,"""Cancer""","""Deceased""",0.1875,0.045455,0.071429,0.022222,0.0
,,"""Cancer""","""Alive""",0.222222,0.139535,0.125,0.02381,0.023256
668.0,6.863422,"""Hemochromatosis""","""Alive""",0.403941,0.319865,0.245066,0.153213,0.110922
2182.0,6.989816,"""Mitochondrial myopathy""","""Alive""",0.549669,0.485485,0.420181,0.382915,0.328463
1433.0,6.988848,"""Tay-Sachs""","""Deceased""",0.47288,0.39429,0.341258,0.273006,0.203019
,,"""Tay-Sachs""","""Alive""",0.461298,0.360695,0.367888,0.276899,0.189507
687.0,6.88748,"""Hemochromatosis""","""Deceased""",0.381476,0.280702,0.235484,0.177496,0.112721


In [24]:
df_subclass_stats.join(
    df_subclass_symptoms,
    on=["Disorder Subclass", "Status"],
    how="full",
    coalesce=True,
)

Disorder Subclass,Status,patients,mean_age,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
str,str,u32,f64,f64,f64,f64,f64,f64
"""Alzheimer's""","""Alive""",77.0,6.506667,,,,,
"""Diabetes""","""Deceased""",921.0,6.760465,,,,,
"""Tay-Sachs""","""Deceased""",1433.0,6.988848,0.47288,0.39429,0.341258,0.273006,0.203019
"""Leigh syndrome""","""Alive""",2625.0,6.902399,,,,,
"""Leigh syndrome""","""Deceased""",2535.0,6.971332,,,,,
"""Hemochromatosis""","""Alive""",668.0,6.863422,0.403941,0.319865,0.245066,0.153213,0.110922
"""Hemochromatosis""","""Deceased""",687.0,6.88748,0.381476,0.280702,0.235484,0.177496,0.112721
"""Mitochondrial myopathy""","""Alive""",2182.0,6.989816,0.549669,0.485485,0.420181,0.382915,0.328463
"""Diabetes""","""Alive""",896.0,6.986779,,,,,
"""Alzheimer's""","""Deceased""",75.0,6.60274,,,,,


In [27]:
df_subclass_stats.join(
    df_subclass_symptoms,
    on=["Disorder Subclass", "Status"],
    how="semi"
)

Disorder Subclass,Status,patients,mean_age
str,str,u32,f64
"""Tay-Sachs""","""Deceased""",1433,6.988848
"""Hemochromatosis""","""Alive""",668,6.863422
"""Hemochromatosis""","""Deceased""",687,6.88748
"""Mitochondrial myopathy""","""Alive""",2182,6.989816
"""Mitochondrial myopathy""","""Deceased""",2223,6.915385


In [30]:
df_subclass_stats.join(
    df_subclass_symptoms,
    on=["Disorder Subclass", "Status"],
    how="anti"
)

Disorder Subclass,Status,patients,mean_age
str,str,u32,f64
"""Alzheimer's""","""Alive""",77,6.506667
"""Diabetes""","""Deceased""",921,6.760465
"""Leigh syndrome""","""Alive""",2625,6.902399
"""Leigh syndrome""","""Deceased""",2535,6.971332
"""Diabetes""","""Alive""",896,6.986779
"""Alzheimer's""","""Deceased""",75,6.60274
"""Leber's hereditary optic neuro…","""Deceased""",325,6.53871
"""Cystic fibrosis""","""Alive""",1787,6.989863
"""Cystic fibrosis""","""Deceased""",1661,6.908387


In [55]:
df_long = (
    df_datos.group_by("Patient Age", "Disorder Subclass")
    .agg(
        patients=pl.len(),
        mean_rbc=pl.col("Blood cell count (mcL)").mean(),
    )
    .drop_nulls()
    .sort("Patient Age")
)

In [56]:
df_long

Patient Age,Disorder Subclass,patients,mean_rbc
f64,str,u32,f64
0.0,"""Leigh syndrome""",333,4.891196
0.0,"""Leber's hereditary optic neuro…",56,4.894919
0.0,"""Hemochromatosis""",81,4.887838
0.0,"""Cystic fibrosis""",229,4.904924
0.0,"""Mitochondrial myopathy""",269,4.889203
0.0,"""Diabetes""",124,4.890575
0.0,"""Cancer""",3,5.003845
0.0,"""Alzheimer's""",13,4.83956
0.0,"""Tay-Sachs""",159,4.917336
1.0,"""Leigh syndrome""",326,4.897949


In [57]:
df_long.pivot(
    index="Disorder Subclass",
    on="Patient Age",
    values="patients"
)

Disorder Subclass,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""Leigh syndrome""",333,326,316,327,335,328,320,344,300,330,301,309,339,307,316
"""Leber's hereditary optic neuro…",56,34,51,41,30,41,33,44,41,41,52,33,28,43,40
"""Hemochromatosis""",81,78,105,91,82,80,91,77,95,85,81,78,97,71,76
"""Cystic fibrosis""",229,230,187,232,218,212,219,219,202,207,210,209,219,231,203
"""Mitochondrial myopathy""",269,269,297,273,292,282,268,277,259,299,266,283,291,276,241
"""Diabetes""",124,122,111,113,124,118,106,100,114,108,110,113,101,109,119
"""Cancer""",3,1,8,6,13,6,7,4,8,5,4,4,6,8,9
"""Alzheimer's""",13,5,9,13,15,10,16,6,9,8,14,6,10,6,8
"""Tay-Sachs""",159,164,190,164,186,193,178,163,178,203,150,181,185,194,176


In [59]:
df_long2 = (
    df_datos.group_by("Patient Age", "Disorder Subclass", "Status")
    .agg(
        patients=pl.len(),
        mean_rbc=pl.col("Blood cell count (mcL)").mean(),
    )
    .drop_nulls()
    .sort("Patient Age")
)
df_long2

Patient Age,Disorder Subclass,Status,patients,mean_rbc
f64,str,str,u32,f64
0.0,"""Cancer""","""Alive""",2,4.875913
0.0,"""Leigh syndrome""","""Alive""",181,4.898793
0.0,"""Leber's hereditary optic neuro…","""Alive""",30,4.881612
0.0,"""Cancer""","""Deceased""",1,5.25971
0.0,"""Mitochondrial myopathy""","""Deceased""",135,4.893105
0.0,"""Alzheimer's""","""Deceased""",7,4.871179
0.0,"""Mitochondrial myopathy""","""Alive""",134,4.885272
0.0,"""Diabetes""","""Alive""",55,4.872227
0.0,"""Leigh syndrome""","""Deceased""",152,4.882149
0.0,"""Cystic fibrosis""","""Deceased""",109,4.891926


In [62]:
df_long2.pivot(
    index=["Disorder Subclass", "Status"],
    on="Patient Age",
    values="patients"
)

Disorder Subclass,Status,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0
str,str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""Cancer""","""Alive""",2,,4,1,6,4,3,,3,4,3,3,4,2,4
"""Leigh syndrome""","""Alive""",181,154.0,165,173,179,163,160,162.0,153,169,155,165,166,153,161
"""Leber's hereditary optic neuro…","""Alive""",30,17.0,23,18,9,21,10,24.0,22,22,22,16,15,24,25
"""Cancer""","""Deceased""",1,1.0,4,5,7,2,4,4.0,5,1,1,1,2,6,5
"""Mitochondrial myopathy""","""Deceased""",135,136.0,150,151,154,122,132,139.0,132,155,139,132,152,127,124
"""Alzheimer's""","""Deceased""",7,2.0,3,7,7,7,5,3.0,5,5,10,1,5,1,5
"""Mitochondrial myopathy""","""Alive""",134,133.0,147,122,138,160,136,138.0,127,144,127,151,139,149,117
"""Diabetes""","""Alive""",55,57.0,57,47,63,54,54,55.0,62,52,59,59,47,51,60
"""Leigh syndrome""","""Deceased""",152,172.0,151,154,156,165,160,182.0,147,161,146,144,173,154,155
"""Cystic fibrosis""","""Deceased""",109,110.0,94,110,104,96,107,111.0,109,103,95,96,104,116,86


In [63]:
df_long2.pivot(
    index="Disorder Subclass",
    on="Patient Age",
    values="patients",
    aggregate_function="sum"
)

Disorder Subclass,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""Cancer""",3,1,8,6,13,6,7,4,8,5,4,4,6,8,9
"""Leigh syndrome""",333,326,316,327,335,328,320,344,300,330,301,309,339,307,316
"""Leber's hereditary optic neuro…",56,34,51,41,30,41,33,44,41,41,52,33,28,43,40
"""Mitochondrial myopathy""",269,269,297,273,292,282,268,277,259,299,266,283,291,276,241
"""Alzheimer's""",13,5,9,13,15,10,16,6,9,8,14,6,10,6,8
"""Diabetes""",124,122,111,113,124,118,106,100,114,108,110,113,101,109,119
"""Cystic fibrosis""",229,230,187,232,218,212,219,219,202,207,210,209,219,231,203
"""Hemochromatosis""",81,78,105,91,82,80,91,77,95,85,81,78,97,71,76
"""Tay-Sachs""",159,164,190,164,186,193,178,163,178,203,150,181,185,194,176
