In [5]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from skrub import TableReport

import os, glob
import gc

In [2]:
# Base directories, relative to the notebook's working directory.
COMP_DATA_BASE = os.path.join("..", "data", "comp")
PREP_DATA_BASE = os.path.join("..", "data", "preprocessed")

# Individual files inside the competition data directory.
TRAIN_PATH = os.path.join(COMP_DATA_BASE, "train.csv")
TEST_PATH = os.path.join(COMP_DATA_BASE, "test.csv")
DATA_DICT_PATH = os.path.join(COMP_DATA_BASE, "data_dictionary.csv")
SAMPLE_PATH = os.path.join(COMP_DATA_BASE, "sample_submission.csv")

# Feature names are every column of the test file except "ID".
# Only the header is needed here, so n_rows=0 avoids parsing the whole file;
# the column names and their order are the same as with a full read.
test_columns = pl.read_csv(TEST_PATH, n_rows=0).columns
features = [name for name in test_columns if name != "ID"]

# The full training table is loaded eagerly; later cells work on this frame.
train_ds = pl.read_csv(TRAIN_PATH)

In [3]:
# Read the data dictionary once and derive every view from the same frame
# (the file was previously re-read and re-parsed for each of the three views).
data_dict = pl.read_csv(DATA_DICT_PATH)

# True for dictionary rows whose "variable" is one of the test-set feature names.
is_feature = pl.col("variable").is_in(features)

# Dictionary rows (and plain name lists) for features typed "Numerical".
num_features_dict = data_dict.filter(is_feature & (pl.col("type") == "Numerical"))
num_features = num_features_dict.get_column("variable").to_list()

# Dictionary rows (and plain name lists) for features typed "Categorical".
cat_features_dict = data_dict.filter(is_feature & (pl.col("type") == "Categorical"))
cat_features = cat_features_dict.get_column("variable").to_list()

# Dictionary rows that are NOT test-set features; in the rendered output these
# are `efs` and `efs_time`.
misc_columns = data_dict.filter(~is_feature)

display(num_features_dict)
display(cat_features_dict)
display(misc_columns)


variable,description,type,values
str,str,str,str
"""hla_match_c_high""","""Recipient / 1st donor allele l…","""Numerical""",
"""hla_high_res_8""","""Recipient / 1st donor allele-l…","""Numerical""",
"""hla_low_res_6""","""Recipient / 1st donor antigen-…","""Numerical""",
"""hla_high_res_6""","""Recipient / 1st donor allele-l…","""Numerical""",
"""hla_high_res_10""","""Recipient / 1st donor allele-l…","""Numerical""",
…,…,…,…
"""comorbidity_score""","""Sorror comorbidity score""","""Numerical""",
"""karnofsky_score""","""KPS at HCT""","""Numerical""",
"""hla_low_res_8""","""Recipient / 1st donor antigen-…","""Numerical""",
"""hla_match_drb1_high""","""Recipient / 1st donor allele l…","""Numerical""",


variable,description,type,values
str,str,str,str
"""dri_score""","""Refined disease risk index""","""Categorical""","""['Intermediate' 'High' 'N/A - …"
"""psych_disturb""","""Psychiatric disturbance""","""Categorical""","""['Yes' 'No' nan 'Not done']"""
"""cyto_score""","""Cytogenetic score""","""Categorical""","""['Intermediate' 'Favorable' 'P…"
"""diabetes""","""Diabetes""","""Categorical""","""['No' 'Yes' nan 'Not done']"""
"""tbi_status""","""TBI""","""Categorical""","""['No TBI' 'TBI + Cy +- Other' …"
…,…,…,…
"""tce_div_match""","""T-cell epitope matching""","""Categorical""","""['Permissive mismatched' 'Bi-d…"
"""donor_related""","""Related vs. unrelated donor""","""Categorical""","""['Unrelated' 'Related' 'Multip…"
"""melphalan_dose""","""Melphalan dose (mg/m^2)""","""Categorical""","""['N/A, Mel not given' 'MEL' na…"
"""cardiac""","""Cardiac""","""Categorical""","""['No' 'Yes' nan 'Not done']"""


variable,description,type,values
str,str,str,str
"""efs""","""Event-free survival""","""Categorical""","""['Event' 'Censoring']"""
"""efs_time""","""Time to event-free survival, m…","""Numerical""",


In [6]:
# Build a skrub TableReport over the full training frame. The cell output below
# logs "Processing column 60 / 60" and shows sample rows, per-column statistics
# (dtype, nulls, unique values, mean/std/min/median/max) and a table of column
# pairs ranked by Cramér's V.
my_tablereport = TableReport(train_ds)
# Bare last expression so the notebook renders the report as this cell's output.
my_tablereport

Processing column  60 / 60


ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,renal_issue,pulm_severe,prim_disease_hct,hla_high_res_6,cmv_status,hla_high_res_10,hla_match_dqb1_high,tce_imm_match,hla_nmdp_6,hla_match_c_low,rituximab,hla_match_drb1_low,hla_match_dqb1_low,prod_type,cyto_score_detail,conditioning_intensity,ethnicity,year_hct,obesity,mrd_hct,in_vivo_tcd,tce_match,hla_match_a_high,hepatic_severe,donor_age,prior_tumor,hla_match_b_low,peptic_ulcer,age_at_hct,hla_match_a_low,gvhd_proph,rheum_issue,sex_match,hla_match_b_high,race_group,comorbidity_score,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0.0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,No,No,No,IEA,6.0,+/+,,2.0,,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2016.0,No,,Yes,,2.0,No,,No,2.0,No,9.942,2.0,FKalone,No,M-F,2.0,More than one race,0.0,90.0,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1.0,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,No,No,AML,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2008.0,No,Positive,No,Permissive,2.0,No,72.29,No,2.0,No,43.705,2.0,Other GVHD Prophylaxis,No,F-F,2.0,Asian,3.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2.0,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,HIS,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2019.0,No,,Yes,,2.0,No,,No,2.0,No,33.997,2.0,Cyclophosphamide alone,No,F-M,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,ALL,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2009.0,No,Positive,No,Permissive,2.0,No,29.23,No,2.0,No,43.245,2.0,FK+ MMF +- others,No,M-M,2.0,White,0.0,90.0,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4.0,High,No,,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,No,No,No,MPN,6.0,+/+,10.0,2.0,,5.0,2.0,No,2.0,2.0,PB,,MAC,Hispanic or Latino,2018.0,No,,Yes,,2.0,No,56.81,No,2.0,No,29.74,2.0,TDEPLETION +- other,No,M-F,2.0,American Indian or Alaska Native,1.0,90.0,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
28795.0,Intermediate - TED AML case <missing cytogenetics,,Favorable,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,No,No,,ALL,6.0,-/-,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2018.0,No,Negative,Yes,Fully matched,2.0,No,24.212,Yes,2.0,No,51.136,2.0,FK+ MTX +- others(not MMF),,M-F,2.0,More than one race,0.0,,,Bi-directional non-permissive,,"N/A, Mel not given",8.0,,2.0,No,10.0,0.0,18.633
28796.0,High,No,Poor,Yes,1.0,4.0,No TBI,No,5.0,Peripheral blood,No,No,No,AML,3.0,-/+,6.0,2.0,G/G,4.0,1.0,No,2.0,2.0,PB,TBD,RIC,Hispanic or Latino,2017.0,No,Positive,No,,1.0,No,30.77,No,1.0,No,18.075,2.0,Cyclophosphamide +- others,No,M-F,1.0,Native Hawaiian or other Pacific Islander,3.0,90.0,No,GvH non-permissive,Related,"N/A, Mel not given",6.0,Yes,1.0,Yes,8.0,1.0,4.892
28797.0,TBD cytogenetics,,Poor,,2.0,8.0,No TBI,,6.0,Peripheral blood,No,,,IPA,6.0,-/+,10.0,2.0,G/G,6.0,2.0,,2.0,2.0,PB,Poor,MAC,Not Hispanic or Latino,2018.0,No,,No,GvH non-permissive,2.0,No,22.627,No,2.0,,51.005,2.0,FK+ MMF +- others,,M-F,2.0,Native Hawaiian or other Pacific Islander,5.0,90.0,,GvH non-permissive,Unrelated,"N/A, Mel not given",8.0,,2.0,No,10.0,0.0,23.157
28798.0,N/A - non-malignant indication,No,Poor,No,1.0,4.0,No TBI,No,3.0,Peripheral blood,No,,,IPA,3.0,+/+,5.0,1.0,P/P,3.0,1.0,No,1.0,1.0,PB,,NMA,Not Hispanic or Latino,2018.0,,,Yes,,1.0,No,58.074,Yes,1.0,,0.044,1.0,Cyclophosphamide alone,No,M-M,1.0,Black or African-American,1.0,90.0,No,Permissive mismatched,Related,MEL,4.0,No,1.0,No,5.0,0.0,52.351

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,ID,Int64,0 (0.0%),28800 (100.0%),14400.0,8310.0,0.0,14400.0,28800.0
1,dri_score,String,154 (0.5%),11 (< 0.1%),,,,,
2,psych_disturb,String,2062 (7.2%),3 (< 0.1%),,,,,
3,cyto_score,String,8068 (28.0%),7 (< 0.1%),,,,,
4,diabetes,String,2119 (7.4%),3 (< 0.1%),,,,,
5,hla_match_c_high,Float64,4620 (16.0%),3 (< 0.1%),1.76,0.432,0.0,2.0,2.0
6,hla_high_res_8,Float64,5829 (20.2%),7 (< 0.1%),6.88,1.56,2.0,8.0,8.0
7,tbi_status,String,0 (0.0%),8 (< 0.1%),,,,,
8,arrhythmia,String,2202 (7.6%),3 (< 0.1%),,,,,
9,hla_low_res_6,Float64,3270 (11.4%),5 (< 0.1%),5.14,1.21,2.0,6.0,6.0

Column 1,Column 2,Cramér's V
efs,efs_time,0.905
hla_low_res_6,hla_match_drb1_low,0.879
graft_type,prod_type,0.866
hla_low_res_6,hla_low_res_8,0.848
hla_match_drb1_low,hla_low_res_8,0.848
hla_high_res_8,hla_high_res_6,0.809
hla_low_res_6,hla_match_b_low,0.805
hla_high_res_8,hla_high_res_10,0.772
hla_match_c_low,hla_low_res_8,0.77
hla_high_res_6,hla_match_a_high,0.765
