In [1]:
# (Re)load the autoreload extension so edited project modules are picked up
# without restarting the kernel.
%reload_ext autoreload
#%load_ext autoreload
# Mode 2: reload all imported modules before executing each cell.
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

import datetime
import pandas as pd
#import modin.pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from tqdm import tqdm


#from sklearnex import patch_sklearn
#patch_sklearn()

from sklearn.preprocessing import FunctionTransformer, StandardScaler, QuantileTransformer, PowerTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import copy
import gc
import os

import dill as pickle
import joblib

#os.chdir("T:\\laupodteam\\AIOS\\Bram\\notebooks\\code_dev\\celldyn_embedder\\sandbox")
#os.chdir("/media/UMCU/notebooks/code_dev/celldyn_embedder/sandbox")
# Use seaborn's "whitegrid" style for the plots in this notebook.
sns.set_style("whitegrid")


# CUSTOM CLASSES
from hembedder.prepping import quality
from hembedder.prepping import transformer
from hembedder.prepping import recombinator
from hembedder.prepping import imputer

# Run-time switches for this notebook.
# NOTE(review): write_to_disk and read_from_disk are not referenced in the
# cells visible here -- confirm they are still needed.
write_to_disk = False
read_from_disk = True
# get_ranges: when True, per-measurement quantiles are computed and written to a CSV file.
get_ranges=False
# get_combos: when True, every combination of mode flags is scanned for
# measurement columns that are mostly NaN (this is where reti_mode_cols is defined).
get_combos=True


#import ray
#ray.init()

* filter based on group flags
* filter based on suspect flags
* filter based on range
* apply scaling
* correct for diurnal rhythm
* impute

There are no special NaN modes; the NaNs are the result of running in non-reti mode. The affected measurements are:

```
'c_b_retc',
'c_b_pretc',
'c_b_irf',
'c_b_pHPO',
'c_b_pHPR',
'c_b_HDW',
'c_b_MCVr',
'c_b_MCHr',
'c_b_MCHCr',
'c_b_rbcimn',
'c_b_rbcicv',
'c_b_rbcfmn',
'c_b_rbcfcv'
```

In [2]:
# "/media/UMCU/data/CellDyn/celldyn.feather"
# Remember where the notebook was started *before* changing directory.
# `current_directory` is the PARENT of the start-up working directory; it is
# used later to locate assets/CelldynParams.xlsx.  The guard keeps the value
# stable when this cell is re-run after the chdir below (otherwise it would be
# recomputed relative to the data directory).
if "current_directory" not in globals():
    current_directory = os.path.dirname(os.path.realpath("."))

# Directory holding celldyn.feather; all outputs written with relative paths
# end up here as well.  Set the CELLDYN_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original network share.
data_directory = os.environ.get("CELLDYN_DATA_DIR",
                                "T:\\laupodteam\\AIOS\\Bram\\data\\CellDyn")
os.chdir(data_directory)

In [3]:
# Load the raw export from the working directory and lower-case all column
# names so that later cells can address columns consistently.
celldyn = pd.read_feather("celldyn.feather")
celldyn.columns = [column_name.lower() for column_name in celldyn.columns]
# Last expression of the cell: number of unreachable objects collected.
gc.collect()

67

In [4]:
# Range filter on c_b_ht, c_b_hb and c_b_rbco, followed by removal of a fixed
# set of measurement columns (silently skipped when a column is absent).
# Written as one chained expression: the original dropped columns in place on
# the result of .query(), which mutates a derived frame and can trigger
# pandas' SettingWithCopyWarning.
celldyn = (
    celldyn
    .query('c_b_ht>0.2 and c_b_hb>0.2 and c_b_rbco>0.1 and c_b_rbco<100 and c_b_hb<20')
    .drop(columns=['c_b_pblst', 'c_b_pnrbc', 'c_b_pbnd',
                   'c_b_blst', 'c_b_vlym', 'c_b_nrbc',
                   'c_b_bnd', 'c_b_ig',
                   'c_b_pig', 'c_b_pvlym'],
          errors='ignore')
)

In [5]:
# --- Column bookkeeping ------------------------------------------------------
# Non-measurement columns that are carried along with every sample.
meta_cols = ['asp_dt', 'analyzer', 'studyid_alle_celldyn',
             'time_to_measurement', 'gender', 'age', 'afname_dt',
             'day_of_year', 'week_of_year', 'year', 'first_day',
             'draw_hour', 'meas_hour']


def _cols_with(token):
    """Return the names of all current `celldyn` columns that contain `token`."""
    return [name for name in celldyn.columns if token in name]


count_columns = _cols_with('c_cnt')
meas_columns = _cols_with('c_b') + ['plt']
mode_columns = _cols_with('c_m')
susp_columns = _cols_with('c_s')
alert_columns = _cols_with('alrt')
fail_columns = _cols_with('fail')
all_cols = (meta_cols + meas_columns + mode_columns + susp_columns
            + alert_columns + count_columns + fail_columns)
# Everything that is not covered by one of the groups above.
other_cols = [name for name in celldyn.columns if name not in all_cols]

# --- Type fixes and derived time features ------------------------------------
# NOTE(review): the byte -> str decode presumably only works once per load;
# re-running this cell on already decoded strings should be verified.
celldyn['gender'] = celldyn['gender'].str.decode(encoding='latin1')
celldyn['age'] = celldyn['age'].astype(int)
celldyn['meas_hour'] = celldyn['asp_dt'].dt.hour
celldyn['draw_hour'] = celldyn['afname_dt'].dt.hour
# Whole hours (floored) between afname_dt and asp_dt.
elapsed_seconds = (celldyn['asp_dt'] - celldyn['afname_dt']).dt.total_seconds()
celldyn['time_to_measurement'] = elapsed_seconds // 3600

# Keep rows whose floored delay lies strictly between -4 and 10 hours; rows
# with a missing delay fail both comparisons and are dropped as well.
cond = (celldyn['time_to_measurement'] < 10) & (celldyn['time_to_measurement'] > -4)
celldyn = celldyn.loc[cond]

def next_weekday(d, weekday):
    """Return midnight of the day in `d`'s own week that falls on `weekday`.

    Despite the name, the result is not necessarily later than `d`: the shift
    is ``weekday - d.weekday()`` days, which is negative whenever `weekday`
    lies earlier in the week than `d` (e.g. ``weekday=0`` yields the Monday on
    or before `d`).

    Parameters
    ----------
    d : datetime-like
        Object supporting ``weekday()``, addition of a ``timedelta`` and
        ``replace(hour=..., minute=..., second=..., microsecond=...)``.
    weekday : int
        Target day using the ``datetime.weekday()`` convention (0 = Monday).

    Returns
    -------
    Same type as `d`, shifted to the requested weekday with hour, minute,
    second and microsecond set to zero.
    """
    shift = datetime.timedelta(days=weekday - d.weekday())
    target_day = d + shift
    return target_day.replace(hour=0, minute=0, second=0, microsecond=0)

# Calendar features, computed with vectorised .dt accessors instead of a
# Python-level .apply per row.
#
# day_of_year / week_of_year / year are derived from asp_dt for rows with
# c_mode_xlyse == 0 only; index alignment on assignment leaves NaN in all
# other rows.  (The former `celldyn['day_of_year'] = np.nan` was overwritten
# immediately and has been removed.)
# The int64 casts reproduce the dtype the row-wise version produced; they rely
# on asp_dt being non-null, which the time_to_measurement filter guarantees.
xlyse_off_asp_dt = celldyn.loc[celldyn.c_mode_xlyse == 0, 'asp_dt']
celldyn['day_of_year'] = xlyse_off_asp_dt.dt.dayofyear.astype('int64')
# ISO week number, identical to Timestamp.weekofyear.
celldyn['week_of_year'] = xlyse_off_asp_dt.dt.isocalendar().week.astype('int64')
celldyn['year'] = xlyse_off_asp_dt.dt.year.astype('int64')

# first_day: midnight of the Monday of the week that contains afname_dt, i.e.
# the vectorised form of next_weekday(x, 0) (shift first, then truncate).
# Unlike next_weekday, normalize() also zeroes any sub-microsecond part.
days_since_monday = pd.to_timedelta(celldyn['afname_dt'].dt.weekday, unit='D')
celldyn['first_day'] = (celldyn['afname_dt'] - days_since_monday).dt.normalize()

In [6]:
# Every column group except the count columns, in the same order as all_cols.
all_meta_cols = [*meta_cols, *meas_columns, *mode_columns,
                 *susp_columns, *alert_columns, *fail_columns]

In [7]:
# Export the 1 %, 2.5 %, 25 %, 50 %, 75 %, 97.5 % and 99 % quantiles of every
# measurement column (one row per column) to ranges.csv.
if get_ranges:
    # Output column label -> quantile level; a single source of truth, so the
    # labels and the requested levels cannot drift apart.
    range_quantiles = {'q01': 0.01,
                       'q025': 0.025,
                       'q25': 0.25,
                       'q50': 0.5,
                       'q75': 0.75,
                       'q975': 0.975,
                       'q99': 0.99}
    quantile_levels = list(range_quantiles.values())

    range_tuples = []
    for c in tqdm(meas_columns):
        quantile_values = celldyn[c].quantile(quantile_levels)
        range_tuples.append(dict(zip(range_quantiles, quantile_values)))
    range_df = pd.DataFrame(data=range_tuples, index=meas_columns)
    # Written relative to the working directory (the data directory set when
    # the notebook starts), like every other artefact of this notebook; the
    # former absolute path "/media/UMCU/data/CellDyn/ranges.csv" does not
    # exist on the Windows share the notebook changes into.
    range_df.round(3).to_csv("ranges.csv", sep=';')

In [8]:
# Determine which measurement columns are (almost) always missing when the
# sample was measured with c_mode_rtc == 0, and blank them for those rows.
if get_combos:
    # One string per row encoding the values of all mode columns.
    celldyn['mode_combos'] = celldyn[mode_columns].apply(
        lambda x: ",".join([str(_x) for _x in x]), axis=1)

    # For every observed mode combination, collect the measurement columns
    # that are NaN in at least 80 % of its rows.  Keys look like
    # "c_mode_a:0,c_mode_b:1,...".
    nan_modes = {}
    for c in tqdm(celldyn['mode_combos'].unique()):
        condition = celldyn['mode_combos'] == c
        meas_df = celldyn.loc[condition, meas_columns]
        named_c = ",".join([f"{mode_columns[i]}:{str(int(float(v)))}"
                            for i, v in enumerate(c.split(","))])
        nan_list = meas_df.columns[meas_df.isna().mean() >= 0.8].tolist()
        if len(nan_list) > 0:
            nan_modes[named_c] = nan_list

    # Only combinations with c_mode_rtc switched off count.  The previous test
    # `'c_mode_rtc' in k` was always true, because every key names every mode
    # column, so NaN columns caused by any other mode leaked into the list.
    # sorted() makes the column order reproducible between runs.
    reti_mode_cols = sorted({_v for k, v in nan_modes.items()
                             if 'c_mode_rtc:0' in k.split(",")
                             for _v in v})
else:
    # Without the scan, fall back to the affected measurements documented in
    # the introduction of this notebook (lower-cased like the column names),
    # so that later cells using reti_mode_cols do not fail with a NameError.
    reti_mode_cols = [c for c in ('c_b_retc', 'c_b_pretc', 'c_b_irf',
                                  'c_b_phpo', 'c_b_phpr', 'c_b_hdw',
                                  'c_b_mcvr', 'c_b_mchr', 'c_b_mchcr',
                                  'c_b_rbcimn', 'c_b_rbcicv',
                                  'c_b_rbcfmn', 'c_b_rbcfcv')
                      if c in celldyn.columns]

# Values in these columns are not meaningful for rows with c_mode_rtc == 0.
if reti_mode_cols:
    celldyn.loc[celldyn.c_mode_rtc == 0, reti_mode_cols] = np.nan

100%|██████████| 11/11 [00:07<00:00,  1.44it/s]


# Pipeline attempt

In [34]:
# Building blocks of the cleaning pipeline, all from hembedder.prepping.
# NOTE(review): current_directory is the parent of the directory the notebook
# was started in, so the parameter workbook is expected at
# <parent>/assets/CelldynParams.xlsx -- confirm this matches the repo layout.
qc_param_file = os.path.join(current_directory, "assets/CelldynParams.xlsx")
qc = quality.QcControl(param_file=qc_param_file, cols_include=meas_columns)

CellDynTransformer = transformer.CellDynTrans(
    log_scale=[],
    ord_scale=[],
    remove_original_columns=False,
)

# Settings for the imputation step, gathered in one place for easy tuning.
imputer_settings = dict(
    imputer='forest',
    backend='miceforest',
    meas_cols=meas_columns,
    iterations=5,
    num_estimators=100,
    num_match_candidates=7,
    save_all_iterations=False,
    data_subset=0.25,
    synthesize_working_data=True,
)
imputer_clf = imputer.Imputer(**imputer_settings)

recombiner = recombinator.CellDynRecombinator(scaler=None)

In [10]:
# Build the "MAR" subset: rows whose only missing values are those explained
# by the sample not having been run with c_mode_rtc switched on.
not_reti_mode_cols = [c for c in meas_columns if c not in reti_mode_cols]

is_non_reti = (celldyn.c_mode_rtc == 0).to_numpy()
is_reti = (celldyn.c_mode_rtc == 1).to_numpy()
idx_nreti = celldyn.index[is_non_reti]
idx_reti = celldyn.index[is_reti]

# c_mode_rtc == 0 rows: complete in every column outside reti_mode_cols.
complete_non_reti = celldyn.loc[idx_nreti, not_reti_mode_cols].notna().all(axis=1)
keep_fullcase = idx_nreti[complete_non_reti]
# c_mode_rtc == 1 rows: complete in every measurement column.
complete_reti = celldyn.loc[idx_reti, meas_columns].notna().all(axis=1)
keep_reti_only_nans = idx_reti[complete_reti]

# The union goes through a set, so the row order of celldyn_MAR follows set
# iteration order rather than the order of celldyn.
mar_rows = list(set(keep_fullcase).union(keep_reti_only_nans))
selected_cols = meta_cols + meas_columns
celldyn_MAR = celldyn.loc[mar_rows, selected_cols]
# NOTE: from here on celldyn holds only meta and measurement columns;
# c_mode_rtc is gone, so this cell cannot be re-run without reloading the data.
celldyn = celldyn[selected_cols]

In [35]:
# Fit the cleaning pipeline on the MAR subset and store the transformed data.
#
# scikit-learn's Pipeline does not clone its steps: fitting the pipeline fits
# the very objects that were passed in.  qc, CellDynTransformer, imputer_clf
# and recombiner are also used by the other pipelines in this notebook, so
# this pipeline gets its own deep copies.  Without them, a later fit of
# another pipeline would silently overwrite the fitted state held by
# CleanPipeMAR before it is persisted.
CleanPipeMAR = Pipeline([('qc', copy.deepcopy(qc)),
                         ('scaling', copy.deepcopy(CellDynTransformer)),
                         ('imputer', copy.deepcopy(imputer_clf)),
                         ('recombiner', copy.deepcopy(recombiner)),
                         ('standardisation', PowerTransformer())
                        ], verbose=True)


celldyn_MAR_transformed = CleanPipeMAR.fit_transform(celldyn_MAR[meas_columns])
# Assumes the pipeline keeps the number and order of rows -- the DataFrame
# constructor raises if the row counts differ.
celldyn_MAR_transformed_df = pd.DataFrame(data=celldyn_MAR_transformed,
                                          index=celldyn_MAR.index,
                                          columns=CleanPipeMAR.named_steps['recombiner'].out_columns)

remainder_cols = [c for c in celldyn_MAR.columns if c not in celldyn_MAR_transformed_df.columns]
# Put the metadata back next to the transformed measurements; feather needs a
# default index, hence reset_index() (the old index becomes a column).
celldyn_MAR_transformed_df = pd.concat([celldyn_MAR[meta_cols], celldyn_MAR_transformed_df], axis=1)
celldyn_MAR_transformed_df.reset_index().to_feather("celldyn_MAR_transformed_df_updated.feather")


# Free the large intermediates; the fitted pipeline itself is kept.
del celldyn_MAR_transformed_df, celldyn_MAR, celldyn_MAR_transformed
gc.collect()


[Pipeline] ................ (step 1 of 5) Processing qc, total=   4.0s
[Pipeline] ........... (step 2 of 5) Processing scaling, total=   2.5s
[Pipeline] .......... (step 3 of 5) Processing imputer, total=150.0min
[Pipeline] ........ (step 4 of 5) Processing recombiner, total= 1.3min
[Pipeline] ... (step 5 of 5) Processing standardisation, total= 6.4min


188

In [36]:
# Fit the cleaning pipeline on the full data set and store the transformed data.
#
# scikit-learn's Pipeline does not clone its steps: fitting the pipeline fits
# the very objects that were passed in.  qc, CellDynTransformer, imputer_clf
# and recombiner are also used by the other pipelines in this notebook, so
# this pipeline gets its own deep copies.  Without them, fitting here would
# silently overwrite the fitted state of a pipeline trained earlier on the
# same step objects.
CleanPipeFULL = Pipeline([('qc', copy.deepcopy(qc)),
                          ('scaling', copy.deepcopy(CellDynTransformer)),
                          ('imputer', copy.deepcopy(imputer_clf)),
                          ('recombiner', copy.deepcopy(recombiner)),
                          ('standardisation', PowerTransformer())
                         ], verbose=True)


celldyn_FULL_transformed = CleanPipeFULL.fit_transform(celldyn[meas_columns])
# Assumes the pipeline keeps the number and order of rows -- the DataFrame
# constructor raises if the row counts differ.
celldyn_FULL_transformed_df = pd.DataFrame(data=celldyn_FULL_transformed,
                                           index=celldyn.index,
                                           columns=CleanPipeFULL.named_steps['recombiner'].out_columns)

remainder_cols = [c for c in celldyn.columns if c not in celldyn_FULL_transformed_df.columns]
# Put the metadata back next to the transformed measurements; feather needs a
# default index, hence reset_index() (the old index becomes a column).
celldyn_FULL_transformed_df = pd.concat([celldyn[meta_cols], celldyn_FULL_transformed_df], axis=1)
celldyn_FULL_transformed_df.reset_index().to_feather("celldyn_FULL_transformed_df_updated.feather")

# Free the large intermediates; the fitted pipeline itself is kept.
del celldyn_FULL_transformed_df, celldyn_FULL_transformed
gc.collect()

[Pipeline] ................ (step 1 of 5) Processing qc, total=   4.0s
[Pipeline] ........... (step 2 of 5) Processing scaling, total=   2.4s
[Pipeline] .......... (step 3 of 5) Processing imputer, total=234.6min
[Pipeline] ........ (step 4 of 5) Processing recombiner, total= 1.6min
[Pipeline] ... (step 5 of 5) Processing standardisation, total= 8.3min


188

In [37]:
# Bundle the fitted MAR pipeline with the measurement column list and some
# provenance metadata, and serialise it (`pickle` is dill in this notebook).
toBePersisted = {
    'model': CleanPipeMAR,
    'meas_cols': meas_columns,
    'metadata': {
        'name': 'Cleaning pipeline for celldyn data including MAR imputation',
        'author': 'Bram van Es, Huibert-Jan Joosse, Chontira Chumsaeng',
        'date': '2023-03-23',
        'source_code_version': '1.0.0',
    }
}
# Context manager: the file is flushed and closed even if dumping fails.
with open('CellDynCleanPipeMAR.pickle', 'wb') as pickle_file:
    pickle.dump(toBePersisted, pickle_file)

In [38]:
# Bundle the fitted FULL pipeline with the measurement column list and some
# provenance metadata, and serialise it (`pickle` is dill in this notebook).
toBePersisted = {
    'model': CleanPipeFULL,
    'meas_cols': meas_columns,
    'metadata': {
        'name': 'Cleaning pipeline for celldyn data including FULL imputation',
        'author': 'Bram van Es, Huibert-Jan Joosse, Chontira Chumsaeng',
        'date': '2023-03-23',
        'source_code_version': '1.0.0',
    }
}
# Context manager: the file is flushed and closed even if dumping fails.
with open('CellDynCleanPipeFULL.pickle', 'wb') as pickle_file:
    pickle.dump(toBePersisted, pickle_file)

In [39]:
# Drop the name of the last persisted bundle and run a garbage-collection pass.
del toBePersisted
gc.collect()

0

In [11]:
# Small pipeline fitted on a 5000-row sample, for quick end-to-end checks.
#
# scikit-learn's Pipeline does not clone its steps: fitting the pipeline fits
# the very objects that were passed in.  qc, CellDynTransformer, imputer_clf
# and recombiner are also used by the other pipelines in this notebook, so
# this pipeline gets its own deep copies.  Without them, fitting here would
# silently overwrite the fitted state of pipelines trained on the same step
# objects.
CleanPipeTEST = Pipeline([('qc', copy.deepcopy(qc)),
                          ('scaling', copy.deepcopy(CellDynTransformer)),
                          ('imputer', copy.deepcopy(imputer_clf)),
                          ('recombiner', copy.deepcopy(recombiner)),
                          ('standardisation', PowerTransformer())
                         ], verbose=True)

# NOTE(review): the sample is drawn without random_state, so every run fits on
# different rows -- consider seeding it if the TEST bundle must be reproducible.
CleanPipeTEST.fit(celldyn.sample(5000)[meas_columns]);

toBePersisted = {
    'model': CleanPipeTEST,
    'meas_cols': meas_columns,
    'metadata': {
        'name': 'Cleaning pipeline for celldyn data including imputation',
        'author': 'Bram van Es, Huibert-Jan Joosse, Chontira Chumsaeng',
        'date': '2022-11-22',
        'source_code_version': '1.0.0',
    }
}
# Context manager: the file is flushed and closed even if dumping fails.
with open('CellDynCleanPipeTEST.pickle', 'wb') as pickle_file:
    pickle.dump(toBePersisted, pickle_file)

[Pipeline] ................ (step 1 of 5) Processing qc, total=   1.3s
[Pipeline] ........... (step 2 of 5) Processing scaling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing imputer, total= 1.9min
[Pipeline] ........ (step 4 of 5) Processing recombiner, total=   0.3s
[Pipeline] ... (step 5 of 5) Processing standardisation, total=   0.8s


In [20]:
# Run the fitted TEST pipeline on a fresh random sample and re-attach the
# metadata columns to the transformed measurements.
sample_df = celldyn.sample(5000)
test = CleanPipeTEST.transform(sample_df[meas_columns])
test_df = pd.concat(
    [
        sample_df[meta_cols],
        pd.DataFrame(data=test,
                     index=sample_df.index,
                     columns=CleanPipeTEST.named_steps['recombiner'].out_columns),
    ],
    axis=1,
)

In [11]:
# Round-trip check: reload the persisted TEST bundle and run it on a few rows.
# SECURITY: unpickling executes arbitrary code -- only load pickle files that
# were produced by a trusted source (here: this notebook).
with open('CellDynCleanPipeTEST.pickle', 'rb') as pickle_file:
    cleanpipe_test = pickle.load(pickle_file)
# A bare expression in the middle of a cell is not displayed, so print it.
print(cleanpipe_test['metadata'])
cleanpipe_test['model'].transform(celldyn.sample(50)[meas_columns])

(50, 99)