In [70]:
import pandas as pd
import numpy as np
import math
from functools import partial, reduce
import os
import re

In [71]:
# Keyed by the analyte's column suffix (the part after 'quansys_'); each value is
# a (short analyte name, unit string) pair.
analyte_name_dict = {
    'HRP2_pg_ml': ('HRP2', 'pg/ml'),
    'LDH_Pan_pg_ml': ('LDH_Pan', 'pg/ml'),
    'LDH_Pv_pg_ml': ('LDH_Pv', 'pg/ml'),
    'LDH_Pf_pg_ml': ('LDH_Pf', 'pg/ml'),
    'CRP_ng_ml': ('CRP', 'ng/ml'),
}

In [85]:
def set_llq_ulq(df, data_col):
    """Assign a bin code to one measurement based on its '<' / '>' marker.

    Despite the parameter name, ``df`` is a single record: anything that can
    be indexed with ``data_col`` (e.g. the row Series that
    ``DataFrame.apply(..., axis=1)`` passes in, or a plain dict).

    Args:
        df: Row-like object holding the measurement.
        data_col: Key/column name of the measurement inside ``df``.

    Returns:
        0 if the value is a string containing '<' (checked first),
        2 if it is a string containing '>',
        ``np.nan`` if the value is missing (NaN, None, pd.NA),
        1 for everything else (plain numbers and unmarked strings).

    NOTE(review): going by the function name, 0/2 presumably mean "below the
    lower / above the upper limit of quantification" -- confirm with the
    assay documentation.
    """
    value = df[data_col]
    if isinstance(value, str):
        if '<' in value:
            return 0
        if '>' in value:
            return 2
        return 1
    # pd.isna accepts None, pd.NA and other non-float scalars; np.isnan would
    # raise TypeError on those instead of reporting them as missing.
    if pd.isna(value):
        return np.nan
    return 1

In [86]:
def clean_strings(val):
    """Strip '<' / '>' markers from a string and convert it to float if possible.

    Non-string input is handed back untouched. For strings, every '<' and '>'
    character is removed; the remainder is returned as a float when it parses
    as one, otherwise as the stripped string.
    """
    if not isinstance(val, str):
        return val

    stripped = val
    for marker in ('<', '>'):
        stripped = stripped.replace(marker, '')

    try:
        number = float(stripped)
    except ValueError:
        # Not numeric even without the markers: keep the stripped text.
        return stripped
    return number

In [87]:
# Columns kept in the written files, in output order: sample metadata first,
# then a (concentration, result, bin) triplet per analyte.
order = [
    # sample metadata
    'study_id', 'sample_id', 'participant_id', 'sample_type', 'country',
    'lat', 'long', 'age_yrs', 'fever', 'timepoint_days',
    'initial_specimen_category', 'confirmed_via',
    # HRP2
    'quansys_HRP2_pg_ml', 'quansys_HRP2_result', 'quansys_HRP2_bin',
    # LDH_Pan
    'quansys_LDH_Pan_pg_ml', 'quansys_LDH_Pan_result', 'quansys_LDH_Pan_bin',
    # LDH_Pv
    'quansys_LDH_Pv_pg_ml', 'quansys_LDH_Pv_result', 'quansys_LDH_Pv_bin',
    # LDH_Pf
    'quansys_LDH_Pf_pg_ml', 'quansys_LDH_Pf_result', 'quansys_LDH_Pf_bin',
    # CRP
    'quansys_CRP_ng_ml', 'quansys_CRP_result', 'quansys_CRP_bin',
]

In [91]:
# Batch step: for every CSV in input_path, add a *_bin column per analyte, strip
# the '<' / '>' markers from the concentration columns, keep the columns listed
# in `order`, and write the result under <input_path>/binned/ with the same name.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
input_path = 'C:/Users/lzoeckler/Desktop/qfu'
output_path = os.path.join(input_path, 'binned')
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(output_path, exist_ok=True)
# Match on the extension (a bare substring test would also pick up names that
# merely contain "csv"); sort so the processing order -- and therefore the
# frames left in `df` / `modified_df` afterwards -- is deterministic.
files = sorted(file for file in os.listdir(input_path) if file.lower().endswith('.csv'))
for fname in files:
    print(fname)
    df = pd.read_csv(os.path.join(input_path, fname))
    modified_df = df.copy(deep=True)
    for analyte, (name, _unit) in analyte_name_dict.items():
        data_col = 'quansys_{}'.format(analyte)
        partial_qs = partial(set_llq_ulq, data_col=data_col)
        # Bin first: set_llq_ulq needs the raw '<' / '>' markers that
        # clean_strings removes on the next line.
        modified_df['quansys_{}_bin'.format(name)] = modified_df.apply(partial_qs, axis=1)
        modified_df[data_col] = modified_df[data_col].apply(clean_strings)
    modified_df = modified_df[order]
    modified_df.to_csv(os.path.join(output_path, fname), index=False)
# Preview of the last file processed.
modified_df.head()

QV4_4plex_NIH_clinical.csv
QV4_4plex_WB_NIH_Pf_SPZ.csv
QV4_4plex_WB_NIH_Pf_TBV.csv
QV4_4plex_WB_PATH_validation.csv
QV4_4plex_WB_QIMR_Pf_CHMIS.csv
QV4_4plex_WB_QIMR_Pm_CHMIS.csv
QV4_4plex_WB_QIMR_Pv_CHMIS.csv
QV4_4plex_WB_SMRU.csv
QV4_4plex_WB_UCSF_HRP2_Persistence.csv
QV4_4plex_WB_UCSF_uRDT.csv
QV4_5plex_WB_DLS_Pf_Pv.csv
QV4_5plex_WB_DLS_Pm.csv
QV4_5plex_WB_FIND_Pf_Pv.csv
QV4_5plex_WB_PATH_validation.csv
QV4_5plex_WB_QIMR_SCID_Pf_CHMIS.csv
QV4_5plex_WB_UPCH_Clinical_progeny.csv


Unnamed: 0,study_id,sample_id,participant_id,sample_type,country,lat,long,age_yrs,fever,timepoint_days,...,quansys_LDH_Pan_bin,quansys_LDH_Pv_pg_ml,quansys_LDH_Pv_result,quansys_LDH_Pv_bin,quansys_LDH_Pf_pg_ml,quansys_LDH_Pf_result,quansys_LDH_Pf_bin,quansys_CRP_ng_ml,quansys_CRP_result,quansys_CRP_bin
0,17,PE01F04 200,,WB,Peru,,,,,,...,1,22.67,0,1,4037.26,1,1,429.3,,1
1,17,PE01F04 2000,,WB,Peru,,,,,,...,1,57.96,0,1,62046.48,1,1,4700.01,,1
2,17,PE01F06 200,,WB,Peru,,,,,,...,1,18.82,0,1,11576.34,1,1,716.28,,1
3,17,PE01F06 2000,,WB,Peru,,,,,,...,1,8.27,0,1,43665.81,1,1,4821.29,,1
4,17,PEO1F07 200,,WB,Peru,,,,,,...,1,30.14,0,1,55887.4,1,1,979.54,,1


In [89]:
# First rows of the HRP2 concentration column as read from file (in `df`, so
# any '<' / '>' markers are still present).
df.loc[:, ['quansys_HRP2_pg_ml']].head(5)

Unnamed: 0,quansys_HRP2_pg_ml
0,17215.98
1,> 492800.00
2,3.64
3,5.17
4,3.37


In [90]:
# Check the dtypes of the cleaned HRP2 concentration column and its bin column.
modified_df.loc[:, ['quansys_HRP2_pg_ml', 'quansys_HRP2_bin']].dtypes

quansys_HRP2_pg_ml    float64
quansys_HRP2_bin        int64
dtype: object