In [4]:
import pandas as pd
import numpy as np
import math
from functools import partial, reduce
import os
import re

In [5]:
analyte_name_dict = {'HRP2_pg_ml': ('HRP2', 'pg/ml'), 'LDH_Pan_pg_ml': ('LDH_Pan', 'pg/ml'),
                     'LDH_Pv_pg_ml': ('LDH_Pv', 'pg/ml'), 'LDH_Pf_pg_ml': ('LDH_Pf', 'pg/ml'),
                     'CRP_ng_ml': ('CRP', 'ng/ml')}

In [6]:
def set_llq_ulq(df, data_col):
    if isinstance(df[data_col], str):
        if '<' in df[data_col]:
            return 0
        elif '>' in df[data_col]:
            return 2
    elif np.isnan(df[data_col]):
        return np.nan
    return 1

In [7]:
def clean_strings(val):
    if isinstance(val, str):
        clean = val.replace('<', '')
        clean = clean.replace('>', '')
        try:
            return float(clean)
        except ValueError:
            return clean
    return val

In [8]:
def fix_days(val):
    return int(val.strip('D'))

In [9]:
order = ['study_id', 'sample_id', 'participant_id', 'sample_type', 'country', 'lat', 'long', 'age_yrs', 'timepoint_days',
         'febrile', 'survey', 'infection_category', 'PCR_pos', 'microscopy_pos', 'p_spp', 'pf', 'pv', 'pm', 'pk', 'po',
         'coinfection', 'HRP2_deletion', 'quansys_HRP2_pg_ml', 'quansys_LDH_Pan_pg_ml', 'quansys_LDH_Pv_pg_ml',
         'quansys_LDH_Pf_pg_ml', 'quansys_CRP_ng_ml', 'quansys_HRP2_result', 'quansys_LDH_Pan_result',
         'quansys_LDH_Pv_result', 'quansys_LDH_Pf_result', 'quansys_CRP_result', 'quansys_type']

In [10]:
names = ['4plex_WB_NIH_Pf_SPZ', '4plex_WB_NIH_Pf_TBV', '4plex_WB_QIMR_Pf_CHMIS', '4plex_WB_QIMR_Pm_CHMIS',
         '4plex_WB_QIMR_Pv_CHMIS', '4plex_WB_SMRU', '4plex_WB_UCSF_uRDT', '5plex_WB_DLS_Pf_Pv', '5plex_WB_DLS_Pm',
         '5plex_WB_FIND_Pf_Pv', '5plex_WB_FIND_Pm_Po', '5plex_WB_UPCH_Clinical_progeny']

In [12]:
input_path = 'C:/Users/lzoeckler/Desktop/gold_standard_classes/'
for name in names:
    df_path = '{}/{}/formatted_{}.csv'.format(input_path, name, name)
    df = pd.read_csv(df_path)
    try:
        df.rename(columns={'pv ': 'pv'}, inplace=True)
    except KeyError:
        pass
    try:
        df = df[order]
        df.to_csv(df_path, index=False)
    except KeyError:
        print(name)
        raise

In [13]:
df.columns

Index(['study_id', 'sample_id', 'participant_id', 'sample_type', 'country',
       'lat', 'long', 'age_yrs', 'timepoint_days', 'febrile', 'survey',
       'infection_category', 'PCR_pos', 'microscopy_pos', 'p_spp', 'pf', 'pv',
       'pm', 'pk', 'po', 'coinfection', 'HRP2_deletion', 'quansys_HRP2_pg_ml',
       'quansys_LDH_Pan_pg_ml', 'quansys_LDH_Pv_pg_ml', 'quansys_LDH_Pf_pg_ml',
       'quansys_CRP_ng_ml', 'quansys_HRP2_result', 'quansys_LDH_Pan_result',
       'quansys_LDH_Pv_result', 'quansys_LDH_Pf_result', 'quansys_CRP_result',
       'quansys_type'],
      dtype='object')

In [15]:
input_path = 'C:/Users/lzoeckler/Desktop/gold_standard_classes/'
test_path = 'C:/Users/lzoeckler/Desktop'
df = pd.read_csv('{}/4plex_WB_QIMR_Pv_CHMIS/formatted_4plex_WB_QIMR_Pv_CHMIS.csv'.format(input_path))
df.head()

Unnamed: 0,study_id,sample_id,participant_id,sample_type,country,lat,long,age_yrs,timepoint_days,febrile,...,quansys_LDH_Pan_pg_ml,quansys_LDH_Pv_pg_ml,quansys_LDH_Pf_pg_ml,quansys_CRP_ng_ml,quansys_HRP2_result,quansys_LDH_Pan_result,quansys_LDH_Pv_result,quansys_LDH_Pf_result,quansys_CRP_result,quansys_type
0,8,1 Day 0 AM R009 S013 (),"R009, S013",WB,,,,,0.0,,...,< 14.41,5.35,,44.36213,0,0,0,,,4plex
1,8,7 Day 4 AM R009 S013 (),"R009, S013",WB,,,,,4.0,,...,92.42,26.51,,87.87346,0,1,0,,,4plex
2,8,13 Day 5 AM R009 S013 (),"R009, S013",WB,,,,,5.0,,...,74.01,19.96,,58.78319,0,1,0,,,4plex
3,8,19 Day 5 PM R009 S013 (),"R009, S013",WB,,,,,5.5,,...,255.13,64.01,,80.01065,0,1,0,,,4plex
4,8,25 Day 6 AM R009 S013 (),"R009, S013",WB,,,,,6.0,,...,259.8,60.36,,654.96711,0,1,0,,,4plex


In [142]:
df['timepoint_days'].head()

0    0.0
1    4.0
2    5.0
3    5.5
4    6.0
Name: timepoint_days, dtype: float64

In [129]:
rebuilt = []
participants = df['participant_id'].unique()
for participant in participants:
    p_df = df.loc[df['participant_id'] == participant]
    p_df['timepoint_days'] = p_df['timepoint_days'].apply(lambda x: x.strip('T=Day '))
    p_df['t_value'] = p_df['timepoint_days'].apply(lambda x: x.split('T')[-1] if len(x.split('T')) != 1 else 'FILLER')
    t_df = p_df.loc[p_df['t_value'] != 'FILLER']
    t_df.loc[t_df['t_value'] == 'FILLER', 't_value'] = '0'
    p_df = p_df.loc[p_df['t_value'] == 'FILLER']
    p_df['timepoint_days'] = p_df['timepoint_days'].apply(lambda x: x.strip('AM'))
    p_df['timepoint_days'] = p_df['timepoint_days'].apply(lambda x: x.strip('A'))
    p_df['timepoint_days'] = p_df['timepoint_days'].apply(lambda x: x.replace('P', '.5'))
    p_df['timepoint_days'] = p_df['timepoint_days'].apply(lambda x: float(x))
    rebuilt.append(p_df)
    t_df['timepoint_days'] = t_df['timepoint_days'].apply(lambda x: x.split('T')[0])
    t_df['timepoint_days'] = t_df['timepoint_days'].apply(lambda x: x.split('A')[0])
    t_df['timepoint_days'] = t_df['timepoint_days'].apply(lambda x: x.split('P')[0])
    t_df['timepoint_days'] = t_df['timepoint_days'].apply(lambda x: int(x))
    t_df['t_value'] = t_df['t_value'].apply(lambda x: int(x.strip('T')))
    days = t_df['timepoint_days'].unique()
    for day in days:
        p_d_df = t_df.loc[t_df['timepoint_days'] == day]
        if len(p_d_df) > 1:
            min_t_val = min(p_d_df['t_value'])
            p_d_df['t_value'] = p_d_df['t_value'].subtract(min_t_val)
            p_d_df['t_value'] = p_d_df['t_value'].divide(24)
            p_d_df['timepoint_days'] = p_d_df['timepoint_days'].add(p_d_df['t_value'])
        rebuilt.append(p_d_df)
    min_day = min(p_df['timepoint_days'])
    p_df['timepoint_days'] = p_df['timepoint_days'].subtract(min_day)
    rebuilt.append(p_df)
fixed_df = pd.concat(rebuilt)
fixed_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value

Unnamed: 0,study_id,sample_id,participant_id,sample_type,country,lat,long,age_yrs,febrile,survey,...,quansys_LDH_Pv_pg_ml,quansys_LDH_Pf_pg_ml,quansys_CRP_ng_ml,quansys_HRP2_result,quansys_LDH_Pan_result,quansys_LDH_Pv_result,quansys_LDH_Pf_result,quansys_CRP_result,quansys_type,t_value
0,10,1 T=Day 0AM 11/14/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,228.51,0,0,0,,,4plex,FILLER
1,10,7 T=Day 4AM 11/18/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,178.4,1,0,0,,,4plex,FILLER
2,10,10 T=Day 5AM 11/19/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,206.23,1,0,0,,,4plex,FILLER
3,10,16 T=Day 6AM 11/20/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,232.11,1,0,0,,,4plex,FILLER
4,10,22 T=Day 7AM 11/21/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,233.61,1,1,0,,,4plex,FILLER


In [130]:
fixed_df.head()

Unnamed: 0,study_id,sample_id,participant_id,sample_type,country,lat,long,age_yrs,febrile,survey,...,quansys_LDH_Pv_pg_ml,quansys_LDH_Pf_pg_ml,quansys_CRP_ng_ml,quansys_HRP2_result,quansys_LDH_Pan_result,quansys_LDH_Pv_result,quansys_LDH_Pf_result,quansys_CRP_result,quansys_type,t_value
0,10,1 T=Day 0AM 11/14/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,228.51,0,0,0,,,4plex,FILLER
1,10,7 T=Day 4AM 11/18/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,178.4,1,0,0,,,4plex,FILLER
2,10,10 T=Day 5AM 11/19/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,206.23,1,0,0,,,4plex,FILLER
3,10,16 T=Day 6AM 11/20/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,232.11,1,0,0,,,4plex,FILLER
4,10,22 T=Day 7AM 11/21/17 S006 R103 (),"R103, S006",WB,,,,,,longitudinal challenge,...,< 2.33,,233.61,1,1,0,,,4plex,FILLER


In [134]:
# fixed_df.to_csv('{}/4plex_WB_QIMR_Pf_CHMIS/CLEANED_4plex_WB_QIMR_Pf_CHMIS.csv'.format(input_path), index=True)

In [132]:
fixed_df.to_csv('{}/test.csv'.format(test_path))

In [17]:
fix = pd.read_csv('{}/4plex_WB_QIMR_Pf_CHMIS/formatted_4plex_WB_QIMR_Pf_CHMIS.csv'.format(input_path))
fix.head()

Unnamed: 0,study_id,sample_id,participant_id,sample_type,country,lat,long,age_yrs,timepoint_days,febrile,...,quansys_LDH_Pan_pg_ml,quansys_LDH_Pv_pg_ml,quansys_LDH_Pf_pg_ml,quansys_CRP_ng_ml,quansys_HRP2_result,quansys_LDH_Pan_result,quansys_LDH_Pv_result,quansys_LDH_Pf_result,quansys_CRP_result,quansys_type
0,10,1 T=Day 0AM 11/14/17 S006 R103 (),"R103, S006",WB,,,,,0.0,,...,< 14.41,< 2.33,,228.51,0,0,0,,,4plex
1,10,7 T=Day 4AM 11/18/17 S006 R103 (),"R103, S006",WB,,,,,4.0,,...,< 14.41,< 2.33,,178.4,1,0,0,,,4plex
2,10,10 T=Day 5AM 11/19/17 S006 R103 (),"R103, S006",WB,,,,,5.0,,...,< 14.41,< 2.33,,206.23,1,0,0,,,4plex
3,10,16 T=Day 6AM 11/20/17 S006 R103 (),"R103, S006",WB,,,,,6.0,,...,31.49,< 2.33,,232.11,1,0,0,,,4plex
4,10,22 T=Day 7AM 11/21/17 S006 R103 (),"R103, S006",WB,,,,,7.0,,...,94.51,< 2.33,,233.61,1,1,0,,,4plex


In [18]:
fix.drop_duplicates(inplace=True)
fix.head()

Unnamed: 0,study_id,sample_id,participant_id,sample_type,country,lat,long,age_yrs,timepoint_days,febrile,...,quansys_LDH_Pan_pg_ml,quansys_LDH_Pv_pg_ml,quansys_LDH_Pf_pg_ml,quansys_CRP_ng_ml,quansys_HRP2_result,quansys_LDH_Pan_result,quansys_LDH_Pv_result,quansys_LDH_Pf_result,quansys_CRP_result,quansys_type
0,10,1 T=Day 0AM 11/14/17 S006 R103 (),"R103, S006",WB,,,,,0.0,,...,< 14.41,< 2.33,,228.51,0,0,0,,,4plex
1,10,7 T=Day 4AM 11/18/17 S006 R103 (),"R103, S006",WB,,,,,4.0,,...,< 14.41,< 2.33,,178.4,1,0,0,,,4plex
2,10,10 T=Day 5AM 11/19/17 S006 R103 (),"R103, S006",WB,,,,,5.0,,...,< 14.41,< 2.33,,206.23,1,0,0,,,4plex
3,10,16 T=Day 6AM 11/20/17 S006 R103 (),"R103, S006",WB,,,,,6.0,,...,31.49,< 2.33,,232.11,1,0,0,,,4plex
4,10,22 T=Day 7AM 11/21/17 S006 R103 (),"R103, S006",WB,,,,,7.0,,...,94.51,< 2.33,,233.61,1,1,0,,,4plex


In [19]:
fix.to_csv('{}/4plex_WB_QIMR_Pf_CHMIS/formatted_4plex_WB_QIMR_Pf_CHMIS.csv'.format(input_path))