In [1]:
import pandas as pd
import numpy as np
import math
from functools import partial, reduce
import os
import re

In [2]:
root_path = 'C:/Users/lzoeckler/Desktop/mali_meta'

In [3]:
def parse_id(df):
    all_ids = df['participant_id'].split('-')
    return all_ids[2]

In [4]:
semi_formatted = pd.read_csv('{}/formatted_4plex_NIH_clinical.csv'.format(root_path))
semi_formatted['Study ID number'] = semi_formatted.apply(parse_id, axis=1)
semi_formatted['timepoint_days'] = semi_formatted['timepoint_days'].apply(int)
semi_formatted['Study ID number'] = semi_formatted['Study ID number'].apply(int)
semi_formatted = semi_formatted[['sample_id', 'participant_id', 'timepoint_days', 'RDT_pos', 'HRP2_pg_ml',
                                 'LDH_Pan_pg_ml', 'LDH_Pv_pg_ml', 'CRP_ng_ml', 'HRP2_result',
                                 'LDH_Pan_result', 'LDH_Pv_result', 'Study ID number']]
semi_formatted.head()

Unnamed: 0,sample_id,participant_id,timepoint_days,RDT_pos,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,CRP_ng_ml,HRP2_result,LDH_Pan_result,LDH_Pv_result,Study ID number
0,1 MV-15-301-F D547 (),MV-15-301-F,547,0.0,< 1.07,24.56,6.4182,3911.39702,negative,negative,negative,301
1,24 MV-15-301-F D554 (),MV-15-301-F,554,0.0,1.28,27.49,8.7457,1383.32281,negative,negative,negative,301
2,14 301-F D568 (),MV-15-301-F,568,1.0,1096.93,188.9,5.5062,195.1877,positive,positive,negative,301
3,32 301-F D582 (),MV-15-301-F,582,1.0,5024.8,< 14.41,< 2.33,< 26.7293,positive,negative,negative,301
4,2 MV-15-302-k D547 (),MV-15-302-k,547,0.0,< 1.07,19.58,6.5968,1572.37057,negative,negative,negative,302


In [5]:
fixed_days = []
for sid in semi_formatted['Study ID number'].unique():
    sid_df = semi_formatted.loc[semi_formatted['Study ID number'] == sid]
    all_days = sid_df['timepoint_days'].unique().tolist()
    fixed_list = [day for day in all_days if ~np.isnan(day)]
    min_day = min(fixed_list)
    sid_df['timepoint_days'] = sid_df['timepoint_days'].subtract(min_day)
    fixed_days.append(sid_df)
fixed_df = pd.concat(fixed_days)
fixed_df.sort_values(['Study ID number', 'timepoint_days'], inplace=True)
fixed_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,sample_id,participant_id,timepoint_days,RDT_pos,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,CRP_ng_ml,HRP2_result,LDH_Pan_result,LDH_Pv_result,Study ID number
0,1 MV-15-301-F D547 (),MV-15-301-F,0,0.0,< 1.07,24.56,6.4182,3911.39702,negative,negative,negative,301
1,24 MV-15-301-F D554 (),MV-15-301-F,7,0.0,1.28,27.49,8.7457,1383.32281,negative,negative,negative,301
2,14 301-F D568 (),MV-15-301-F,21,1.0,1096.93,188.9,5.5062,195.1877,positive,positive,negative,301
3,32 301-F D582 (),MV-15-301-F,35,1.0,5024.8,< 14.41,< 2.33,< 26.7293,positive,negative,negative,301
4,2 MV-15-302-k D547 (),MV-15-302-k,0,0.0,< 1.07,19.58,6.5968,1572.37057,negative,negative,negative,302


In [6]:
dem_par = pd.read_csv('{}/demographic_parasitemia_data.csv'.format(root_path))
dem_par = dem_par[['Study ID number', 'P. falciparum', 'P. malariae', 'P. ovale', 'Visit Date',
                   'Age at visit', 'studyday']]
dem_par.rename(columns={'Visit Date': 'date', 'studyday': 'timepoint_days',
                       'Age at visit': 'age_yrs'}, inplace=True)
dem_par.head()

Unnamed: 0,Study ID number,P. falciparum,P. malariae,P. ovale,date,age_yrs,timepoint_days
0,301,,,,2016-08-16,32.19,510.0
1,301,,,,2016-09-05,32.24,
2,301,,,,2016-09-15,32.27,540.0
3,301,,,,2016-09-20,32.28,
4,301,,,,2016-09-22,32.29,547.0


In [7]:
med_info = pd.read_csv('{}/malaria_conmeds.csv'.format(root_path))
med_info = med_info.loc[med_info['Drug1 Name'] != 'COMPLEX B']
med_info = med_info.loc[med_info['Drug1 Indication'].isin(['ACCES PALUSTRE', 'ACCES PALUSTREQ'])]
med_info = med_info[['Study ID number', 'Drug1 Name', 'Drug1 Start Date']]
med_info.rename(columns={'Drug1 Name': 'drug', 'Drug1 Start Date': 'date'}, inplace=True)
med_info.head()

Unnamed: 0,Study ID number,drug,date
0,329,ARTEMETHER LUMEFANTRINE,2016-07-17
1,318,COARTEM,2016-07-21
2,487,ARTEMETHER LUMEFANTRINE,2016-08-02
3,457,ARTEMETHER LUMEFANTRINE,2016-08-06
4,412,ARTEMETHER LUMEFANTRINE,2016-08-08


In [8]:
check_combo = med_info.merge(dem_par, how='outer', on=['Study ID number', 'date'], suffixes=(False, False))
check_combo.sort_values('Study ID number', inplace=True)
check_combo['timepoint_days'] = check_combo['timepoint_days'].apply(lambda x: x if np.isnan(x) else int(x))
check_combo['Study ID number'] = check_combo['Study ID number'].apply(lambda x: x if np.isnan(x) else int(x))
check_combo.head()

Unnamed: 0,Study ID number,drug,date,P. falciparum,P. malariae,P. ovale,age_yrs,timepoint_days
147,301,,2016-11-03,,,,32.41,589.0
90,301,COARTEM,2016-10-30,27.0,,,32.39,585.0
148,301,,2016-11-24,,,,32.46,610.0
132,301,,2016-08-16,,,,32.19,510.0
133,301,,2016-09-05,,,,32.24,


In [9]:
all_combo = check_combo.merge(semi_formatted, how='outer',
                              on=['Study ID number', 'timepoint_days'], suffixes=(False, False))
all_combo = all_combo[['Study ID number', 'sample_id', 'participant_id', 'date', 'timepoint_days', 'drug',
                       'RDT_pos', 'P. falciparum', 'P. malariae', 'P. ovale', 'age_yrs',
                       'HRP2_pg_ml', 'LDH_Pan_pg_ml', 'LDH_Pv_pg_ml', 'CRP_ng_ml',
                       'HRP2_result', 'LDH_Pan_result', 'LDH_Pv_result']]
all_combo.sort_values(['Study ID number'], inplace=True)
all_combo['date'] = pd.to_datetime(all_combo['date'])
all_combo.head()

Unnamed: 0,Study ID number,sample_id,participant_id,date,timepoint_days,drug,RDT_pos,P. falciparum,P. malariae,P. ovale,age_yrs,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,CRP_ng_ml,HRP2_result,LDH_Pan_result,LDH_Pv_result
0,301,,,2016-11-03,589.0,,,,,,32.41,,,,,,,
17,301,14 301-F D568 (),MV-15-301-F,2016-10-13,568.0,,1.0,2.0,,,32.35,1096.93,188.9,5.5062,195.1877,positive,positive,negative
16,301,,,2016-10-07,561.0,,,,,,32.33,,,,,,,
15,301,,,2016-10-10,564.0,,,1.0,,,32.34,,,,,,,
14,301,24 MV-15-301-F D554 (),MV-15-301-F,2016-09-30,554.0,,0.0,,,,32.31,1.28,27.49,8.7457,1383.32281,negative,negative,negative


In [10]:
check = all_combo.loc[all_combo['Study ID number'] == 301]
check.sort_values('timepoint_days', inplace=True)
check.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Study ID number,sample_id,participant_id,date,timepoint_days,drug,RDT_pos,P. falciparum,P. malariae,P. ovale,age_yrs,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,CRP_ng_ml,HRP2_result,LDH_Pan_result,LDH_Pv_result
3,301,,,2016-08-16,510.0,,,,,,32.19,,,,,,,
6,301,,,2016-09-15,540.0,,,,,,32.27,,,,,,,
7,301,1 MV-15-301-F D547 (),MV-15-301-F,2016-09-22,547.0,,0.0,,,,32.29,< 1.07,24.56,6.4182,3911.39702,negative,negative,negative
8,301,,,2016-09-25,550.0,,,,,,32.3,,,,,,,
14,301,24 MV-15-301-F D554 (),MV-15-301-F,2016-09-30,554.0,,0.0,,,,32.31,1.28,27.49,8.7457,1383.32281,negative,negative,negative


In [11]:
rebuilt_df = []
for sid in all_combo['Study ID number'].unique():
# for sid in [318]:
    sid_df = all_combo.loc[all_combo['Study ID number'] == sid]
    pids = sid_df['participant_id'].unique().tolist()
    pid = [pid for pid in pids if pid is not np.nan]
    if len(pid) > 0:
        sid_df['participant_id'] = pid[-1]
    all_days = sid_df['timepoint_days'].unique().tolist()
    fixed_days = [day for day in all_days if ~np.isnan(day)]
    min_day = min(fixed_days)
    sid_df['timepoint_days'] = sid_df['timepoint_days'].subtract(min_day)
    sid_df['zero_date'] = sid_df.loc[sid_df['timepoint_days'] == 0, 'date'].item()
    sid_df['zero_date'] = pd.to_datetime(sid_df['zero_date'])
    sid_df['date_dif'] = sid_df['date'] - sid_df['zero_date']
    sid_df['date_dif'] = sid_df['date_dif'] / np.timedelta64(1, 'D')
    sid_df.drop(['date', 'zero_date'], axis=1, inplace=True)
    rebuilt_df.append(sid_df)
rebuilt_df = pd.concat(rebuilt_df)
rebuilt_df.sort_values(['Study ID number', 'timepoint_days'], inplace=True)
rebuilt_df.rename(columns={'Study ID number': 'id_number'}, inplace=True)
rebuilt_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

Unnamed: 0,id_number,sample_id,participant_id,timepoint_days,drug,RDT_pos,P. falciparum,P. malariae,P. ovale,age_yrs,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,CRP_ng_ml,HRP2_result,LDH_Pan_result,LDH_Pv_result,date_dif
3,301,,MV-15-301-F,0.0,,,,,,32.19,,,,,,,,0.0
6,301,,MV-15-301-F,30.0,,,,,,32.27,,,,,,,,30.0
7,301,1 MV-15-301-F D547 (),MV-15-301-F,37.0,,0.0,,,,32.29,< 1.07,24.56,6.4182,3911.39702,negative,negative,negative,37.0
8,301,,MV-15-301-F,40.0,,,,,,32.3,,,,,,,,40.0
14,301,24 MV-15-301-F D554 (),MV-15-301-F,44.0,,0.0,,,,32.31,1.28,27.49,8.7457,1383.32281,negative,negative,negative,45.0


In [13]:
rebuilt_df.to_csv('{}/for_viz.csv'.format(root_path), index=False)