In [75]:
import pandas as pd
import numpy as np
import math
from functools import partial, reduce
import os
import re

In [76]:
dfs = []
input_path = 'C:/Users/lzoeckler/Desktop/menzies_raw'
for fname in os.listdir(input_path):
    plex_data = pd.read_csv('{}/{}'.format(input_path, fname),
                            skiprows=13, names=['patient_id', 'type', 'well', 'error',
                                               'HRP2_pg_ml', 'LDH_Pan_pg_ml',
                                               'LDH_Pv_pg_ml', 'LDH_Pf_pg_ml',
                                               'CRP_ng_ml'])
    plex_data = plex_data.applymap(lambda x: x.lower() if isinstance(x, str) else x)
    plex_data['patient_id'] = plex_data['patient_id'].fillna(method='ffill')
    plex_data = plex_data[~plex_data['patient_id'].isnull()]
#     test = plex_data.loc[plex_data['patient_id'].str.contains('pa-001')]
#     if len(test) > 1: 
#         print(fname)
    dfs.append(plex_data)
combined = pd.concat(dfs)
combined = combined.loc[~combined['patient_id'].str.contains('ctrl')]
combined = combined.loc[~combined['type'].isnull()]
combined = combined.loc[~combined['type'].str.contains('replicate')]
combined.head()

Unnamed: 0,patient_id,type,well,error,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,LDH_Pf_pg_ml,CRP_ng_ml
15,qdm 1434 2019/03/07 edta wb (neat),reduced concentration,c5,,< 0.68,41.02,28.01,< 5.08,> 38000.00
16,qdm 1434 2019/03/07 edta wb (1:20),reduced concentration,d5,,< 13.60,< 327.20,< 93.80,< 101.60,87928.67
17,qdm 1479 2019/04/10 edta plas (neat),reduced concentration,c4,,5.14,962.33,526.7,15.76,> 38000.00
18,qdm 1479 2019/04/10 edta plas (1:20),reduced concentration,d4,,< 13.60,1056.63,350.9,< 101.60,> 760000.00
19,qem 31 2010/10/19 pfp (neat),reduced concentration,c2,,6.26,30554.78,19174.42,60.81,> 38000.00


In [77]:
def fix_concentrations(df):
    con = df['concentration'].partition(':')[2]
    con = con.partition(')')[0]
    if len(con) != 0:
        return con
    else:
        return '1'

In [86]:
samples_data = combined.copy(deep=True)
samples_data = samples_data.drop('type', axis=1)
samples_data['concentration'] = samples_data['patient_id'].apply(lambda x: x.split(' ')[-1])
samples_data['patient_id'] = samples_data['patient_id'].apply(lambda x: '_'.join(x.split(' ')[:3]).replace('/', '_'))
samples_data = samples_data.loc[(samples_data['concentration'].str.contains('neat|20'))]
samples_data = samples_data.loc[~samples_data['well'].isnull()]
samples_data['concentration'] = samples_data.apply(fix_concentrations, axis=1)
samples_data = samples_data.sort_values(['patient_id', 'concentration'])
samples_data.head()

Unnamed: 0,patient_id,well,error,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,LDH_Pf_pg_ml,CRP_ng_ml,concentration
39,kk_103_2014_04_09,e11,,6.52,66763.83,17747.82,81.37,> 38000.00,1
42,kk_103_2014_04_09,g8,,3.02,> 67000.00,19185.53,71.94,> 38000.00,1
40,kk_103_2014_04_09,f11,,23.05,82537.55,43492.8,227.52,> 760000.00,20
43,kk_103_2014_04_09,h8,,< 13.60,1563.69,952.99,< 101.60,46420.38,20
41,kk_107_2014_04_15,e12,,2.82,900.77,479.96,22.38,> 38000.00,1


In [87]:
samples_data['concentration'].unique()

array(['1', '20'], dtype=object)

In [88]:
sample_ids = samples_data['patient_id'].unique().tolist()
sample_set = set(sample_ids)

In [89]:
# threshhold values for various analytes
threshholds = {'HRP2_pg_ml': 2800, 'LDH_Pan_pg_ml': 67000,
               'LDH_Pv_pg_ml': 19200, 'LDH_Pf_pg_ml': 20800,
               'CRP_ng_ml': 38000}

In [90]:
# constant to apply to the threshhold for different dilutions
dil_constants = {'20': 1}

In [91]:
# positivity threshholds for various analytes
pos_threshholds = {'HRP2_pg_ml': 2.3, 'LDH_Pan_pg_ml': 47.8,
                   'LDH_Pv_pg_ml': 75.1, 'CRP_ng_ml': np.nan}

In [92]:
# dilution sets for various dilutions
dilution_sets = {'20': ('1', '20', 'fail')} 

In [93]:
duplicates = samples_data.loc[samples_data.duplicated(subset=['patient_id', 'concentration'], keep=False)]
deduped_dfs = []
for analyte in threshholds.keys():
    dup_analyte = duplicates[['patient_id', 'well', 'error', 'concentration', analyte]]
    pid_dfs = []
    for pid in duplicates['patient_id'].unique():
        dup_data = dup_analyte.loc[dup_analyte['patient_id'] == pid]
        con_dfs = []
        for concentration in dup_data['concentration'].unique():
            fill_df = pd.DataFrame(columns=['patient_id', 'well', 'error',
                                            'concentration', analyte])
            dup_con = dup_data.loc[dup_data['concentration'] == concentration]
            values = dup_con[analyte]
            wells = dup_con['well'].tolist()
            wells = ''.join(c for c in str(wells) if c not in ["[", "]", "'"])
            errors = dup_con['error'].tolist()
            non_nan_error = [e for e in errors if e is not np.nan]
            if not non_nan_error:
                errors = np.nan
            else:
                errors = non_nan_error
            try:
                values = [float(val) for val in values.tolist()]
                val = sum(values) / len(values)
            except ValueError:
                values = values.tolist()
                num_vals = [val for val in values if ('<' not in val) & ('>' not in val)]
                if len(num_vals) == 1:
                    val = num_vals[0]
                else:
                    val = values[0]
            fill_df = fill_df.append({'patient_id': pid, 'well': wells, 'error': errors,
                           'concentration': concentration, analyte: val}, ignore_index=True)
            con_dfs.append(fill_df)
        con_df = pd.concat(con_dfs)
        pid_dfs.append(con_df)
    pid_df = pd.concat(pid_dfs)
    deduped_dfs.append(pid_df)
deduped = reduce(lambda left, right: pd.merge(left, right, on=['patient_id', 'well', 'error', 'concentration']), deduped_dfs)
deduped.head()

Unnamed: 0,patient_id,well,error,concentration,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,LDH_Pf_pg_ml,CRP_ng_ml
0,kk_103_2014_04_09,"e11, g8",,1,4.77,66763.83,18466.7,76.655,> 38000.00
1,kk_103_2014_04_09,"f11, h8",,20,23.05,42050.6,22222.9,227.52,46420.38
2,kk_107_2014_04_15,"e12, g9",,1,2.47,1232.33,623.835,16.54,> 38000.00
3,kk_107_2014_04_15,"f12, h9",,20,13.99,105097.0,60401.8,270.81,58265.1
4,kk_113_2014_05_12,"c11, g10",,1,2.82,218.1,9647.35,102.46,> 38000.00


In [94]:
deduped_dfs[0].head()

Unnamed: 0,patient_id,well,error,concentration,HRP2_pg_ml
0,kk_103_2014_04_09,"e11, g8",,1,4.77
0,kk_103_2014_04_09,"f11, h8",,20,23.05
0,kk_107_2014_04_15,"e12, g9",,1,2.47
0,kk_107_2014_04_15,"f12, h9",,20,13.99
0,kk_113_2014_05_12,"c11, g10",,1,2.82


In [95]:
no_duplicates = samples_data.drop_duplicates(subset=['patient_id', 'concentration'], keep=False)
no_duplicates = pd.concat([no_duplicates, deduped])
no_duplicates.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,CRP_ng_ml,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pf_pg_ml,LDH_Pv_pg_ml,concentration,error,patient_id,well
15,> 38000.00,< 0.68,41.02,< 5.08,28.01,1,,qdm_1434_2019_03_07,c5
16,87928.67,< 13.60,< 327.20,< 101.60,< 93.80,20,,qdm_1434_2019_03_07,d5
16,> 38000.00,< 0.68,< 16.36,< 5.08,< 4.69,1,,qdm_1434_2019_03_08,e6
17,108725.99,< 13.60,< 327.20,< 101.60,< 93.80,20,,qdm_1434_2019_03_08,f6
18,> 38000.00,< 0.68,< 16.36,< 5.08,< 4.69,1,,qdm_1434_2019_03_09,e7


In [96]:
for err in no_duplicates['error'].unique():
    print(err)
    print(type(err))

nan
<class 'float'>


In [100]:
def return_decisions(low, high, fail='fail'):
    # Columns = [neat_above, neat_below, neat_LLQ, neat_ULQ, NA]
    # Rows = [dil_above, dil_below, dil_LLQ, dil_ULQ, NA]
    HRP2_matrix = np.array([[high, high, high, high, high],
                            [high, low, low, high, fail],
                            [high, low, low, fail, fail],
                            [high, high, fail, high, high],
                            [fail, high, high, fail, fail]])

    other_matrix = np.array([[high, low, low, high, high],
                               [high, low, low, high, fail],
                               [high, low, low, fail, fail],
                               [high, low, fail, high, high],
                               [fail, low, low, fail, fail]])

    # decisions for various analytes
    decisions = {'HRP2_pg_ml': HRP2_matrix, 'LDH_Pan_pg_ml': other_matrix,
                 'LDH_Pv_pg_ml': other_matrix, 'LDH_Pf_pg_ml': other_matrix,
                 'CRP_ng_ml': other_matrix}
    return(decisions)

In [101]:
def run_compare(df, analyte_val, dil_val):
    above, below, LLQ, ULQ, NA = False, False, False, False, False
    val = df[analyte_val]
    thresh_val = dil_constants[dil_val] * threshholds[analyte_val]
    try:
        float_val = float(val)
        if math.isnan(float_val):
            NA = True
        elif float_val > thresh_val:
            above = True
        elif float_val < thresh_val:
            below = True
    except ValueError:
        if '<' in val:
            LLQ = True
        elif '>' in val:
            ULQ = True
    finally:
        return(np.array([above, below, LLQ, ULQ, NA]))

In [102]:
analyte_dfs = []
error_pids = {}
for analyte in threshholds.keys():
# for analyte in ['HRP2_pg_ml']:
    print(analyte)
    patient_dfs = []
    for pid in no_duplicates['patient_id'].unique():
#     for pid in ['pa-001-3', 'pa-014-63', 'pa-129-14']:
        patient_data = no_duplicates.loc[no_duplicates['patient_id'] == pid]
        dilution_values = sorted([val for val in patient_data['concentration'].unique() if val != '1'], key=len)
        best_decision = '1'
        for max_dilution in dilution_values:
#         for max_dilution in ['15625000000']:
            dil_data = patient_data.loc[patient_data['concentration'].isin([best_decision, max_dilution])]          
            partial_compare = partial(run_compare, analyte_val=analyte, dil_val=max_dilution)
            dil_data['decision_vector'] = dil_data.apply(partial_compare, axis=1)
            decisions = return_decisions(best_decision, max_dilution)
            decision_matrix = decisions[analyte]
            best_df = pd.DataFrame(columns=['patient_id', 'errors', analyte,
                                            '{}_dilution'.format(analyte),
                                            '{}_well'.format(analyte),
                                            '{}_max_dilution'.format(analyte)])
            vector_low = dil_data.loc[dil_data['concentration'] == best_decision,
                                      'decision_vector'].item()
            vector_high = dil_data.loc[dil_data['concentration'] == max_dilution,
                                       'decision_vector'].item()
            decision = decision_matrix[vector_high, vector_low].item()
            if decision in [best_decision, max_dilution]:
                val = dil_data.loc[dil_data['concentration'] == decision,
                                   analyte].item()
                well = dil_data.loc[dil_data['concentration'] == decision,
                                    'well'].item()
                error = dil_data.loc[dil_data['concentration'] == decision,
                                    'error'].item()
            elif decision == 'fail':
                val = 'fail'
                well = 'fail'
                error = np.nan
                error_pids[pid] = '{} failure'.format(analyte)
            else:
                raise ValueError("Unexpected decision value: {}".format(decision))
            other_dilutions = [val for val in patient_data['concentration'].unique()]
            other_dilutions = [float(val) for val in other_dilutions if val != 'fail']
            max_dilution = int(max(other_dilutions))
            df_decision = decision if decision != 'fail' else np.nan
            best_decision = decision
            best_df = best_df.append({'patient_id': pid, 'errors': error, analyte: val,
                                      '{}_dilution'.format(analyte): df_decision,
                                      '{}_well'.format(analyte): well,
                                      '{}_max_dilution'.format(analyte): max_dilution}, ignore_index=True)
            if decision == 'fail':
                break
        patient_dfs.append(best_df)
    patient_df = pd.concat(patient_dfs)
    patient_df['errors'] = patient_df['errors'].astype('object')
    analyte_dfs.append(patient_df)
final_df = reduce(lambda left, right: pd.merge(left, right, on=['patient_id', 'errors']), analyte_dfs)

HRP2_pg_ml




LDH_Pan_pg_ml
LDH_Pv_pg_ml
LDH_Pf_pg_ml
CRP_ng_ml


In [103]:
super_final = final_df.copy(deep=True)
for pid in error_pids.keys():
    error = error_pids[pid]
    pid_df = final_df.loc[final_df['patient_id'] == pid]
    pid_df['errors'] = pid_df['errors'].apply(lambda x: error if np.isnan(x) else x + ' ' + error)
    if len(pid_df) > 0:
        super_final = super_final.loc[super_final['patient_id'] != pid]
        super_final = super_final.append(pid_df)
super_final.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,patient_id,errors,HRP2_pg_ml,HRP2_pg_ml_dilution,HRP2_pg_ml_well,HRP2_pg_ml_max_dilution,LDH_Pan_pg_ml,LDH_Pan_pg_ml_dilution,LDH_Pan_pg_ml_well,LDH_Pan_pg_ml_max_dilution,...,LDH_Pv_pg_ml_well,LDH_Pv_pg_ml_max_dilution,LDH_Pf_pg_ml,LDH_Pf_pg_ml_dilution,LDH_Pf_pg_ml_well,LDH_Pf_pg_ml_max_dilution,CRP_ng_ml,CRP_ng_ml_dilution,CRP_ng_ml_well,CRP_ng_ml_max_dilution
328,tk_3_2014_03_05,,6.3,1.0,"g5, c5",20,519.745,1,"g5, c5",20,...,"g5, c5",20,16.215,1,"g5, c5",20,95046.1,20.0,"h5, d5",20
329,tk_7_2014_03_10,,1.7,1.0,"g3, c9",20,628.41,1,"g3, c9",20,...,"g3, c9",20,7.945,1,"g3, c9",20,299864,20.0,"h3, d9",20
10,qdm_1446_2019_03_14,HRP2_pg_ml failure,fail,,fail,20,< 16.36,1,g5,20,...,g5,20,< 5.08,1,g5,20,27378.22,20.0,h5,20
56,qem_150_2011_03_22,CRP_ng_ml failure,5.98,1.0,c8,20,39.47,1,c8,20,...,c8,20,23.05,1,c8,20,fail,,fail,20
85,qem_205_2011_05_06,CRP_ng_ml failure,> 56000.00,20.0,h12,20,60698.7,1,g12,20,...,g12,20,145979.26,20,h12,20,fail,,fail,20


In [104]:
super_final['errors'].unique()

array([nan, 'HRP2_pg_ml failure', 'CRP_ng_ml failure'], dtype=object)

In [105]:
final_ids = super_final['patient_id'].unique().tolist()
final_set = set(final_ids)
assert final_set == sample_set, print(sample_set - final_set)

In [108]:
def split_time(df):
    sub = df['patient_id'].split('_')
    try:
        time = '_'.join(sub[2:])
        return time
    except IndexError:
        return 0

In [111]:
def remove_time(df):
    patient = df['patient_id'].split('_')
    return '_'.join(patient[0:2])

In [112]:
time_df = super_final.copy(deep=True)
time_df['date'] = time_df.apply(split_time, axis=1)
time_df['patient_id'] = time_df.apply(remove_time, axis=1)
time_df.sort_values(['patient_id', 'date'], inplace=True)
time_df.set_index(['patient_id', 'date'], inplace=True)
time_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,errors,HRP2_pg_ml,HRP2_pg_ml_dilution,HRP2_pg_ml_well,HRP2_pg_ml_max_dilution,LDH_Pan_pg_ml,LDH_Pan_pg_ml_dilution,LDH_Pan_pg_ml_well,LDH_Pan_pg_ml_max_dilution,LDH_Pv_pg_ml,...,LDH_Pv_pg_ml_well,LDH_Pv_pg_ml_max_dilution,LDH_Pf_pg_ml,LDH_Pf_pg_ml_dilution,LDH_Pf_pg_ml_well,LDH_Pf_pg_ml_max_dilution,CRP_ng_ml,CRP_ng_ml_dilution,CRP_ng_ml_well,CRP_ng_ml_max_dilution
patient_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
kk_103,2014_04_09,,4.77,1,"e11, g8",20,66763.83,1,"e11, g8",20,18466.7,...,"e11, g8",20,76.655,1,"e11, g8",20,46420.38,20,"f11, h8",20
kk_107,2014_04_15,,2.47,1,"e12, g9",20,1232.33,1,"e12, g9",20,623.835,...,"e12, g9",20,16.54,1,"e12, g9",20,58265.1,20,"f12, h9",20
kk_113,2014_05_12,,2.82,1,"c11, g10",20,218.1,1,"c11, g10",20,9647.35,...,"c11, g10",20,102.46,1,"c11, g10",20,68987.6,20,"d11, h10",20
kk_114,2014_05_12,,1.48,1,"c12, g11",20,160.115,1,"c12, g11",20,67.535,...,"c12, g11",20,9.36,1,"c12, g11",20,57048.8,20,"d12, h11",20
kk_115,2014_05_13,,4.25,1,"e1, g12",20,33460.9,1,"e1, g12",20,9847.58,...,"e1, g12",20,50.45,1,"e1, g12",20,218800.74,20,"f1, h12",20


In [114]:
time_df.to_csv('C:/Users/lzoeckler/Desktop/final_dilutions.csv')

In [142]:
partial_format = samples_data.copy(deep=True)
partial_format['time_point_days'] = partial_format.apply(split_time, axis=1)
partial_format['patient_id'] = partial_format.apply(remove_time, axis=1)
partial_format = partial_format.merge(add_info, how='left', on=['patient_id', 'time_point_days'])
partial_format.sort_values(['patient_id', 'time_point_days'], inplace=True)
partial_format.set_index(['patient_id', 'time_point_days'], inplace=True)
partial_format.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,well,error,HRP2_pg_ml,LDH_Pan_pg_ml,LDH_Pv_pg_ml,CRP_ng_ml,concentration,day0_enrollee,days_since_tx,initial_sample,got_PQ,age,sex,fever48_r,enrolled_from,returned_with_fever,when_returned_with_fever,retreated,when_retreated
patient_id,time_point_days,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
pa-001,3,g6,,145.95,142.30,15.35,5678.58,1,1.0,3.0,,1.0,2.0,female,1.0,health facility,,,,
pa-001,3,f3,,397468.45,< 1801250.00,< 291250.00,< 3342500.00,125000,1.0,3.0,,1.0,2.0,female,1.0,health facility,,,,
pa-001,3,e3,,318702.59,74360.14,10385.14,< 66850.00,2500,1.0,3.0,,1.0,2.0,female,1.0,health facility,,,,
pa-001,3,h3,,1537195777.01,119019730410.15,16185275422.09,286153421264.94,312500000,1.0,3.0,,1.0,2.0,female,1.0,health facility,,,,
pa-001,3,h6,,10273.61,2535.11,733.44,49491.76,50,1.0,3.0,,1.0,2.0,female,1.0,health facility,,,,


In [143]:
partial_format.to_csv('C:/Users/lzoeckler/Desktop/4plex/output_data/partially_formatted.csv')