In [1]:
import pandas as pd
import numpy as np

# Load the raw Stata table and report its (rows, columns) shape.
biop_stata_path = "../../data/braggss/analysis_elig_dayson.dta"
biop_data = pd.read_stata(biop_stata_path)

print(biop_data.shape)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


(10976, 328)


In [2]:
# Comma-separated table that is later left-joined onto the clinical data.
acpa_dataframe = pd.read_csv('./data/patient_acpa_data.csv', sep=',')

In [3]:
# Split the raw table into one frame per distinct `fupno` value.
fupno_vals = np.unique(biop_data['fupno'])

biop_data_dir = {}
for n in fupno_vals:
    visit_mask = (biop_data['fupno'] == n).to_numpy()
    # reset_index() without drop=True keeps the original row labels in an
    # extra `index` column and gives each per-visit frame a fresh 0..k-1 index.
    biop_data_dir[n] = biop_data.loc[visit_mask].reset_index()

In [4]:
# Columns taken from every visit; each gets a `.<fupno>` suffix so that
# visits can be merged side by side later.
das_columns = ['das_swol', 'das_tend', 'das_pat_global', 'das_crp', 'das_esr', 'crp_bioC']

# Columns taken only from the visit with fupno == 0.
baseline_columns = ['sample_id', 'acr_rfpos', 'first_bio', 'weight_intbl_baseline', 'height', 'disease_duration', 'smoking', 'drug_id_intbl_bio', 'age_onset', 'current_dmard', 'gender', 'haq_score']

biop_data_sets = []
for n in fupno_vals:
    leading_columns = baseline_columns if n == 0 else ['sample_id']
    columns = [*leading_columns, *das_columns]

    rename_columns = {name: f'{name}.{n}' for name in das_columns}

    # One frame per visit, appended in the order of `fupno_vals`.
    biop_data_sets.append(biop_data_dir[n][columns].rename(columns=rename_columns))

In [5]:
# Combined baseline CRP: take `crp_bioC.0` and fall back to `das_crp.0` where
# the former is missing.
baseline_visit = biop_data_sets[0]

# Build a new Series instead of writing into the array from `.to_numpy()`:
# that array may be a view of the frame (so in-place writes would silently
# overwrite `crp_bioC.0`), and it is read-only when pandas copy-on-write is on.
bl_crp = baseline_visit['crp_bioC.0'].fillna(baseline_visit['das_crp.0'])

# Negative values are treated as missing.
bl_crp = bl_crp.where(bl_crp >= 0)

baseline_visit['das_crp_comb.0'] = bl_crp

In [6]:
# Combined follow-up CRP: take `crp_bioC.2` and fall back to `das_crp.2` where
# the former is missing.
followup_visit = biop_data_sets[2]

# Build a new Series instead of writing into the array from `.to_numpy()`:
# that array may be a view of the frame (so in-place writes would silently
# overwrite `crp_bioC.2`), and it is read-only when pandas copy-on-write is on.
fu_crp = followup_visit['crp_bioC.2'].fillna(followup_visit['das_crp.2'])

# Negative values are treated as missing.
fu_crp = fu_crp.where(fu_crp >= 0)

followup_visit['das_crp_comb.2'] = fu_crp

In [7]:
# Start from the first per-visit frame (drug id cast to text), then left-join
# the listed follow-up frame(s) on `sample_id`.
followup_positions = [2]

biop_output_data = biop_data_sets[0].astype({'drug_id_intbl_bio': 'str'})
for position in followup_positions:
    biop_output_data = pd.merge(
        biop_output_data,
        biop_data_sets[position],
        how='left',
        on='sample_id',
    )

In [8]:
# Left-join the ACPA table. No `on=` is given, so pandas joins on every column
# name the two frames share.
biop_output_data = pd.merge(biop_output_data, acpa_dataframe, how='left')

In [9]:
# Recode `first_bio` as text flags: 'yes' -> '1', 'no' -> '0'; the string
# 'nan' produced by the cast is turned back into a real missing value.
first_bio_codes = {'yes': '1', 'no': '0', 'nan': np.nan}

bio = biop_output_data['first_bio'].astype('str')
bio = bio.replace(first_bio_codes)

biop_output_data['FIRSTBIO'] = bio

In [10]:
# Recode `gender` as text flags: 'male' -> '0', 'female' -> '1'; the string
# 'nan' produced by the cast is turned back into a real missing value.
gender_codes = {'male': '0', 'female': '1', 'nan': np.nan}

gender = biop_output_data['gender'].astype('str')
gender = gender.replace(gender_codes)

biop_output_data['SEX'] = gender

In [11]:
# Collapse the `smoking` labels to 'current' / 'past' / 'never' by prefix.
# Any label matching none of the prefixes is left unchanged.
smoke = biop_output_data['smoking'].astype('str')

smoking_prefixes = (('current', 'current'), ('ex', 'past'), ('never', 'never'))
for prefix, label in smoking_prefixes:
    matches_prefix = smoke.str.startswith(prefix)
    smoke[matches_prefix] = label

# The cast above turned missing values into the string 'nan'; undo that.
smoke = smoke.replace('nan', np.nan)

biop_output_data['SMOKE'] = smoke

In [12]:
# Display the sorted distinct values of `drug_id_intbl_bio`, cast to strings.
np.unique(biop_output_data['drug_id_intbl_bio'].astype('str'))

array(['10436.0', '10437.0', '10449.0', 'BS Etan(Benepali)',
       'BS Etan(Erelzi)', 'BS Inflix(Inflectra)', 'BS Inflix(Remsima)',
       'BS Ritux(Rixathon)', 'BS Ritux(Truxima)',
       'Baricitinib (Olumiant)', 'Biosimilar Adalimumab (Amgevita)',
       'Certolizumab', 'Other biologic', 'Tofacit(Xeljanz)', 'abatacept',
       'adalimumab', 'etanercept', 'golimumab', 'infliximab', 'nan',
       'rituximab', 'tocilizumab'], dtype=object)

In [13]:
def _canonical_bio(drug_name):
    """Collapse one raw `drug_id_intbl_bio` label into its reporting group.

    Labels equal to a group name, or containing that group's marker substring
    (e.g. 'Etan' for biosimilar etanercept labels), map to the group name.
    'abatacept', 'golimumab', 'Certolizumab' and 'tocilizumab' are kept as is.
    Every other non-missing label becomes 'Other'.

    A missing label (NaN, or the string 'nan' left by a string cast) is
    returned as NaN rather than 'Other', so missing drug information stays
    missing, as in the other recoded columns of this notebook.
    """
    if pd.isnull(drug_name) or drug_name == 'nan':
        return np.nan

    # (group name, substring identifying that group's variants); order matters.
    substring_groups = (
        ('adalimumab', 'Adalimumab'),
        ('etanercept', 'Etan'),
        ('rituximab', 'Ritux'),
        ('infliximab', 'Inflix'),
    )
    for group, marker in substring_groups:
        if drug_name == group or marker in drug_name:
            return group

    if drug_name in ('abatacept', 'golimumab', 'Certolizumab', 'tocilizumab'):
        return drug_name

    return 'Other'


# Map element-wise instead of assigning `bio[idx]` inside an enumerate loop:
# that form used the enumerate position as an index *label*.
bio = biop_output_data['drug_id_intbl_bio'].astype('str').map(_canonical_bio)

biop_output_data['BIO'] = bio

In [14]:
# Recode `current_dmard` as text flags: 'yes' -> '1', 'no' -> '0'.
# Both 'nan' (from the string cast) and '9.0' become missing values.
dmard_codes = {'yes': '1', 'no': '0', 'nan': np.nan, '9.0': np.nan}

dmard = biop_output_data['current_dmard'].astype('str')
dmard = dmard.replace(dmard_codes)

biop_output_data['CONCURRENT_DMARD'] = dmard

In [15]:
# Short aliases for the `acr_rfpos` and `acpa_positive` columns.
rf_pos, acpa = (biop_output_data[column] for column in ('acr_rfpos', 'acpa_positive'))

In [16]:
# Combine `rf_pos` and `acpa` into one flag per row:
#   both present -> logical OR of the two
#   one present  -> that value
#   none present -> NaN (the initial fill value)
sero_pos = np.full(biop_output_data.shape[0], np.nan)

has_rf = pd.notnull(rf_pos).to_numpy()
has_acpa = pd.notnull(acpa).to_numpy()

both_rows = np.flatnonzero(has_rf & has_acpa)
sero_pos[both_rows] = np.logical_or(rf_pos.iloc[both_rows], acpa.iloc[both_rows])

rf_only_rows = np.flatnonzero(has_rf & ~has_acpa)
sero_pos[rf_only_rows] = rf_pos.iloc[rf_only_rows]

acpa_only_rows = np.flatnonzero(~has_rf & has_acpa)
sero_pos[acpa_only_rows] = acpa.iloc[acpa_only_rows]

biop_output_data['SERO'] = sero_pos

In [17]:
# Final column names used in the exported table.
output_names = {
    'das_pat_global.0': 'das_vas.0',
    'das_pat_global.2': 'das_vas.2',
    'acr_rfpos': 'RFPOS',
    'weight_intbl_baseline': 'WEIGHT',
    'height': 'HEIGHT',
    'disease_duration': 'DISDUR',
    'age_onset': 'AGEONSET',
    'haq_score': 'HAQ',
    'acpa_positive': 'ACPA',
    'das_crp_comb.0': 'crp.0',
    'das_crp_comb.2': 'crp.2',
    'das_esr.0': 'esr.0',
    'das_esr.2': 'esr.2',
}
biop_output_data = biop_output_data.rename(columns=output_names)

In [18]:
# Display the column labels of the frame after renaming.
biop_output_data.columns

Index(['sample_id', 'RFPOS', 'first_bio', 'WEIGHT', 'HEIGHT', 'DISDUR',
       'smoking', 'drug_id_intbl_bio', 'AGEONSET', 'current_dmard', 'gender',
       'HAQ', 'das_swol.0', 'das_tend.0', 'das_vas.0', 'das_crp.0', 'esr.0',
       'crp_bioC.0', 'crp.0', 'das_swol.2', 'das_tend.2', 'das_vas.2',
       'das_crp.2', 'esr.2', 'crp_bioC.2', 'crp.2', 'ACPA', 'FIRSTBIO', 'SEX',
       'SMOKE', 'BIO', 'CONCURRENT_DMARD', 'SERO'],
      dtype='object')

In [19]:
import functools

def calc_esr_das(tjc, sjc, ghvas, esr):
    """Return the ESR-based DAS score for the given components.

    Computes 0.56*sqrt(tjc) + 0.28*sqrt(sjc) + 0.70*ln(esr + 1e-8) + 0.014*ghvas.
    The 1e-8 offset keeps the logarithm finite when `esr` is 0. Inputs may be
    scalars or array-likes accepted by numpy; NaN inputs yield NaN.
    """
    tender_term = 0.56 * np.sqrt(tjc)
    swollen_term = 0.28 * np.sqrt(sjc)
    esr_term = 0.70 * np.log(esr + 1e-8)
    global_term = 0.014 * ghvas
    return tender_term + swollen_term + esr_term + global_term

def calc_crp_das(tjc, sjc, ghvas, crp):
    """Return the CRP-based DAS score for the given components.

    Computes 0.56*sqrt(tjc) + 0.28*sqrt(sjc) + 0.36*ln(crp + 1) + 0.014*ghvas + 0.96.
    Inputs may be scalars or array-likes accepted by numpy; NaN inputs yield NaN.
    """
    tender_term = 0.56 * np.sqrt(tjc)
    swollen_term = 0.28 * np.sqrt(sjc)
    crp_term = 0.36 * np.log(crp + 1)
    global_term = 0.014 * ghvas
    return tender_term + swollen_term + crp_term + global_term + 0.96

# DAS scores (CRP- and ESR-based) for visits .0 and .2, computed column-wise.
#
# A missing component needs no special handling: NaN propagates through
# np.sqrt / np.log, so the score is NaN whenever any input is NaN.
# The earlier per-row guard `~pd.isnull(value)` never skipped anything, because
# `~` on a Python bool is a bitwise NOT (~True == -2, ~False == -1, both
# truthy). That operation is also deprecated since Python 3.12.
das_score_specs = (
    # (output column, scoring function, [tender, swollen, global, marker] inputs)
    ('das28crp.0', calc_crp_das, ['das_tend.0', 'das_swol.0', 'das_vas.0', 'crp.0']),
    ('das28crp.2', calc_crp_das, ['das_tend.2', 'das_swol.2', 'das_vas.2', 'crp.2']),
    ('das28esr.0', calc_esr_das, ['das_tend.0', 'das_swol.0', 'das_vas.0', 'esr.0']),
    ('das28esr.2', calc_esr_das, ['das_tend.2', 'das_swol.2', 'das_vas.2', 'esr.2']),
)

das_data = pd.DataFrame({'sample_id': biop_output_data['sample_id']})

for score_column, score_fn, input_columns in das_score_specs:
    # Cast to float64 so narrower stored floats are scored at the same
    # precision as the scalar-by-scalar computation used before.
    tjc, sjc, ghvas, marker = (
        biop_output_data[column].astype('float64') for column in input_columns
    )
    das_data[score_column] = score_fn(tjc, sjc, ghvas, marker)

das_data = das_data.reset_index(drop=True)

biop_output_data = biop_output_data.merge(das_data, on = 'sample_id', how = 'left')

In [20]:
# Columns written to the exported CSV, in file order.
output_columns = [
    'sample_id',
    # score components per visit
    'das_swol.0', 'das_tend.0', 'das_vas.0', 'crp.0', 'esr.0',
    'das_swol.2', 'das_tend.2', 'das_vas.2', 'crp.2', 'esr.2',
    # derived scores
    'das28crp.0', 'das28crp.2', 'das28esr.0', 'das28esr.2',
    # remaining columns
    'RFPOS', 'FIRSTBIO', 'WEIGHT', 'HEIGHT', 'DISDUR', 'SMOKE', 'BIO',
    'AGEONSET', 'CONCURRENT_DMARD', 'SEX', 'HAQ', 'ACPA', 'SERO',
]

export_frame = biop_output_data.loc[:, output_columns]
export_frame.to_csv('./data/biop_clinical.csv', index=False)