In [1]:
import pandas as pd
import numpy as np

# Load .fam file
biol_data = pd.read_stata("../../data/braggss/clinical_covariates/raw/biol_analysis_eligible.dta") 

print(biol_data.shape)

(2556, 86)


In [2]:
sex_data = pd.read_csv('../../data/braggss/outcomes/braggss.sex', delimiter = ' ', header = None)

In [3]:
# 1 = Male, 2 = Female

sex_data = pd.read_csv('../../data/braggss/outcomes/braggss.sex', delimiter = ' ', header = None)
sex_data['SEX'] = sex_data[2] - 1
sex_data['SEX'].replace(-1, np.nan, inplace = True)

biol_data = biol_data.merge(sex_data[[0, 'SEX']], left_on = 'sample_id', right_on = 0, how = 'left')

In [4]:
wtc_mapping = np.char.decode(np.loadtxt('../../data/braggss/wtccc-braggss.sampleids', delimiter = '\t', dtype = np.string_))

print(wtc_mapping.shape)

(595, 4)


In [5]:
wtc_mapping

array([['BIOL0001', 'BIOL0001', 'WTCCC171791', 'WTCCC171791'],
       ['BIOL0003', 'BIOL0003', 'WTCCC171792', 'WTCCC171792'],
       ['BIOL0004', 'BIOL0004', 'WTCCC171793', 'WTCCC171793'],
       ...,
       ['BIOL1206', 'BIOL1206', 'WTCCC188809', 'WTCCC188809'],
       ['BIOL1207', 'BIOL1207', 'WTCCC188726', 'WTCCC188726'],
       ['BIOL1210', 'BIOL1210', 'WTCCC188738', 'WTCCC188738']],
      dtype='<U11')

In [6]:
c_fam_ids = biol_data['sample_id'].to_numpy()

for n in range(c_fam_ids.shape[0]):
    iid = c_fam_ids[n]
    
    wtc_idx = np.where(wtc_mapping[:, 0] == iid)[0]
    if(len(wtc_idx) > 0):
        c_fam_ids[n] = wtc_mapping[wtc_idx[0], 2]

# Overwrite sample ids in file with samples ids used in fam file (WTC IDs)
biol_data['sample_id'] = c_fam_ids

In [7]:
c_fam_ids

array(['WTCCC171791', 'BIOL0002', 'WTCCC171792', ..., 'BIOL2987',
       'BIOL2988', 'BIOL2990'], dtype=object)

In [8]:
biol_data

Unnamed: 0,studyno,sample_id,datend,daswol,daesr,dacrp,daglob,dascore,datend_fup1,daswol_fup1,...,eularmod,noeularmod,modeularmod,goodeularmod,onMTX,inflix,etan,adalim,0,SEX
0,0000200,WTCCC171791,19.0,15.0,42.0,10.0,29.0,6.547285,26.0,0.0,...,0.0,1.0,0.0,0.0,on MTX,1.0,0.0,0.0,,
1,0000206,BIOL0002,18.0,13.0,128.0,107.0,50.0,7.485063,28.0,24.0,...,0.0,1.0,0.0,0.0,on MTX,1.0,0.0,0.0,,
2,0000207,WTCCC171792,28.0,15.0,110.0,90.0,,8.470000,24.0,0.0,...,1.0,0.0,1.0,0.0,,1.0,0.0,0.0,,
3,0000208,WTCCC171793,15.0,7.0,95.0,105.0,38.0,6.628213,28.0,17.0,...,0.0,1.0,0.0,0.0,on MTX,1.0,0.0,0.0,,
4,0000209,BIOL0005,20.0,18.0,101.0,130.0,91.0,8.209730,0.0,0.0,...,2.0,0.0,0.0,1.0,on MTX,1.0,0.0,0.0,BIOL0005,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,0013930,BIOL2985,11.0,10.0,31.0,,60.0,5.994605,,,...,2.0,0.0,0.0,1.0,,0.0,0.0,1.0,,
2552,0001781,BIOL2986,28.0,28.0,43.0,,85.0,8.279411,6.0,6.0,...,1.0,0.0,1.0,0.0,on MTX,1.0,0.0,0.0,,
2553,0009789,BIOL2987,10.0,22.0,5.0,,60.0,5.065749,0.0,2.0,...,2.0,0.0,0.0,1.0,,0.0,0.0,1.0,,
2554,0004360,BIOL2988,23.0,12.0,74.0,,70.0,7.652337,6.0,1.0,...,1.0,0.0,1.0,0.0,,0.0,0.0,1.0,,


In [9]:
first_bio = biol_data['firstbio'].astype('str')

first_bio.replace('9', np.nan, inplace = True)

biol_data['FIRSTBIO'] = first_bio

In [10]:
rfpos = biol_data['acrrpos'].astype('str')

rfpos.replace('9', np.nan, inplace = True)

biol_data['RFPOS'] = rfpos

In [11]:
np.unique(biol_data['acrrpos'].astype('str'), return_counts = True)

(array(['0', '1', '9'], dtype=object), array([ 974, 1579,    3]))

In [12]:
bio = biol_data['pribio'].astype('str')

for idx, val in enumerate(bio):
    if val == '1':
        val = 'etanercept'
    elif val == '2':
        val = 'infliximab'
    elif val == '4':
        val = 'adalimumab'
        
    bio[idx] = val
    
biol_data['BIO'] = bio

In [13]:
biol_data = biol_data.rename(columns = {'datend': 'das_tend.0', 'daswol': 'das_swol.0', 'daesr': 'das_esr.0', 'dacrp': 'das_crp.0', 'daglob': 'das_vas.0', 
                                        'datend_fup1':'das_tend.2', 'daswol_fup1': 'das_swol.2', 'daesr_fup1': 'das_esr.2', 'dacrp_fup1': 'das_crp.2', 'daglob_fup1': 'das_vas.2', 
                                        'weight1': 'WEIGHT', 'height1': 'HEIGHT', 'disdur': 'DISDUR',
                                        'smokestat': 'SMOKE', 'ageatonset': 'AGEONSET', 'curdmard': 'CONCURRENT_DMARD', 'ovmean': 'HAQ'
                                        })

In [14]:
import functools

def calc_esr_das(tjc, sjc, ghvas, esr):
    return (56 / 100) * np.sqrt(tjc) + (28 / 100) * np.sqrt(sjc) + (70 / 100) * np.log(esr + 1e-8) + (14 / 1000) * ghvas

def calc_crp_das(tjc, sjc, ghvas, crp):
    return (56 / 100) * np.sqrt(tjc) + (28 / 100) * np.sqrt(sjc) + (36 / 100) * np.log(crp + 1) + (14 / 1000) * ghvas + 0.96

das_data = []

for index, row in biol_data.iterrows():
    data = {'sample_id': row['sample_id']}
    
    crp_bl = [~pd.isnull(row[x]) for x in ['das_swol.0', 'das_tend.0', 'das_vas.0', 'das_crp.0']]
    crp_fu = [~pd.isnull(row[x]) for x in ['das_swol.2', 'das_tend.2', 'das_vas.2', 'das_crp.2']]
    esr_bl = [~pd.isnull(row[x]) for x in ['das_swol.0', 'das_tend.0', 'das_vas.0', 'das_esr.0']]
    esr_fu = [~pd.isnull(row[x]) for x in ['das_swol.2', 'das_tend.2', 'das_vas.2', 'das_esr.2']]
    if functools.reduce(np.logical_and, crp_bl):
        crp_bl_das = calc_crp_das(row['das_tend.0'], row['das_swol.0'], row['das_vas.0'], row['das_crp.0'])
        
        data['das28crp.0'] = crp_bl_das
    if functools.reduce(np.logical_and, crp_fu):
        crp_fu_das = calc_crp_das(row['das_tend.2'], row['das_swol.2'], row['das_vas.2'], row['das_crp.2'])
        
        data['das28crp.2'] = crp_fu_das
    if functools.reduce(np.logical_and, esr_bl):
        esr_bl_das = calc_esr_das(row['das_tend.0'], row['das_swol.0'], row['das_vas.0'], row['das_esr.0'])
        
        data['das28esr.0'] = esr_bl_das
    if functools.reduce(np.logical_and, esr_fu):
        esr_fu_das = calc_esr_das(row['das_tend.2'], row['das_swol.2'], row['das_vas.2'], row['das_esr.2'])
        
        data['das28esr.2'] = esr_fu_das
        
    das_data.append(data)
    
das_data = pd.DataFrame(das_data, index = np.arange(len(das_data)))
    
biol_data = biol_data.merge(das_data, on = 'sample_id', how = 'left')

In [15]:
output_columns = ['sample_id', 'das_swol.0', 'das_tend.0', 'das_vas.0', 'das_crp.0', 'das_esr.0', 'das_swol.2', 'das_tend.2', 'das_vas.2', 'das_crp.2', 'das_esr.2', 
                  'das28crp.0', 'das28crp.2', 'das28esr.0', 'das28esr.2', 'RFPOS', 'FIRSTBIO', 'WEIGHT', 'HEIGHT', 'DISDUR', 'SMOKE', 'BIO', 'AGEONSET', 'CONCURRENT_DMARD', 'SEX', 'HAQ']

biol_data[output_columns].to_csv('./data/biol_clinical.csv')