In [1]:
import numpy as np
import pandas as pd

biop_covariates = pd.read_csv('./data/biop_clinical.csv', delimiter = ',')
biol_covariates = pd.read_csv('./data/biol_clinical.csv', delimiter = ',')

acpa_dataframe = pd.read_csv('./data/patient_acpa_data.csv', delimiter = ',')

full_dataframe = biop_covariates.append(biol_covariates, ignore_index = True)

In [2]:
import functools

das_bins = {
    0: {0: 'Good', 1: 'Moderate', 2: 'Moderate'}, 
    1: {0: 'Moderate', 1: 'Moderate', 2: 'None'},
    2: {0: 'Moderate', 1: 'None', 2: 'None'}
}

def get_das_bin(das_fu, delta_das):
    f_up_bin = 1
    delta_bin = 1
    
    if das_fu <= 3.2:
        f_up_bin = 0
    elif das_fu > 5.2:
        f_up_bin = 2
        
    if delta_das > 1.2:
        delta_bin = 0
    elif delta_das <= 0.6:
        delta_bin = 2
        
    return das_bins[f_up_bin][delta_bin]

def write_eular_phenotype_file(df, das_type, non0_columns):    
    outcomes_das = []
    
    das28_idx = np.where(np.logical_and(~pd.isnull(df[f'das28{das_type}.0']), ~pd.isnull(df[f'das28{das_type}.2'])))[0]
    das28_outcomes = df.iloc[das28_idx].reset_index(drop = True)
    
    for index, row in das28_outcomes.iterrows():
        outcome_das = {
            'IID': row['sample_id']
        }
        
        das = row[f'das28{das_type}.0']
        das_fu = row[f'das28{das_type}.2']
        delta_das = das - das_fu

        outcome_das['delta_das'] = delta_das
        outcome_das['eular_bin'] = get_das_bin(das_fu, delta_das)

        outcomes_das.append(outcome_das)
        
    das28_df = pd.DataFrame(outcomes_das, index = np.arange(len(outcomes_das)))
    das28_df = df.merge(das28_df, how = 'right', left_on = 'sample_id', right_on = 'IID')
    
    das28_df = das28_df.astype({'eular_bin': 'category'})
    das28_df['class'] = das28_df['eular_bin'].cat.codes

    bio_idx = np.where([bio in ['adalimumab', 'etanercept', 'infliximab'] for bio in das28_df['BIO']])
    das28_df = das28_df.iloc[bio_idx].reset_index(drop = True)
    
    das28_df['das_type'] = das_type
    
    idx = [pd.isnull(das28_df[x]) for x in non0_columns]
    non0_idx = np.where(np.logical_not(functools.reduce(np.logical_or, idx)))[0]
    
    das28_df = das28_df.iloc[non0_idx].reset_index(drop = True)
    das28_df.merge(acpa_dataframe, how = 'left')
    
    print(das28_df.shape, f'samples before cleaing {das_type} df')
    
    # Drop samples that are missing all covariates
    idx = [pd.isnull(das28_df[x]) for x in ['FIRSTBIO', 'WEIGHT', 'HEIGHT', 'DISDUR', 'SMOKE', 'BIO', 'AGEONSET', 'HAQ', 'SEX', 'SERO', 'CONCURRENT_DMARD']]
    
    drop_idx = np.where(np.logical_not(functools.reduce(np.logical_and, idx)))[0]
    das28_df = das28_df.iloc[drop_idx].reset_index(drop = True)
    
    print(das28_df.shape, f'left after cleaing {das_type} df')
    
    for col in ['FIRSTBIO', 'WEIGHT', 'HEIGHT', 'DISDUR', 'SMOKE', 'BIO', 'AGEONSET', 'HAQ', 'SEX', 'SERO', 'CONCURRENT_DMARD']:
        n_mis = (np.count_nonzero(pd.isnull(das28_df[col])) / das28_df.shape[0]) * 100
        
        print(f'{col} is missing for {n_mis:2.4f}% of samples')
        
    das28_df.to_csv(f'./data/das28_BIOP_{das_type}_outcomes.csv', index = False)

In [3]:
write_eular_phenotype_file(biop_covariates, 'crp', ['das_tend.0', 'das_vas.0', 'das_swol.0', 'crp.0', 'BIO', 'SMOKE'])

(623, 33) samples before cleaing crp df
(623, 33) left after cleaing crp df
FIRSTBIO is missing for 0.3210% of samples
WEIGHT is missing for 7.5441% of samples
HEIGHT is missing for 16.8539% of samples
DISDUR is missing for 1.7657% of samples
SMOKE is missing for 0.0000% of samples
BIO is missing for 0.0000% of samples
AGEONSET is missing for 1.7657% of samples
HAQ is missing for 14.1252% of samples
SEX is missing for 0.6421% of samples
SERO is missing for 2.8892% of samples
CONCURRENT_DMARD is missing for 2.2472% of samples


In [4]:
write_eular_phenotype_file(biop_covariates, 'esr', ['das_tend.0', 'das_vas.0', 'das_swol.0', 'esr.0', 'BIO', 'SMOKE'])

(569, 33) samples before cleaing esr df
(569, 33) left after cleaing esr df
FIRSTBIO is missing for 0.5272% of samples
WEIGHT is missing for 7.2056% of samples
HEIGHT is missing for 15.4657% of samples
DISDUR is missing for 1.9332% of samples
SMOKE is missing for 0.0000% of samples
BIO is missing for 0.0000% of samples
AGEONSET is missing for 1.9332% of samples
HAQ is missing for 15.2900% of samples
SEX is missing for 0.7030% of samples
SERO is missing for 2.8120% of samples
CONCURRENT_DMARD is missing for 2.1090% of samples
