# Make RA data

In [1]:
import pandas as pd

## Load data

In [2]:
# Load the RA data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this path must only
# ever point at a trusted file.
data = pd.read_pickle('../data/ra_data.pkl')

In [3]:
# Fold the IL-1 therapy labels into the generic 'other therapies' label.
# The replacement is applied in place and to every column of `data`.
data.replace(
    to_replace=['il-1 mono', 'il-1 combo'],
    value='other therapies',
    inplace=True,
)

In [4]:
def rename_therapies(therapies):
    """Translate raw therapy labels into display names.

    Parameters
    ----------
    therapies : iterable of str
        Raw lowercase therapy labels (e.g. ``'tnfi mono'``). Labels that are
        already display names (e.g. ``'TNFi mono'``) are passed through
        unchanged, so re-running the renaming on already renamed data is safe.

    Returns
    -------
    pandas.Categorical
        The display names, with categories ordered as in the mapping below and
        restricted to the categories that actually occur in `therapies`.

    Raises
    ------
    KeyError
        If a label is neither a known raw label nor a known display name.
    """
    therapy_mapper = {
        'abatacept combo': 'Abatacept combo',
        'abatacept mono': 'Abatacept mono',
        'il-1 combo': 'IL-1Ri combo',
        'il-1 mono': 'IL-1Ri mono',
        'il-6 combo': 'IL-6Ri combo',
        'il-6 mono': 'IL-6Ri mono',
        'rituximab combo': 'Rituximab combo',
        'rituximab mono': 'Rituximab mono',
        'tnfi combo': 'TNFi combo',
        'tnfi mono': 'TNFi mono',
        'jaki combo': 'JAKi combo',
        'jaki mono': 'JAKi mono',
        'csdmard combo': 'csDMARD combo',
        'csdmard mono': 'csDMARD mono',
        'no dmard': 'No DMARD',
        'other therapies': 'Other',
    }

    # Materialize the display names once: they define the category order and
    # let already renamed labels be recognized.
    display_names = list(therapy_mapper.values())
    already_renamed = set(display_names)

    def to_display_name(label):
        if label in therapy_mapper:
            return therapy_mapper[label]
        if label in already_renamed:
            return label
        raise KeyError(f'Unknown therapy label: {label!r}')

    new_therapies = pd.Categorical(
        [to_display_name(t) for t in therapies],
        categories=display_names,
    )

    return new_therapies.remove_unused_categories()

# Overwrite the therapy column with its display-name categorical.
data['therapy'] = rename_therapies(data['therapy'])

## Select features

### Demographics

In [5]:
# Demographic columns taken from the raw data.
demographics = ['age', 'gender', 'race', 'final_education', 'work_status']

X_demographics = data.loc[:, demographics]

Where possible, missing values are imputed by forward-filling the most recent previously observed value within the same `id` group.

In [6]:
# Forward-fill each demographic column within each `id` group.
# `GroupBy.ffill()` is used instead of `GroupBy.fillna(method='ffill')`,
# which is deprecated in recent pandas releases.
X_demographics = X_demographics.groupby(by=data.id).ffill()

### Insurance

In [7]:
# Insurance indicator columns taken from the raw data.
insurance = [
    'insurance_private', 'insurance_medicare',
    'insurance_medicaid', 'insurance_none',
]

X_insurance = data.loc[:, insurance]

Where possible, missing values are imputed by forward-filling the most recent previously observed value within the same `id` group.

In [8]:
# Forward-fill each insurance column within each `id` group.
# `GroupBy.ffill()` is used instead of `GroupBy.fillna(method='ffill')`,
# which is deprecated in recent pandas releases.
X_insurance = X_insurance.groupby(by=data.id).ffill()

### Medical signs

In [9]:
# Medical sign columns taken from the raw data.
medical_signs = ['bmi', 'seatedbp1', 'seatedbp2']

X_medical_signs = data.loc[:, medical_signs]

Where possible, missing values are imputed by forward-filling the most recent previously observed value within the same `id` group.

In [10]:
# Forward-fill each medical sign column within each `id` group.
# `GroupBy.ffill()` is used instead of `GroupBy.fillna(method='ffill')`,
# which is deprecated in recent pandas releases.
X_medical_signs = X_medical_signs.groupby(by=data.id).ffill()

### Pregnancy

In [11]:
# Single pregnancy column; kept as a one-column frame and not forward-filled.
pregnancy = ['pregnant_current']

X_pregnancy = data.loc[:, pregnancy]

### RA assessment

In [12]:
# RA assessment columns taken from the raw data.
ra_assessment = ['pt_pain', 'pt_fatigues', 'cdai', 'das']

X_ra_assessment = data.loc[:, ra_assessment]

Where possible, missing values are imputed by forward-filling the most recent previously observed value within the same `id` group.

In [13]:
# Forward-fill each RA assessment column within each `id` group.
# `GroupBy.ffill()` is used instead of `GroupBy.fillna(method='ffill')`,
# which is deprecated in recent pandas releases.
X_ra_assessment = X_ra_assessment.groupby(by=data.id).ffill()

### RA biomarkers

In [14]:
# Each biomarker flag becomes a three-level categorical: the stringified
# booleans are relabelled and missing entries become 'Not tested'.
to_replace = {'False': 'Negative', 'True': 'Positive'}
mapper = {'ccppos': 'ccp', 'rfpos': 'rf'}

ccpos = (
    data['ccppos']
    .astype('string')
    .fillna('Not tested')
    .replace(to_replace)
    .astype('category')
)
rfpos = (
    data['rfpos']
    .astype('string')
    .fillna('Not tested')
    .replace(to_replace)
    .astype('category')
)

X_ra_biomarkers = pd.concat(
    [ccpos, rfpos],
    axis=1,
).rename(columns=mapper)

### Infections

Missing values are implicitly encoded as `False`.

In [15]:
infections = ['hospinf', 'ivinf']

# Flag a row when the row-wise sum over the infection columns is positive.
X_infections = (
    data[infections]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('infections')
)

### Comorbidities

Missing values are implicitly encoded as `False`.

In [16]:
comor_metabolic = ['comor_hld', 'comor_diabetes']

# Flag a row when the row-wise sum over the metabolic columns is positive.
X_comor_metabolic = (
    data[comor_metabolic]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('comor_metabolic')
)

In [17]:
# Columns pooled into the single cardiovascular comorbidity flag.
comor_cardiovascular = [
    'comor_htn_hosp',
    'comor_htn',
    'comor_revasc',
    'comor_ven_arrhythm',
    'comor_mi',
    'comor_acs',
    'comor_unstab_ang',
    'comor_cor_art_dis',
    'comor_chf_hosp',
    'comor_chf_nohosp',
    'comor_stroke',
    'comor_tia',
    'comor_card_arrest',
    'comor_oth_clot',
    'comor_pulm_emb',
    'comor_pef_art_dis',
    'comor_pat_event',
    'comor_urg_par',
    'comor_pi',
    'comor_carotid',
    'comor_other_cv',
]

# Flag a row when the row-wise sum over the cardiovascular columns is positive.
X_comor_cvd = (
    data[comor_cardiovascular]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('comor_cvd')
)

In [18]:
comor_respiratory = ['comor_copd', 'comor_asthma', 'comor_fib']

# Flag a row when the row-wise sum over the respiratory columns is positive.
X_comor_respiratory = (
    data[comor_respiratory]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('comor_respiratory')
)

In [19]:
comor_dil = ['comor_drug_ind_sle']

# Single source column, run through the same sum-and-threshold step as the
# other comorbidity groups so the result is a plain boolean series.
X_comor_dil = (
    data[comor_dil]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('comor_dil')
)

In [20]:
comor_cancer = [
    'comor_bc', 'comor_lc', 'comor_lymphoma',
    'comor_skin_cancer_squa', 'comor_skin_cancer_mel', 'comor_oth_cancer',
]

# Flag a row when the row-wise sum over the cancer columns is positive.
X_comor_cancer = (
    data[comor_cancer]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('comor_cancer')
)

In [21]:
comor_gi_liver = [
    'comor_ulcer',
    'comor_bowel_perf',
    'comor_hepatic_wbiop',
    'comor_hepatic_nobiop',
]

# Flag a row when the row-wise sum over the GI/liver columns is positive.
# The series name must not carry trailing whitespace: it becomes the column
# name in the combined feature frame.
X_comor_gi_liver = pd.Series(
    data[comor_gi_liver].sum(axis=1) > 0,
    name='comor_gi_liver',
    dtype=bool,
)

In [22]:
comor_musculoskeletal = [
    'sec_sjog',  # 0: No, 1: Yes, 2: New
    'jt_deform',  # 0: No, 1: Yes, 2: New
]

# Code 2 ("New") is collapsed onto 1 ("Yes") before the row-wise sum is
# thresholded into a boolean flag.
X_musculoskeletal = (
    data[comor_musculoskeletal]
    .replace(2.0, 1.0)
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('comor_musculoskeletal')
)

In [23]:
comor_other = [
    'comor_psoriasis',
    'comor_depression',
    'comor_fm',
    'comor_oth_neuro',
    'comor_hemorg_hosp',
    'comor_hemorg_nohosp',
    'comor_oth_cond',
]

# Flag a row when the row-wise sum over the remaining comorbidity columns is
# positive. The series name must not carry trailing whitespace: it becomes the
# column name in the combined feature frame.
X_comor_other = pd.Series(
    data[comor_other].sum(axis=1) > 0,
    name='comor_other',
    dtype=bool,
)

In [24]:
# Place the per-group comorbidity flags side by side, one column per group.
X_comor = pd.concat(
    objs=[
        X_comor_metabolic, X_comor_cvd, X_comor_respiratory, X_comor_dil,
        X_comor_cancer, X_comor_gi_liver, X_musculoskeletal, X_comor_other,
    ],
    axis='columns',
)

### Targeted adverse events

Missing values are implicitly encoded as `False`.

In [25]:
# Columns pooled into the single targeted-adverse-event flag.
# NOTE(review): the comment-only entries appear to be targeted events without
# a matching column in `data` — confirm.
targeted_adverse_events = [
    'comor_htn_hosp',  # hypertension (serious)
    'comor_revasc',  # CABG or angioplasty
    'comor_ven_arrhythm',  # ventricular arrhythmia
    'comor_mi',  # myocardial infarction
    'comor_acs',  # acute coronary syndrome
    'comor_unstab_ang',  # unstable angina
    'comor_chf_hosp',  # congestive heart failure (serious)
    'comor_stroke',  # stroke
    'comor_tia',  # transient ischemic attack
    # other cardiac condition (serious)
    'comor_oth_clot',  # deep vein thrombosis
    'comor_pulm_emb',  # pulmonary embolism
    'comor_pat_event',  # peripheral arterial thromboembolic event
    'comor_urg_par',  # urgent peripheral arterial revascularization
    'comor_pi',  # peripheral ischemia or gangrene
    # other vascular condition (serious)
    # COPD exacerbation (serious)
    # drug hypersensitivity reaction (severe)
    # drug hypersensitivity reaction (anaphylaxis)
    'comor_bc',  # breast cancer
    'comor_lc',  # lung cancer
    # colon cancer
    # uterine cancer
    # cervical cancer
    # prostate cancer
    # leukemia
    'comor_lymphoma',  # lymphoma
    # multiple myeloma
    'comor_skin_cancer_squa',  # non-melanoma skin cancer (basal or squamous cell)
    'comor_skin_cancer_mel',  # melanoma skin cancer
    'comor_oth_cancer',  # other malignancy
    'comor_bowel_perf',  # GI perforation
    'comor_hepatic_wbiop',  # hepatic event requiring biopsy (serious)
    # drug-induced liver injury (serious)
    # fracture (serious)
    # suicidal thoughts
    # self-injury
    # suicide attempt
    # demyelinating disease
    # other neurological disorder (serious)
    # chronic kidney disease (moderate/severe)
    # acute kidney disease (moderate/severe)
    'comor_hemorg_hosp',  # hemorrhage (serious)
]

# Flag a row when the row-wise sum over the listed event columns is positive.
X_taes = (
    data[targeted_adverse_events]
    .sum(axis=1)
    .gt(0)
    .astype(bool)
    .rename('taes')
)

### Patient-reported medical problems

In [26]:
# No patient-reported medical problem columns are currently selected: both
# candidates below are commented out, so the selection list is empty.
medical_problems = [
    #'med_probs_anxiety',
    #'med_probs_depression',
]

# Selecting with the empty list keeps this frame in the pipeline without
# contributing any columns.
X_medical_problems = data[medical_problems]

### Other features

In [27]:
other_features = ['duration_ra', 'smoker', 'drinker']

# The visit year is placed first, followed by the selected columns.
X_other = pd.concat(
    [
        data.visitdate.dt.year.rename('year'),
        data[other_features],
    ],
    axis=1,
)

Where possible, missing values are imputed by forward-filling the most recent previously observed value within the same `id` group.

In [28]:
# Forward-fill each of the remaining feature columns within each `id` group.
# `GroupBy.ffill()` is used instead of `GroupBy.fillna(method='ffill')`,
# which is deprecated in recent pandas releases.
X_other = X_other.groupby(by=data.id).ffill()

## Save data

In [29]:
# Assemble the feature matrix: visit date first, then every feature group in
# a fixed column order.
X = pd.concat(
    objs=[
        data.visitdate,
        X_demographics, X_other, X_insurance, X_medical_signs, X_pregnancy,
        X_ra_assessment, X_ra_biomarkers, X_infections, X_comor, X_taes,
        X_medical_problems,
    ],
    axis='columns',
)

In [30]:
# Append the `id` column as the last column after the features.
Xg = pd.concat(objs=[X, data['id']], axis='columns')

In [31]:
# Append the renamed therapy column as the last column.
Xgy = pd.concat(objs=[Xg, data['therapy']], axis='columns')

In [32]:
# Persist the combined frame (features, `id`, and therapy) next to the input
# data; any existing file at this path is overwritten.
Xgy.to_pickle('../data/ra_Xgy.pkl')

In [34]:
#Xgy.select_dtypes(include=['bool'])

Unnamed: 0,smoker,drinker,insurance_private,insurance_medicare,insurance_medicaid,insurance_none,pregnant_current,infections,comor_metabolic,comor_cvd,comor_respiratory,comor_dil,comor_cancer,comor_gi_liver,comor_musculoskeletal,comor_other,taes
0,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,True
1,,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
2,,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,True,True,False,False,False,,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294238,,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
294239,False,False,True,True,False,False,,False,True,True,False,False,False,False,False,False,True
294240,True,False,True,False,False,False,,False,False,False,True,False,False,True,True,False,False
294241,False,,True,True,False,False,,False,True,True,False,False,False,False,False,True,False
