# Make RA data

In [1]:
import os
import pandas as pd

## Load data

In [2]:
# Read the pickled RA dataset; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
data = pd.read_pickle(filepath_or_buffer='../data/ra_data.pkl')

In [3]:
# Fold both IL-1 regimens into the catch-all 'other therapies' label.
# The replacement is applied to every column of `data`, in place.
data.replace(
    to_replace=['il-1 mono', 'il-1 combo'],
    value='other therapies',
    inplace=True,
)

In [4]:
def rename_therapies(therapies):
    """Translate raw therapy codes into their display labels.

    Parameters
    ----------
    therapies : iterable of str
        Raw therapy codes. Every code must be a key of the lookup table
        defined in the function body.

    Returns
    -------
    pandas.Categorical
        Display labels in the same order as the input. The categories follow
        the order of the lookup table, restricted to labels that actually
        occur in the input.

    Raises
    ------
    KeyError
        If a code is not present in the lookup table.
    """
    # Insertion order of this dict defines the category order of the result.
    display_names = {
        'abatacept combo': 'Abatacept combo',
        'abatacept mono': 'Abatacept mono',
        'il-1 combo': 'IL-1Ri combo',
        'il-1 mono': 'IL-1Ri mono',
        'il-6 combo': 'IL-6Ri combo',
        'il-6 mono': 'IL-6Ri mono',
        'rituximab combo': 'Rituximab combo',
        'rituximab mono': 'Rituximab mono',
        'tnfi combo': 'TNFi combo',
        'tnfi mono': 'TNFi mono',
        'jaki combo': 'JAKi combo',
        'jaki mono': 'JAKi mono',
        'csdmard combo': 'csDMARD combo',
        'csdmard mono': 'csDMARD mono',
        'no dmard': 'No DMARD',
        'other therapies': 'Other',
    }

    # Plain dict indexing on purpose: an unknown code fails loudly with KeyError.
    labels = []
    for code in therapies:
        labels.append(display_names[code])

    labelled = pd.Categorical(labels, categories=list(display_names.values()))

    # Keep only the categories that are present in the data.
    return labelled.remove_unused_categories()

# Overwrite the raw therapy codes with their categorical display labels.
data['therapy'] = rename_therapies(data['therapy'])

## A: Continuous variables

In [5]:
# Columns copied verbatim from `data` into the continuous feature block.
continuous_variables = [
    'age', 'bmi', 'duration_ra',
    'tender_jts_28', 'swollen_jts_28',
    'cdai', 'das',
    'pt_pain', 'pt_fatigues',
]

# Select the columns, then prepend the calendar year of the visit date as the first column.
A = data.loc[:, continuous_variables]
A.insert(loc=0, column='year', value=data['visitdate'].dt.year)

## B: Categorical variables

In [6]:
# Columns copied verbatim from `data` into the categorical feature block.
categorical_variables = [
    'gender', 'race', 'final_education',
    'insurance_private', 'insurance_medicare', 'insurance_medicaid', 'insurance_none',
    'smoker', 'work_status',
    'rfpos', 'ccppos',
    'comor_hld', 'comor_diabetes',
    'med_probs_anxiety', 'med_probs_depression', 'med_probs_inf_hosp',
    'am_stiffness',
]
B = data.loc[:, categorical_variables]

## C: Medical history

In [7]:
def _any_positive(frame, columns, name):
    """Return a bool Series called `name`, True where the row sum over `columns` of `frame` exceeds zero."""
    # NOTE(review): with pandas' default skipna, missing entries add nothing to
    # the sum, so a row with no recorded values comes out as False -- confirm intended.
    row_totals = frame[columns].sum(axis=1)
    return pd.Series(row_totals > 0, name=name, dtype=bool)


# History columns that are collapsed into the single 'hxcvd' indicator.
cvd = [
    'hxcor_art_dis', 'hxstroke', 'hxtia', 'hxcarotid',
    'hxpef_art_dis', 'hxoth_clot', 'hxpulm_emb', 'hxmi',
]

# History columns that are collapsed into the single 'hxcancer' indicator.
cancer = ['hxbc', 'hxlc', 'hxlymphoma', 'hxskin_cancer_mel', 'hxoth_cancer']

# History columns that are collapsed into the single 'hxhtn' indicator.
htn = ['hxhtn_hosp', 'hxhtn']

hxcvd = _any_positive(data, cvd, 'hxcvd')
hxcancer = _any_positive(data, cancer, 'hxcancer')
hxhtn = _any_positive(data, htn, 'hxhtn')

# One column per collapsed indicator.
C = pd.concat([hxcvd, hxcancer, hxhtn], axis=1)

## D: Treatment history

In [8]:
# Treatment-history columns taken from `data`.
hxtherapy = [
    'hxcsdmard', 'hxtnfi', 'hxabatacept',
    'hxrituximab', 'hxil-6', 'hxjaki',
]

# Cast to pandas' nullable 'boolean' dtype.
D = data.loc[:, hxtherapy].astype('boolean')

## Save data

In [9]:
# Assemble the feature blocks A-D side by side, plus the visit date as the last column.
X = pd.concat(objs=[A, B, C, D, data['visitdate']], axis='columns')

In [10]:
# Forward-fill missing values separately within each `data.id` group, so a gap
# is only filled from an earlier row that has the same id.
# `GroupBy.ffill()` replaces `GroupBy.fillna(method='ffill')`, which is
# deprecated since pandas 2.1.
# NOTE(review): forward-filling assumes the rows of each id are already in
# chronological order -- confirm, no sort by visit date happens in this notebook.
X = X.groupby(by=data['id'], sort=False).ffill()

In [11]:
# Append the id column of `data` as the last column.
Xg = pd.concat(objs=[X, data['id']], axis='columns')

In [12]:
# Append the therapy column of `data` as the last column.
Xgy = pd.concat(objs=[Xg, data['therapy']], axis='columns')

In [13]:
# Persist the assembled table; the path is relative to the notebook's working directory.
Xgy.to_pickle(path='../data/ra_Xgy.pkl')