# Make RA data

In [1]:
import pandas as pd

In [2]:
# Global switch read by the cells below: when True, the per-patient forward-fill
# cells run and `summarize` is called in its imputing mode.
impute = True

In [3]:
def summarize(X, name, impute=False, min_count=0):
    """Collapse several indicator columns into a single "any positive" flag.

    Parameters
    ----------
    X : pandas.DataFrame
        Columns to combine; every row is reduced with a row-wise sum.
    name : str
        Name assigned to the returned Series.
    impute : bool, default False
        If True, missing entries are skipped by the sum, so a row without any
        recorded value yields ``False``.
    min_count : int, default 0
        Only used when ``impute`` is False: the minimum number of non-missing
        entries a row needs for its sum to be defined.

    Returns
    -------
    pandas.Series
        Nullable ``boolean`` Series that is True where the row sum is > 0.
        Without imputation, rows whose sum is undefined are ``<NA>``.

    Raises
    ------
    AssertionError
        Without imputation, if some row is only partially missing and has
        fewer than ``min_count`` observed entries.
    """
    if impute:
        row_totals = X.sum(axis=1)
        return pd.Series(row_totals > 0, name=name, dtype='boolean')

    n_columns = X.shape[1]
    missing_per_row = X.isna().sum(axis=1)

    # Every row must be either entirely missing or sufficiently observed.
    entirely_missing = missing_per_row == n_columns
    enough_observed = missing_per_row <= (n_columns - min_count)
    assert (entirely_missing | enough_observed).all()

    row_totals = X.sum(axis=1, min_count=min_count)
    # Keep undefined sums as missing; turn the rest into "sum is positive".
    flags = row_totals.where(row_totals.isna(), row_totals > 0)
    flags.name = name

    return flags.astype('boolean')

## Load data

In [5]:
# Load the source table from which every feature group below is selected.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted location.
data = pd.read_pickle('../data/ra_data.pkl')

In [5]:
# IL-1 therapies are not kept as separate labels: both are folded into
# 'other therapies'. The replacement is applied to every column of `data`.
data.replace(
    to_replace=['il-1 mono', 'il-1 combo'],
    value='other therapies',
    inplace=True,
)

In [6]:
def rename_therapies(therapies):
    """Translate raw therapy labels into their display names.

    Parameters
    ----------
    therapies : iterable of str
        Raw therapy labels; each one must be a key of the mapping below.

    Returns
    -------
    pandas.Categorical
        Display names whose category order follows the mapping, with
        categories that do not occur in the input removed.

    Raises
    ------
    KeyError
        If a label is not present in the mapping.
    """
    label_pairs = (
        ('abatacept combo', 'Abatacept combo'),
        ('abatacept mono', 'Abatacept mono'),
        ('il-1 combo', 'IL-1Ri combo'),
        ('il-1 mono', 'IL-1Ri mono'),
        ('il-6 combo', 'IL-6Ri combo'),
        ('il-6 mono', 'IL-6Ri mono'),
        ('rituximab combo', 'Rituximab combo'),
        ('rituximab mono', 'Rituximab mono'),
        ('tnfi combo', 'TNFi combo'),
        ('tnfi mono', 'TNFi mono'),
        ('jaki combo', 'JAKi combo'),
        ('jaki mono', 'JAKi mono'),
        ('csdmard combo', 'csDMARD combo'),
        ('csdmard mono', 'csDMARD mono'),
        ('no dmard', 'No DMARD'),
        ('other therapies', 'Other'),
    )
    therapy_mapper = dict(label_pairs)

    # Plain indexing (not .get) so that an unknown label fails loudly.
    renamed = []
    for raw_label in therapies:
        renamed.append(therapy_mapper[raw_label])

    as_categorical = pd.Categorical(renamed, categories=therapy_mapper.values())

    return as_categorical.remove_unused_categories()

# Overwrite the therapy column with its display-name categorical.
data['therapy'] = rename_therapies(data['therapy'])

## Select features

### Demographics

In [7]:
# Demographic columns, taken over unchanged from `data`.
demographics = ['age', 'gender', 'race', 'final_education', 'work_status']

X_demographics = data.loc[:, demographics]

When `impute` is `True`, missing values are forward-filled within each patient (`id`) from previously observed values.

In [8]:
if impute:
    # Forward-fill within each patient (grouped by `data.id`). `.ffill()` replaces
    # the deprecated `GroupBy.fillna(method='ffill')` spelling.
    # NOTE(review): filling follows the current row order inside each group;
    # presumably rows are sorted by visit date. Confirm.
    X_demographics = X_demographics.groupby(by=data.id).ffill()

### Insurance

In [9]:
# Insurance indicator columns, taken over unchanged from `data`.
insurance = [
    'insurance_private', 'insurance_medicare',
    'insurance_medicaid', 'insurance_none',
]

X_insurance = data.loc[:, insurance]

When `impute` is `True`, missing values are forward-filled within each patient (`id`) from previously observed values.

In [10]:
if impute:
    # Forward-fill within each patient (grouped by `data.id`). `.ffill()` replaces
    # the deprecated `GroupBy.fillna(method='ffill')` spelling.
    # NOTE(review): filling follows the current row order inside each group;
    # presumably rows are sorted by visit date. Confirm.
    X_insurance = X_insurance.groupby(by=data.id).ffill()

### Medical signs

In [11]:
# Medical-sign columns, taken over unchanged from `data`.
medical_signs = ['bmi', 'seatedbp1', 'seatedbp2']

X_medical_signs = data.loc[:, medical_signs]

When `impute` is `True`, missing values are forward-filled within each patient (`id`) from previously observed values.

In [12]:
if impute:
    # Forward-fill within each patient (grouped by `data.id`). `.ffill()` replaces
    # the deprecated `GroupBy.fillna(method='ffill')` spelling.
    # NOTE(review): filling follows the current row order inside each group;
    # presumably rows are sorted by visit date. Confirm.
    X_medical_signs = X_medical_signs.groupby(by=data.id).ffill()

### Pregnancy

In [13]:
# Single pregnancy column; it is not forward-filled in this notebook.
pregnancy = ['pregnant_current']

X_pregnancy = data.loc[:, pregnancy]

### RA assessment

In [14]:
# RA assessment columns, taken over unchanged from `data`.
ra_assessment = ['pt_pain', 'pt_fatigues', 'cdai', 'das']

X_ra_assessment = data.loc[:, ra_assessment]

When `impute` is `True`, missing values are forward-filled within each patient (`id`) from previously observed values.

In [15]:
if impute:
    # Forward-fill within each patient (grouped by `data.id`). `.ffill()` replaces
    # the deprecated `GroupBy.fillna(method='ffill')` spelling.
    # NOTE(review): filling follows the current row order inside each group;
    # presumably rows are sorted by visit date. Confirm.
    X_ra_assessment = X_ra_assessment.groupby(by=data.id).ffill()

### RA biomarkers

In [16]:
# String-encode the two biomarker columns: 'True'/'False' become
# 'Positive'/'Negative' and missing entries become 'Not tested'.
to_replace = {'False': 'Negative', 'True': 'Positive'}

ccpos = (
    data['ccppos']
    .astype('string')
    .fillna('Not tested')
    .replace(to_replace)
    .astype('category')
)
rfpos = (
    data['rfpos']
    .astype('string')
    .fillna('Not tested')
    .replace(to_replace)
    .astype('category')
)

# Shorten the column names in the combined frame.
mapper = {'ccppos': 'ccp', 'rfpos': 'rf'}
X_ra_biomarkers = pd.concat([ccpos, rfpos], axis=1).rename(columns=mapper)

### Infections

When `impute` is `True`, missing values are implicitly encoded as `False`; otherwise, rows with no recorded values remain missing.

In [17]:
# Infection columns collapsed into one flag; without imputation every
# column must be observed for the flag to be defined.
infections = ['hospinf', 'ivinf']

X_infections = summarize(
    X=data.loc[:, infections],
    name='infections',
    min_count=len(infections),
    impute=impute,
)

### Comorbidities

When `impute` is `True`, missing values are implicitly encoded as `False`; otherwise, rows with no recorded values remain missing.

In [18]:
# Metabolic comorbidity columns collapsed into one flag.
comor_metabolic = ['comor_hld', 'comor_diabetes']

X_comor_metabolic = summarize(
    X=data.loc[:, comor_metabolic],
    name='comor_metabolic',
    min_count=len(comor_metabolic),
    impute=impute,
)

In [19]:
# Cardiovascular comorbidity columns collapsed into one flag.
comor_cardiovascular = [
    'comor_htn_hosp', 'comor_htn', 'comor_revasc',
    'comor_ven_arrhythm', 'comor_mi', 'comor_acs',
    'comor_unstab_ang', 'comor_cor_art_dis', 'comor_chf_hosp',
    'comor_chf_nohosp', 'comor_stroke', 'comor_tia',
    'comor_card_arrest', 'comor_oth_clot', 'comor_pulm_emb',
    'comor_pef_art_dis', 'comor_pat_event', 'comor_urg_par',
    'comor_pi', 'comor_carotid', 'comor_other_cv',
]

# NOTE(review): min_count=12 is a hard-coded threshold (the list has 21
# columns); its rationale is not visible here. Confirm.
X_comor_cvd = summarize(
    X=data.loc[:, comor_cardiovascular],
    name='comor_cvd',
    min_count=12,
    impute=impute,
)

In [20]:
# Respiratory comorbidity columns collapsed into one flag.
comor_respiratory = ['comor_copd', 'comor_asthma', 'comor_fib']

X_comor_respiratory = summarize(
    X=data.loc[:, comor_respiratory],
    name='comor_respiratory',
    min_count=len(comor_respiratory),
    impute=impute,
)

In [21]:
# Single-column group, still routed through `summarize` so that it gets the
# same boolean encoding as the other comorbidity flags.
comor_dil = ['comor_drug_ind_sle']

X_comor_dil = summarize(
    X=data.loc[:, comor_dil],
    name='comor_dil',
    min_count=len(comor_dil),
    impute=impute,
)

In [22]:
# Cancer comorbidity columns collapsed into one flag.
comor_cancer = [
    'comor_bc', 'comor_lc', 'comor_lymphoma',
    'comor_skin_cancer_squa', 'comor_skin_cancer_mel', 'comor_oth_cancer',
]

X_comor_cancer = summarize(
    X=data.loc[:, comor_cancer],
    name='comor_cancer',
    min_count=len(comor_cancer),
    impute=impute,
)

In [23]:
# GI / liver comorbidity columns collapsed into one flag.
comor_gi_liver = [
    'comor_ulcer', 'comor_bowel_perf',
    'comor_hepatic_wbiop', 'comor_hepatic_nobiop',
]

X_comor_gi_liver = summarize(
    X=data.loc[:, comor_gi_liver],
    name='comor_gi_liver',
    min_count=len(comor_gi_liver),
    impute=impute,
)

In [24]:
# Both columns are coded 0: No, 1: Yes, 2: New, so any value > 0 counts as
# present. Unlike most groups, a single observed column suffices (min_count=1).
comor_musculoskeletal = ['sec_sjog', 'jt_deform']

X_comor_musculoskeletal = summarize(
    X=data.loc[:, comor_musculoskeletal],
    name='comor_musculoskeletal',
    min_count=1,
    impute=impute,
)

In [25]:
# Remaining comorbidity columns collapsed into one flag.
comor_other = [
    'comor_psoriasis', 'comor_depression', 'comor_fm',
    'comor_oth_neuro', 'comor_hemorg_hosp', 'comor_hemorg_nohosp',
    'comor_oth_cond',
]

X_comor_other = summarize(
    X=data.loc[:, comor_other],
    name='comor_other',
    min_count=len(comor_other),
    impute=impute,
)

In [26]:
# Put the per-group comorbidity flags side by side, one column per group.
X_comor = pd.concat(
    objs=[
        X_comor_metabolic, X_comor_cvd, X_comor_respiratory, X_comor_dil,
        X_comor_cancer, X_comor_gi_liver, X_comor_musculoskeletal, X_comor_other,
    ],
    axis='columns',
)

### Targeted adverse events

When `impute` is `True`, missing values are implicitly encoded as `False`; otherwise, rows with no recorded values remain missing.

In [27]:
# Columns used for the targeted-adverse-event flag. Comment-only entries name
# events for which this list selects no column.
targeted_adverse_events = [
    'comor_htn_hosp',  # hypertension (serious)
    'comor_revasc',  # CABG or angioplasty
    'comor_ven_arrhythm',  # ventricular arrhythmia
    'comor_mi',  # myocardial infarction
    'comor_acs',  # acute coronary syndrome
    'comor_unstab_ang',  # unstable angina
    'comor_chf_hosp',  # congestive heart failure (serious)
    'comor_stroke',  # stroke
    'comor_tia',  # transient ischemic attack
    # other cardiac condition (serious)
    'comor_oth_clot',  # deep vein thrombosis
    'comor_pulm_emb',  # pulmonary embolism
    'comor_pat_event',  # peripheral arterial thromboembolic event
    'comor_urg_par',  # urgent peripheral arterial revascularization
    'comor_pi',  # peripheral ischemia or gangrene
    # other vascular condition (serious)
    # COPD exacerbation (serious)
    # drug hypersensitivity reaction (severe)
    # drug hypersensitivity reaction (anaphylaxis)
    'comor_bc',  # breast cancer
    'comor_lc',  # lung cancer
    # colon cancer
    # uterine cancer
    # cervical cancer
    # prostate cancer
    # leukemia
    'comor_lymphoma',  # lymphoma
    # multiple myeloma
    'comor_skin_cancer_squa',  # non-melanoma skin cancer (basal or squamous cell)
    'comor_skin_cancer_mel',  # melanoma skin cancer
    'comor_oth_cancer',  # other malignancy
    'comor_bowel_perf',  # GI perforation
    'comor_hepatic_wbiop',  # hepatic event requiring biopsy (serious)
    # drug-induced liver injury (serious)
    # fracture (serious)
    # suicidal thoughts
    # self-injury
    # suicide attempt
    # demyelinating disease
    # other neurological disorder (serious)
    # chronic kidney disease (moderate/severe)
    # acute kidney disease (moderate/severe)
    'comor_hemorg_hosp',  # hemorrhage (serious)
]

# NOTE(review): min_count=17 is a hard-coded threshold (the list has 23
# columns); its rationale is not visible here. Confirm.
X_taes = summarize(
    X=data.loc[:, targeted_adverse_events],
    name='taes',
    min_count=17,
    impute=impute,
)

### Patient-reported medical problems

In [28]:
# Currently no patient-reported problems are selected, so the resulting frame
# has no columns; the candidate columns are kept below for reference.
medical_problems = [
    #'med_probs_anxiety',
    #'med_probs_depression',
]

X_medical_problems = data.loc[:, medical_problems]

### Other features

In [29]:
# Remaining columns taken over unchanged from `data`.
other_features = ['duration_ra', 'smoker', 'drinker']

X_other = data.loc[:, other_features]

# Prepend the calendar year of the visit as the first column.
X_other.insert(loc=0, column='year', value=data['visitdate'].dt.year)

When `impute` is `True`, missing values are forward-filled within each patient (`id`) from previously observed values.

In [30]:
if impute:
    # Forward-fill within each patient (grouped by `data.id`). `.ffill()` replaces
    # the deprecated `GroupBy.fillna(method='ffill')` spelling.
    # NOTE(review): filling follows the current row order inside each group;
    # presumably rows are sorted by visit date. Confirm.
    X_other = X_other.groupby(by=data.id).ffill()

## Save data

In [32]:
# Assemble the feature matrix: the visit date followed by every feature group,
# placed side by side.
X = pd.concat(
    objs=[
        data['visitdate'],
        X_demographics,
        X_other,
        X_insurance,
        X_medical_signs,
        X_pregnancy,
        X_ra_assessment,
        X_ra_biomarkers,
        X_infections,
        X_comor,
        X_taes,
        X_medical_problems,
    ],
    axis='columns',
)

In [34]:
# Features plus the patient identifier column.
Xg = pd.concat(objs=[X, data['id']], axis='columns')

In [35]:
# Features and identifier plus the renamed therapy column.
Xgy = pd.concat(objs=[Xg, data['therapy']], axis='columns')

In [36]:
#patients_with_other = Xgy.loc[Xgy.therapy=='Other', 'id'].unique()
#Xgy = Xgy.loc[~Xgy.id.isin(patients_with_other)]
#Xgy.therapy = Xgy.therapy.cat.remove_unused_categories()

In [37]:
# Persist the assembled table next to the input data.
Xgy.to_pickle(path='../data/ra_Xgy_0403.pkl')

In [38]:
#Xgy.select_dtypes(include=['bool'])