In [1]:
import os

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import tableone
import warnings

In [2]:
# Let wide clinical tables render with up to 200 columns instead of being truncated.
pd.set_option("display.max_columns", 200)

In [3]:
# Read the path registry CSV (indexed by its first column) and take the entry
# labelled "clinical_data" from its `value` column.
# NOTE(review): the registry location is an absolute, machine-specific path.
path_registry = pd.read_csv("/Users/mxposed/Documents/data_paths/ssc.csv", index_col=0)
data_path = path_registry["value"]["clinical_data"]

In [4]:
# Point data_path at its "processed" subfolder. Re-running this cell appends
# "processed" once more, so execute it only once per kernel session.
data_path = os.path.join(data_path, "processed")

# Create Table 1 for the cohort

The goal of this notebook is to create clinical Table 1 for our cohort.

We will combine all processed data tables

## 1. Load all processed data

In [5]:
# Sample sheet listing all samples; filtered by its Study column in the next step.
samples = pd.read_csv("../00all-samples.csv")

Take only our samples

In [6]:
# Keep only rows whose Study is "this"; copy so later column edits hit an independent frame.
samples = samples[samples["Study"] == "this"].copy()

In [7]:
# Start the patient id as a copy of the sample name.
samples["Patient_id"] = samples["Sample"]

Strip `_followup` to get Redcap patient ids

In [8]:
# Drop a trailing "_followup" so baseline and follow-up samples share one patient id.
samples["Patient_id"] = samples["Patient_id"].str.replace("_followup$", "", regex=True)

Number of unique patients among our samples

In [9]:
# Count distinct patient ids among the retained samples.
samples["Patient_id"].nunique()

14

In [10]:
# Processed table 01_pt_info.csv; the first CSV column becomes the index.
pt_info = pd.read_csv(os.path.join(data_path, "01_pt_info.csv"), index_col=0)

In [11]:
# Processed table 02_disease_info.csv; the first CSV column becomes the index.
disease_info = pd.read_csv(os.path.join(data_path, "02_disease_info.csv"), index_col=0)

In [12]:
# Processed table 03_clin_info.csv; the first CSV column becomes the index.
clin_info = pd.read_csv(os.path.join(data_path, "03_clin_info.csv"), index_col=0)

In [13]:
# Processed table 04_meds_info.csv; the first CSV column becomes the index.
meds_info = pd.read_csv(os.path.join(data_path, "04_meds_info.csv"), index_col=0)

In [14]:
# Processed table 05_date_info.csv; the first CSV column becomes the index.
# Dates are not parsed here, so the `date` column keeps whatever type read_csv infers.
date_info = pd.read_csv(os.path.join(data_path, "05_date_info.csv"), index_col=0)

## 2. Determine day-0 BAL per patient

In [15]:
# Select the bronchoscopy date events as an independent frame.
bronchs = date_info[date_info["event"] == "bronch_date"].copy()

In [16]:
# Sample names start out equal to the study code.
bronchs["Sample"] = bronchs["study_code"]

In [17]:
# Follow-up events carry the "_followup" suffix in their sample names.
# Re-running this cell would append the suffix a second time.
is_followup = bronchs["redcap_event_name"] == "follow_up_arm_1"
bronchs.loc[is_followup, "Sample"] = bronchs.loc[is_followup, "Sample"] + "_followup"

In [18]:
# Inner join on Sample: only bronchoscopies that match one of our samples remain.
bronchs = pd.merge(bronchs, samples, on="Sample")

In [19]:
# Order by date so the first row per patient is the earliest bronchoscopy.
bronchs = bronchs.sort_values("date")

In [21]:
# First (earliest, given the preceding sort) bronchoscopy row per patient,
# reduced to a Series of dates indexed by study_code.
first_bronch = bronchs.groupby("study_code").head(1)
day0_dates = first_bronch[["study_code", "date"]].set_index("study_code")["date"]

## 3. Determine non-Raynaud symptom onset

In [22]:
# Select the non-Raynaud symptom date events as an independent frame.
non_raynaud = date_info[date_info["event"] == "ssc_nonraynaud_date"].copy()

In [23]:
# Order by date so the first row per patient is the earliest record.
non_raynaud = non_raynaud.sort_values("date")

In [24]:
# Earliest non-Raynaud date per patient as a Series indexed by study_code.
first_non_raynaud = non_raynaud.groupby("study_code").head(1)
non_raynaud_dates = first_non_raynaud[["study_code", "date"]].set_index("study_code")["date"]

In [25]:
# Select the ILD diagnosis date events as an independent frame.
ild_diag = date_info[date_info["event"] == "ild_diag_date"].copy()

In [26]:
# Order ILD diagnosis events by date.
ild_diag = ild_diag.sort_values("date")

In [27]:
# ILD diagnosis date per patient as a Series indexed by study_code.
# Keep only the first (earliest, given the preceding sort) row per patient, as is
# done for day0_dates and non_raynaud_dates: a duplicated study_code in the index
# would make the later label lookup return more values than there are rows.
ild_diag_dates = ild_diag.groupby('study_code').head(1).set_index('study_code').date

## 4. Take Clinical values closest to BAL

This step may need manual curation, because the measurement closest to the BAL date can still be far from it in time

In [28]:
# Attach each clinical row's day-0 BAL date by looking its study_code up in day0_dates.
# Series.map leaves a missing value (NaT after conversion) for patients without a
# day-0 BAL, whereas indexing day0_dates with a list of labels raises KeyError as
# soon as one label is absent.
clin_info["bal_date"] = pd.to_datetime(clin_info["study_code"].map(day0_dates).values)

In [29]:
# Parse every measurement date and store its absolute distance, in days, from the BAL date.
measurement_date_cols = ["crp_date", "pft_date", "monocyte_date", "tte_date", "6mwd_date"]
for date_col in measurement_date_cols:
    measured_on = pd.to_datetime(clin_info[date_col])
    clin_info[date_col] = measured_on
    clin_info[f"{date_col}_delta"] = (clin_info["bal_date"] - measured_on).dt.days.abs()

In [30]:
# Rows with dlco_ratio >= 0.85 get a dlco_delta (absolute days between BAL and PFT).
dlco_ok = clin_info["dlco_ratio"] >= 0.85
days_from_bal = clin_info.loc[dlco_ok, "bal_date"] - clin_info.loc[dlco_ok, "pft_date"]
clin_info.loc[dlco_ok, "dlco_delta"] = days_from_bal.dt.days.abs()
# All other rows (a missing ratio also compares as False) have dlco_pred blanked.
# NOTE(review): dlco_delta is not read by any later cell visible here; the
# closest-value selection orders PFT readings by pft_date_delta. Confirm this is intended.
clin_info.loc[~dlco_ok, "dlco_pred"] = np.nan

All deltas in days

In [None]:
# Show patient, event and every column whose name contains "delta".
delta_cols = [c for c in clin_info.columns if "delta" in c]
clin_info[["study_code", "redcap_event_name", *delta_cols]]

In [32]:
# Empty frame with one row per entry of pt_info.study_code; columns are added below.
clin_condensed = pd.DataFrame(index=pt_info.study_code)

In [33]:
# Each measurement-date column mapped to the readings taken on that date.
values_by_date_col = {
    "crp_date": ["crp"],
    "pft_date": ["fvc_pred", "fev1_pred", "tlc_pred", "fev1_fvc", "dlco_pred"],
    "monocyte_date": ["monocyte_absolute", "monocyte_percentage"],
    "tte_date": ["echo_rvsp"],
    "6mwd_date": ["6mwd_act"],
}
for date_col, value_cols in values_by_date_col.items():
    # Per patient, keep the row with the smallest distance to the BAL date
    # (rows with a missing distance sort last).
    closest = (
        clin_info
        .sort_values(f"{date_col}_delta")
        .groupby("study_code")
        .head(1)
        .set_index("study_code")
    )
    # Column-by-column assignment aligns on the study_code index.
    for value_col in value_cols:
        clin_condensed[value_col] = closest[value_col]

In [34]:
# Processed table 06_duke_pft.csv, re-indexed by its Sample column.
# NOTE(review): this index is later used to address rows of clin_condensed, which is
# indexed by study_code — presumably Sample holds the same identifiers; confirm.
duke_pft = pd.read_csv(os.path.join(data_path, "06_duke_pft.csv"), index_col=0).set_index('Sample')

In [36]:
# Write every duke_pft column into the clin_condensed rows labelled by duke_pft's index.
duke_rows = duke_pft.index
for pft_col in duke_pft.columns:
    clin_condensed.loc[duke_rows, pft_col] = duke_pft[pft_col]

Add Kazerooni scores because they are per-patient, without specific dates

In [37]:
# Distinct (study_code, k_score_ggo, k_score_fib) combinations, left-joined onto
# clin_condensed by matching its index against study_code.
whole_lung_scores = clin_info[['study_code', 'k_score_ggo', 'k_score_fib']].drop_duplicates()
clin_condensed_2 = pd.merge(
    clin_condensed,
    whole_lung_scores,
    how='left',
    left_index=True,
    right_on='study_code',
)

The merge does not preserve the index, but the rows and their order are unchanged, so we simply copy the index over from `clin_condensed`

In [38]:
# Restore the study_code index from clin_condensed. pandas raises if the lengths
# differ, i.e. if the merge produced extra rows.
clin_condensed_2.index = clin_condensed.index.copy()

In [39]:
# The merge key column is redundant now that study_code is the index again.
clin_condensed_2 = clin_condensed_2.drop(columns='study_code')

Add Kazerooni scores for bronched lobe

In [40]:
# Distinct, fully populated (study_code, lobe_ggo_score, lobe_fib_score) rows,
# left-joined onto clin_condensed_2 by matching its index against study_code.
lavaged_lobe_scores = (
    clin_info[['study_code', 'lobe_ggo_score', 'lobe_fib_score']]
    .dropna()
    .drop_duplicates()
)
clin_condensed_2 = pd.merge(
    clin_condensed_2,
    lavaged_lobe_scores,
    how='left',
    left_index=True,
    right_on='study_code',
)

In [41]:
# Restore the study_code index from clin_condensed. pandas raises if the lengths
# differ, i.e. if the merge produced extra rows.
clin_condensed_2.index = clin_condensed.index.copy()

In [42]:
# The merge key column is redundant now that study_code is the index again.
clin_condensed_2 = clin_condensed_2.drop(columns='study_code')

In [None]:
# Add the cv_med_mmf_v2 flag from meds_info per study_code and expose it as patient_on_mmf.
mmf_flags = meds_info.loc[:, ['study_code', 'cv_med_mmf_v2']]
with_mmf = clin_condensed_2.reset_index().merge(mmf_flags, on='study_code', how='left')
clin_condensed_2 = (
    with_mmf
    .set_index('study_code')
    .rename(columns={'cv_med_mmf_v2': 'patient_on_mmf'})
)

In [44]:
# Persist the condensed per-patient clinical values one directory up.
clin_condensed_2.to_csv("../00clinical-v2.csv")

## 5. Create Table 1

In [45]:
# Work on a copy so pt_info itself stays untouched.
data = pt_info.copy()

In [46]:
# Placeholder column: it first receives a date built from year_of_birth and is
# later overwritten with the age in years.
data["age_at_bal"] = pd.NaT

In [47]:
# Where a birth year is known, store January 1st of that year as the birth date.
has_birth_year = data["year_of_birth"].notna()
birth_year = data.loc[has_birth_year, "year_of_birth"].astype(int).astype(str)
data.loc[has_birth_year, "age_at_bal"] = birth_year + '-01-01'

In [48]:
# Fill bal_date only for patients that have a day-0 BAL date.
has_day0 = data["study_code"].isin(day0_dates.index)
day0_for_rows = day0_dates[data.loc[has_day0, "study_code"]].values
data.loc[has_day0, "bal_date"] = pd.to_datetime(day0_for_rows)

In [49]:
# Days between the stored birth date and the BAL date, expressed in 365-day years.
data["age_at_bal"] = (data["bal_date"] - data["age_at_bal"]).dt.days / 365

In [50]:
# A recorded age, where present, overrides the value derived from the birth year.
has_recorded_age = data["age"].notna()
data.loc[has_recorded_age, "age_at_bal"] = data.loc[has_recorded_age, "age"]

In [51]:
# Temporarily store the non-Raynaud date in disease_duration for patients that have one.
has_onset = data["study_code"].isin(non_raynaud_dates.index)
onset_for_rows = non_raynaud_dates[data.loc[has_onset, "study_code"]].values
data.loc[has_onset, "disease_duration"] = pd.to_datetime(onset_for_rows)

In [52]:
# Convert the stored date into the time elapsed until BAL, in 365-day years.
data["disease_duration"] = (data["bal_date"] - data["disease_duration"]).dt.days / 365

In [53]:
# Temporarily store the ILD diagnosis date in ild_disease_duration for patients that have one.
has_ild_date = data["study_code"].isin(ild_diag_dates.index)
ild_for_rows = ild_diag_dates[data.loc[has_ild_date, "study_code"]].values
data.loc[has_ild_date, "ild_disease_duration"] = pd.to_datetime(ild_for_rows)

In [54]:
# Convert the stored date into the time elapsed until BAL, in 365-day years.
data["ild_disease_duration"] = (data["bal_date"] - data["ild_disease_duration"]).dt.days / 365

In [55]:
# (rows, columns) before the disease information is merged in.
data.shape

(22, 14)

In [56]:
# Inner join with the disease information on study_code.
data = pd.merge(data, disease_info, on="study_code")

In [57]:
# Turn the boolean smoker flag into the labels shown in the table.
smoker_labels = {True: "current or former", False: "never"}
data["smoker"] = data["smoker"].replace(smoker_labels)

In [58]:
# ab_any: "positive" when any of the four antibody columns equals "Positive",
# "negative" otherwise, and missing for controls.
# The labels are built first and controls are blanked with Series.mask, so no NaN
# is written in place into a boolean column (an incompatible-dtype assignment
# that newer pandas versions warn about or reject).
pos = "Positive"
antibody_cols = ["ab_scl70", "ab_aca", "ab_rnaiii", "ab_ana"]
any_positive = data[antibody_cols].eq(pos).any(axis=1)
ab_any_labels = any_positive.map({True: "positive", False: "negative"})
data["ab_any"] = ab_any_labels.mask(data["disease"].eq("control"))

In [59]:
# Treat "Unknown" antibody results as missing and lower-case the remaining labels.
for ab_col in ("ab_scl70", "ab_aca", "ab_rnaiii", "ab_ana"):
    known_results = data[ab_col].replace({"Unknown": np.nan})
    data[ab_col] = known_results.str.lower()

In [60]:
# Left join: patients without a medication record are kept, with missing values.
data = pd.merge(data, meds_info, how="left", on="study_code")

Remove `Prednisone` because its dose is just 1 mg/day

In [61]:
# On medication when either the cv_med_mmf_v2 or the cv_med_rtx_v2 flag is set.
data["any_meds"] = data["cv_med_mmf_v2"] | data["cv_med_rtx_v2"]

In [62]:
# Medication status does not apply to controls: blank it for them.
# Series.mask builds a new column, so no NaN is written in place into a possibly
# boolean column (an incompatible-dtype assignment that newer pandas versions
# warn about or reject).
data["any_meds"] = data["any_meds"].mask(data["disease"].eq("control"))

In [63]:
# Turn the boolean flag into the labels shown in the table.
any_meds_labels = {True: "taking", False: "not taking"}
data["any_meds"] = data["any_meds"].replace(any_meds_labels)

In [64]:
# Apply the same taking / not taking labels to each individual medication flag.
med_labels = {True: "taking", False: "not taking"}
for med_col in ("cv_med_mmf_v2", "cv_med_rtx_v2"):
    data[med_col] = data[med_col].replace(med_labels)

In [65]:
# Inner-join the condensed clinical values, cast to float, on study_code
# (the index of clin_condensed_2).
data = data.merge(clin_condensed_2.astype(float), on="study_code")

In [66]:
# Copy mrss into a subtype-specific column; rows of the other subtype stay missing.
for subtype, target_col in (('lcSSc', 'mrss_lcssc'), ('dcSSc', 'mrss_dcssc')):
    of_subtype = data["ssc_subtype"].eq(subtype)
    data.loc[of_subtype, target_col] = data.loc[of_subtype, "mrss"]

In [67]:
# 0/1 flag to table labels; the "negative" level is filtered out of the table later.
cough_labels = {0: 'negative', 1: 'Cough'}
data["cough"] = data["cough"].replace(cough_labels)

In [68]:
# 0/1 flag to table labels; the "negative" level is filtered out of the table later.
dyspnea_labels = {0: 'negative', 1: 'Dyspnea'}
data["dyspnea"] = data["dyspnea"].replace(dyspnea_labels)

In [None]:
# Variables shown in Table 1, in display order.
columns = [
    "disease", "age_at_bal", "sex", "race", "smoker",
    "ssc_subtype", "disease_duration", 'ild_disease_duration', 'cough', 'dyspnea',
    "ab_ana", "ab_scl70", "ab_aca", "ab_rnaiii", "crp",
    "monocyte_percentage", "mrss", "mrss_lcssc", "mrss_dcssc", "acr_ulc_pit",
    "any_meds", "cv_med_mmf_v2", "cv_med_rtx_v2",
    "fvc_pred", "fev1_pred", "fev1_fvc", "tlc_pred",
    "dlco_pred", "echo_rvsp", "6mwd_act",
    'k_score_ggo', 'k_score_fib', 'lobe_ggo_score', 'lobe_fib_score'
]
# Continuous variables summarised as median [Q1,Q3].
nonnormal = [
    "age_at_bal", "disease_duration", 'ild_disease_duration',
    "crp", "monocyte_percentage", "mrss", "mrss_lcssc", "mrss_dcssc",
    "fvc_pred", "fev1_pred", "fev1_fvc", "tlc_pred",
    "dlco_pred", "echo_rvsp", "6mwd_act",
    'k_score_ggo', 'k_score_fib', 'lobe_ggo_score', 'lobe_fib_score'
]
# Display names. mrss_lcssc / mrss_dcssc are deliberately left out: the curation
# cells further down look those rows up by their raw names.
renames = {
    "disease": "Patient disease",
    "age_at_bal": "Age, years",
    "sex": "Sex",
    "race": "Race",
    "smoker": "Smoker",
    "ssc_subtype": "SSc subtype",
    "disease_duration": "Time since SSc diagnosis, years",
    "ild_disease_duration": "Time since SSc-ILD diagnosis, years",
    # cough opens the "Symptoms" group; the Dyspnea header is blanked during curation.
    'cough': 'Symptoms',
    'dyspnea': 'Dyspnea',
    # "ab_any": "SSc-specific autoantibodies",
    "ab_ana": "Anti-nuclear",
    "ab_scl70": "Anti-topoisomerase I (Scl-70)",
    "ab_aca": "Anti-centromere",
    "ab_rnaiii": "Anti-RNA polymerase III",
    "crp": "C-reactive protein, mg/l",
    "monocyte_percentage": "Blood monocyte %",
    "mrss": "Modified Rodnan skin score",
    "acr_ulc_pit": "Digital ulcer or pitting scars",
    "any_meds": "Medications",
    "cv_med_mmf_v2": "Mycophenolate",
    # "cv_med_pred_v2": "Prednisone",
    "cv_med_rtx_v2": "Rituximab",
    "fvc_pred": "FVC % predicted",
    "fev1_pred": "FEV1 % predicted",
    "fev1_fvc": "FEV1/FVC ratio, %",
    "tlc_pred": "TLC % predicted",
    "dlco_pred": "DLCO % predicted",
    "echo_rvsp": "Estimated RVSP, mmHg",
    "6mwd_act": "6MWD distance, m",
    'k_score_ggo': 'Average Ground-glass opacity score per lobe',
    'k_score_fib': 'Average Fibrosis score per lobe',
    'lobe_ggo_score': 'Ground-glass opacity score',
    'lobe_fib_score': 'Fibrosis score'
}
# Decimal places per variable. Keys must match entries of `columns`; the former
# "6mwd_pct" and 'k_score_gg' keys matched no column and were never applied.
decimals = {
    "age_at_bal": 1,
    "sex": 0,
    "race": 0,
    "smoker": 0,
    "ssc_subtype": 0,
    "disease_duration": 1,
    "ild_disease_duration": 1,
    'cough': 0,
    'dyspnea': 0,
    # "ab_any": 0,
    "ab_ana": 0,
    "ab_scl70": 0,
    "ab_aca": 0,
    "ab_rnaiii": 0,
    "crp": 1,
    "monocyte_percentage": 1,
    "mrss": 0,
    "mrss_lcssc": 0,
    "mrss_dcssc": 0,
    "acr_ulc_pit": 0,
    "any_meds": 0,
    "cv_med_mmf_v2": 0,
    # "cv_med_pred_v2": 0,
    "cv_med_rtx_v2": 0,
    "fvc_pred": 1,
    "fev1_pred": 1,
    "fev1_fvc": 1,
    "tlc_pred": 1,
    "dlco_pred": 1,
    "echo_rvsp": 1,
    "6mwd_act": 1,
    'k_score_ggo': 1,
    'k_score_fib': 1,
    'lobe_ggo_score': 1,
    'lobe_fib_score': 1,
}
# Every warning raised while the table is built is suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    table1 = tableone.TableOne(
        data,
        columns=columns,
        nonnormal=nonnormal,
        groupby=["disease"],
        missing=False,
        overall=False,
        decimals=decimals,
        # limit=1,
        rename=renames
    )

Heavy manual curation of Table One output below

In [70]:
# Summary cells that consist only of NaNs are shown as empty strings.
all_nan_cells = ["nan (nan)", "nan [nan,nan]"]
table1.tableone = table1.tableone.replace(all_nan_cells, "")

In [71]:
# Move the row index into ordinary columns (referred to as level_0 / level_1 below).
table1.tableone = table1.tableone.reset_index()

Add number of datapoints where it is not full

In [72]:
# For every (variable, disease group) pair with missing values, append the number of
# available observations to the matching non-empty table cells and report the gap.
for var, name in renames.items():
    for disease in data.disease.unique():
        group_values = data.loc[data.disease.eq(disease), var]
        n_na = group_values.isna().sum()
        n_not_na = group_values.notna().sum()
        if n_na == 0:
            continue
        # Table rows belonging to this variable start with its display name.
        idx = table1.tableone.level_0.str.startswith(name)
        col = ('Grouped by Patient disease', disease)
        val = table1.tableone.loc[idx, col].values[0]
        if len(val) == 0:
            continue
        n_suffix = f'\\newline(\\textit{{n}} = {n_not_na})'
        table1.tableone.loc[idx, col] = table1.tableone.loc[idx, col] + n_suffix
        print(f'{n_na} NA values for {var} for {disease}')

1 NA values for ab_rnaiii for SSc
1 NA values for crp for SSc
5 NA values for fvc_pred for control
5 NA values for fev1_pred for control
1 NA values for tlc_pred for SSc
2 NA values for dlco_pred for SSc
1 NA values for echo_rvsp for SSc
2 NA values for 6mwd_act for SSc


In [73]:
# Inspect the table before the manual relabelling steps.
table1.tableone

Unnamed: 0_level_0,level_0,level_1,Grouped by Patient disease,Grouped by Patient disease
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,SSc,control
0,n,,9,13
1,"Age, years, median [Q1,Q3]",,"53.8 [37.9,60.4]","29.0 [24.0,34.0]"
2,"Sex, n (%)",Female,6 (67),9 (69)
3,"Sex, n (%)",Male,3 (33),4 (31)
4,"Race, n (%)",African American,1 (11),2 (15)
5,"Race, n (%)",Caucasian,5 (56),8 (62)
6,"Race, n (%)",Hispanic or Latino,3 (33),2 (15)
7,"Race, n (%)",Asian,,1 (8)
8,"Smoker, n (%)",current or former,1 (11),
9,"Smoker, n (%)",never,8 (89),13 (100)


In [74]:
# Present the anti-centromere row as a "positive" row with zero SSc patients.
aca_rows = table1.tableone.level_0.eq("Anti-centromere, n (%)")
table1.tableone.loc[aca_rows, "level_1"] = "positive"
table1.tableone.loc[aca_rows, ("Grouped by Patient disease", "SSc")] = "0 (0)"

In [75]:
# Only the affirmative level of each yes/no variable stays in the table.
hidden_levels = ["negative", "not taking", "no digital ulcers or pitting scars"]
rows_to_keep = ~table1.tableone.level_1.isin(hidden_levels)
table1.tableone = table1.tableone.loc[rows_to_keep].copy()

In [76]:
# Clear the level label on any existing "SSc-specific autoantibodies" row.
autoantibody_rows = table1.tableone.level_0.eq("SSc-specific autoantibodies, n (%)")
table1.tableone.loc[autoantibody_rows, "level_1"] = ""

In [77]:
# Rows at level "positive" become sub-rows of one autoantibody group:
# the variable name moves to level_1 and level_0 takes the shared group header.
positive_rows = table1.tableone.level_1.eq("positive")
table1.tableone.loc[positive_rows, "level_1"] = table1.tableone.level_0[positive_rows].values
table1.tableone.loc[positive_rows, "level_0"] = "SSc-specific autoantibodies, n (%)"

In [78]:
# Clear the level label on the overall "Medications" row.
medication_rows = table1.tableone.level_0.eq("Medications, n (%)")
table1.tableone.loc[medication_rows, "level_1"] = ""

In [79]:
# Rows at level "taking" become sub-rows of the Medications group:
# the drug name moves to level_1 and level_0 takes the shared group header.
taking_rows = table1.tableone.level_1.eq("taking")
table1.tableone.loc[taking_rows, "level_1"] = table1.tableone.level_0[taking_rows].values
table1.tableone.loc[taking_rows, "level_0"] = "Medications, n (%)"

In [80]:
# Show an em dash in empty control cells.
# Assign the result back: calling replace(..., inplace=True) on a selected column
# may act on a temporary object and leave the table unchanged.
control_col = ("Grouped by Patient disease", "control")
table1.tableone[control_col] = table1.tableone[control_col].replace("", "—")

In [81]:
# Show an em dash in empty SSc cells.
# Assign the result back: calling replace(..., inplace=True) on a selected column
# may act on a temporary object and leave the table unchanged.
ssc_col = ("Grouped by Patient disease", "SSc")
table1.tableone[ssc_col] = table1.tableone[ssc_col].replace("", "—")

In [82]:
# Present the subtype-specific skin scores as sub-rows of the overall score:
# label them in level_1 first, then blank their raw level_0 name.
subtype_rows = (
    ("mrss_lcssc, median [Q1,Q3]", "lcSSc"),
    ("mrss_dcssc, median [Q1,Q3]", "dcSSc"),
)
for raw_label, subtype_label in subtype_rows:
    matching_rows = table1.tableone.level_0.eq(raw_label)
    table1.tableone.loc[matching_rows, "level_1"] = subtype_label
    table1.tableone.loc[matching_rows, "level_0"] = ""
# The overall score row is labelled "total".
overall_rows = table1.tableone.level_0.eq("Modified Rodnan skin score, median [Q1,Q3]")
table1.tableone.loc[overall_rows, "level_1"] = "total"

In [83]:
# Rows whose header contains "%"; the next cell reads this mask.
idx = table1.tableone.level_0.str.contains("%")

In [84]:
# In the selected rows, turn "(NN)" into "(NN%)" in every column after the two label columns.
value_cols = table1.tableone.columns[2:]
for col in value_cols:
    selected_cells = table1.tableone.loc[idx, col]
    table1.tableone.loc[idx, col] = selected_cells.str.replace(r"\((\d+)\)", r"(\1%)", regex=True)

In [85]:
# Remove the literal " (%)" from the two label columns.
label_cols = table1.tableone.columns[:2]
for col in label_cols:
    without_percent = table1.tableone[col].str.replace(" (%)", "", regex=False)
    table1.tableone[col] = without_percent

In [86]:
# A header reduced to just ", n" is blanked.
bare_header_rows = table1.tableone.level_0.eq(', n')
table1.tableone.loc[bare_header_rows, 'level_0'] = ''

In [87]:
# Group the two "Average ..." rows under one header: copy their full names to
# level_1, then retitle the first row and blank the second in level_0.
average_rows = table1.tableone.level_0.str.startswith('Average')
table1.tableone.loc[average_rows, 'level_1'] = table1.tableone.level_0[average_rows].values
average_headers = {
    'Average Ground-glass opacity score per lobe, median [Q1,Q3]': 'Average HRCT score per lobe, median [Q1,Q3]',
    'Average Fibrosis score per lobe, median [Q1,Q3]': '',
}
table1.tableone.level_0 = table1.tableone.level_0.replace(average_headers)

In [88]:
# Group the two lavaged-lobe score rows under one header with short sub-row labels.
lobe_ggo_rows = table1.tableone.level_0.eq('Ground-glass opacity score, median [Q1,Q3]')
table1.tableone.loc[lobe_ggo_rows, 'level_1'] = 'Ground glass opacity'
lobe_fib_rows = table1.tableone.level_0.eq('Fibrosis score, median [Q1,Q3]')
table1.tableone.loc[lobe_fib_rows, 'level_1'] = 'Fibrosis'
lobe_headers = {
    'Ground-glass opacity score, median [Q1,Q3]': 'HRCT score in lavaged lobe, median [Q1,Q3]',
    'Fibrosis score, median [Q1,Q3]': '',
}
table1.tableone.level_0 = table1.tableone.level_0.replace(lobe_headers)

In [89]:
# Label the two time-since-diagnosis rows by condition in level_1.
ssc_duration_rows = table1.tableone.level_0.eq('Time since SSc diagnosis, years, median [Q1,Q3]')
table1.tableone.loc[ssc_duration_rows, 'level_1'] = 'SSc'
ild_duration_rows = table1.tableone.level_0.eq('Time since SSc-ILD diagnosis, years, median [Q1,Q3]')
table1.tableone.loc[ild_duration_rows, 'level_1'] = 'SSc-ILD'

In [90]:
# One shared header for the duration rows; the Dyspnea header is blanked as well.
merged_headers = {
    'Time since SSc diagnosis, years, median [Q1,Q3]': 'Time since diagnosis, years, median [Q1,Q3]',
    'Time since SSc-ILD diagnosis, years, median [Q1,Q3]': '',
    'Dyspnea, n': '',
}
table1.tableone.level_0 = table1.tableone.level_0.replace(merged_headers)

In [91]:
# Shorten the sub-row labels of the average HRCT scores.
short_sublabels = {
    'Average Ground-glass opacity score per lobe, median [Q1,Q3]': 'Ground glass opacity',
    'Average Fibrosis score per lobe, median [Q1,Q3]': 'Fibrosis',
}
table1.tableone.level_1 = table1.tableone.level_1.replace(short_sublabels)

In [92]:
# Remove the literal ", n" from sub-row labels (the pattern has no regex metacharacters).
level_1_labels = table1.tableone.level_1
table1.tableone.level_1 = level_1_labels.str.replace(', n', '', regex=False)

In [93]:
# Turn the two label columns back into the row index (the columns are dropped by default).
table1.tableone = table1.tableone.set_index(["level_0", "level_1"])

In [94]:
# Remove the index level names.
table1.tableone.index.names = [None, None]

In [95]:
# Drop column-level entries that no longer occur in any remaining column.
table1.tableone.columns = table1.tableone.columns.remove_unused_levels()

In [96]:
# Render as LaTeX, then undo the escaping of backslashes and braces so the
# \newline / \textit markup appended earlier is emitted as LaTeX commands.
latex_table = table1.tabulate(tablefmt="latex")
for escaped, literal in (('\\textbackslash{}', '\\'), ('\\{', '{'), ('\\}', '}')):
    latex_table = latex_table.replace(escaped, literal)
print(latex_table)

\begin{tabular}{llll}
\hline
                                             &                               & SSc                                         & control                                     \\
\hline
 n                                           &                               & 9                                           & 13                                          \\
 Age, years, median [Q1,Q3]                  &                               & 53.8 [37.9,60.4]                            & 29.0 [24.0,34.0]                            \\
 Sex, n                                      & Female                        & 6 (67\%)                                     & 9 (69\%)                                     \\
                                             & Male                          & 3 (33\%)                                     & 4 (31\%)                                     \\
 Race, n                                     & African American              & 1 (11\%)             

Copying the above to [Overleaf project](https://www.overleaf.com/project/641e418abce263520201027a) to get nice pdf table