# Generate tables and figures for paper

In [1]:
import pickle
from pathlib import Path

import nibabel as nib
import pandas as pd

from livingpark_utils.zeighamietal.constants import (
    COL_AGE,
    COL_PAT_ID, 
    COL_SEX,
    COL_FOLLOWUP,
    COL_STATUS, 
    COL_UPDRS2,
    COL_UPDRS3,
    COL_ADL,
    COL_UPDRS1,
    COL_MOCA,
    COL_PIGD,
    COL_GCO,
    STATUS_HC,
    STATUS_PD,
)

In [48]:
experiment_map = {
    1: '4554467823934740347_nlr4_dbm8_from_nifti_old_pipeline_qc',
    2: '4554467823934740347_nlr4_dbm8_from_nifti_old_pipeline_qc-30',
    3: '_460520092359662532_from_nifti_old_pipeline_qc',
    4: '_460520092359662532_from_nifti_old_pipeline_qc-30',
    5: '_460520092359662532_old_pipeline_qc',
    6: '_460520092359662532_old_pipeline_qc-30',
    7: '_460520092359662532_from_nifti_qc',
    8: '_460520092359662532_from_nifti_qc-30',
    9: '_460520092359662532_qc',
    10: '_460520092359662532_qc-30',
    11: '6874312680158848047_qc-30',
}

## Load results

In [76]:
experiment = 11
long_tag = experiment_map[experiment]

dpath_outputs = Path("outputs")
fpath_results = dpath_outputs / f'results-{long_tag}.pkl'

with fpath_results.open('rb') as file_results:
    results = pickle.load(file_results)

# print(f'auc:\t\t{results["auc"]}')
# print(f'auc_p_value:\t{results["auc_p_value"]}')
print(f'ica_corr:\t{results["ica_corr"]}')

# fill_width = max(len(key) for key in results.keys())
# for key, value in results.items():
#     if isinstance(value, nib.nifti1.Nifti1Image):
#         value = f'<Nifti1Image{value.shape}>'
#     elif isinstance(value, pd.DataFrame):
#         value = f'<pd.DataFrame{value.shape}>'
#     elif 'subjects' in key:
#         value = f'<{len(value)} subjects>'
#     print(f'{key.ljust(fill_width)}\t\t{value}')

ica_corr:	0.7237659999014978


## Pre-DBM demographics table (PD and HC)

In [43]:
df_assessments = pd.read_csv(results['fpath_assessments'])
# df_assessments.describe()

In [44]:
# print(df_assessments[COL_FOLLOWUP].value_counts(dropna=False))
# for is_followup in df_assessments[COL_FOLLOWUP].drop_duplicates():
#     print(f'===== is_followup: {is_followup} =====')
#     print(
#         df_assessments.loc[
#             df_assessments[COL_FOLLOWUP] == is_followup, 
#             COL_STATUS,
#         ].value_counts(dropna=False)
#     )

In [45]:
def to_1_decimal_str(f):
    return str(round(f, 1))

col_male = "is_male"
col_cohort = "cohort"
col_age_diag = "age_diag"
COL_FOLLOWUP = 'is_followup'

# only use baseline 
df_assessments_baseline = df_assessments.loc[~df_assessments[COL_FOLLOWUP]]

# append
dfs_summary = []
for status in [STATUS_PD, STATUS_HC]:
    df_summary = df_assessments_baseline.loc[df_assessments_baseline[COL_STATUS] == status]
    df_summary = df_summary.drop(columns=[COL_PAT_ID, COL_SEX])
    df_summary[col_cohort] = f"{status} (n = {len(df_summary)})"
    dfs_summary.append(df_summary)

df_summary = pd.concat(dfs_summary)
df_summary_means = (
    df_summary.groupby([col_cohort]).mean(numeric_only=True).applymap(to_1_decimal_str)
)
df_summary_stds = (
    df_summary.groupby([col_cohort]).std(numeric_only=True).applymap(to_1_decimal_str)
)
df_summary_stds = " (" + df_summary_stds + ")"
df_summary_stds.loc[:, col_male] = ""
df_summary_combined = (df_summary_means + df_summary_stds).T
df_summary_combined = df_summary_combined.applymap(lambda x: "-" if "nan" in x else x)
df_summary_combined = df_summary_combined.loc[[COL_AGE, col_male, COL_MOCA]]
df_summary_combined = df_summary_combined.rename(
    index={
        col_male: "Male (%)",
        COL_MOCA: "MoCA",
        COL_AGE: "Age",
    }
)

print(long_tag)
df_summary_combined

6874312680158848047_qc-30


cohort,Healthy Control (n = 116),Parkinson's Disease (n = 230)
Age,59.7 (11.3),61.3 (9.1)
Male (%),66.4,63.9
MoCA,28.3 (1.2),27.3 (2.2)


## Classification demographics table (PD only)

In [46]:
results['df_assessments_classification'][COL_FOLLOWUP].value_counts(dropna=False)

False    221
True     221
Name: is_followup, dtype: int64

In [47]:
def to_1_decimal_str(f):
    return str(round(f, 1))


# only report subjects used in the classification
# subjects = df_assessments_with_followup.loc[
#     (
#         df_assessments_with_followup[COL_FOLLOWUP], 
#     COL_PAT_ID
# ].drop_duplicates()

# the paper only report measures for the PD patients
# subjects_pd = df_status.loc[
#     (df_status[COL_PAT_ID].isin(subjects)) & (df_status[COL_STATUS] == STATUS_PD),
#     COL_PAT_ID,
# ]

# filter
# df_summary = df_assessments_with_followup.loc[
#     df_assessments_with_followup[COL_PAT_ID].isin(subjects_pd),
# ]

# append
dfs_summary = []
df_assessments_classification: pd.DataFrame = results['df_assessments_classification']
for is_followup in df_assessments_classification[COL_FOLLOWUP].drop_duplicates():
    df_summary = df_assessments_classification.loc[df_assessments_classification[COL_FOLLOWUP] == is_followup]
    df_summary[col_cohort] = f"PD patients (n = {len(df_assessments_classification.drop_duplicates(COL_PAT_ID))})"
    dfs_summary.append(df_summary)

df_summary = pd.concat(dfs_summary)
print(df_summary.columns)
df_summary = df_summary.drop(columns=[COL_PAT_ID, COL_AGE, COL_PIGD, 'INFODT', 'EVENT_ID', 'COHORT_DEFINITION'])
df_summary_means = (
    df_summary.groupby([col_cohort, COL_FOLLOWUP]).mean().applymap(to_1_decimal_str)
)
df_summary_stds = (
    df_summary.groupby([col_cohort, COL_FOLLOWUP]).std().applymap(to_1_decimal_str)
)
df_summary_stds = " (" + df_summary_stds + ")"
df_summary_stds.loc[:, col_male] = ""
df_summary_combined = (df_summary_means + df_summary_stds).T
df_summary_combined = df_summary_combined.loc[
    [col_age_diag, col_male, COL_UPDRS2, COL_UPDRS3, COL_ADL, COL_UPDRS1, COL_MOCA, COL_GCO]
]
# df_summary_combined = df_summary_combined.applymap(lambda x: "-" if "nan" in x else x)
df_summary_combined = df_summary_combined.rename(
    index={
        col_age_diag: "Age at diagnosis",
        col_male: "Male (%)",
        COL_UPDRS2: "UPDRS Part II",
        COL_UPDRS3: "UPDRS Part III",
        COL_ADL: "S&E ADL",
        # COL_PIGD: "PIGD",
        COL_UPDRS1: "UPDRS Part I",
        COL_MOCA: "MoCA",
        COL_GCO: "Global composite outcome",
    }
)
df_summary_combined

Index(['PATNO', 'INFODT', 'EVENT_ID', 'NP2PTOT', 'NP3TOT', 'MSEADLG', 'PIGD',
       'NP1RTOT+NP1PTOT', 'MCATOT', 'is_followup', 'AGE_AT_VISIT',
       'COHORT_DEFINITION', 'SEX', 'BIRTHDT', 'PDDXDT', 'age_diag', 'is_male',
       'GCO', 'cohort'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summary[col_cohort] = f"PD patients (n = {len(df_assessments_classification.drop_duplicates(COL_PAT_ID))})"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summary[col_cohort] = f"PD patients (n = {len(df_assessments_classification.drop_duplicates(COL_PAT_ID))})"
  df_summary.groupby([col_cohort, COL_FOLLOWUP]).mean().applymap(to_1_decimal_str)
  df_summary.groupby([col_cohort, COL_FOLLOWUP]).std().applymap(to_1_decimal_str)


cohort,PD patients (n = 221),PD patients (n = 221)
is_followup,False,True
Age at diagnosis,60.7 (9.1),60.7 (9.1)
Male (%),64.3,64.3
UPDRS Part II,5.7 (3.9),10.8 (7.4)
UPDRS Part III,21.8 (9.1),23.1 (12.0)
S&E ADL,93.6 (5.7),86.1 (11.9)
UPDRS Part I,5.4 (3.8),9.5 (6.3)
MoCA,27.3 (2.2),26.5 (3.3)
Global composite outcome,-0.0 (0.4),0.2 (0.6)
