## Notebook purpose:
Count the labels in the labeled cohort file.


In [None]:
import pandas as pd

from file_parsing.echomaster_file import load_echomaster
from file_parsing.labeled_cohort_file import load_labeled_cohort

# Show full cell contents and every row when a frame or series is displayed.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

### Load Patient-Level Labels

In [None]:
# Read the patient level diagnosis data
labels = load_labeled_cohort()

# Inspect labels.columns to see everything the loader returns.
# We keep the columns below; columns specific to the individual datasets
# (cardiac path reports, pyp reports, and mayo labs) can be added as well.
# The explicit .copy() makes the subset an independent frame, so the column
# assignment further down cannot raise pandas' SettingWithCopyWarning.
labels = labels[
    [
        "ir_id",
        "label__amyloid_diagnosis_date",
        "label__amyloid_diagnosis",
        "label__amyloid_subtype_diagnosis",
        "label__ttr_amyloid_subtype_diagnosis",
        "full_chart_review",  # Flag for full chart review
        "label__chart_review",  # Flag for chart review
        "pyp_or_tafamidis_only",  # Flag for pyp or tafamidis
        "Tafamidis_cohort_entry",  # Flag for Tafamidis
        "Amyloidosis",  # Flag for Amyloidosis ICD code
        "label__missing_diagnosis",  # Flag for ICD code but no diagnosis
        "echos_cohort_entry",  # Flag for patient in echos cohort
        "notes_cohort_entry",  # Flag for patient in notes cohort
        "HF_cohort_entry",  # Flag for patient in HF cohort
        "patient_group__amyloid_cases",  # Flag for patient with ICD or confirmed diagnosis
        "patient_group__HF_control",  # Flag for confirmed negative, no ICD, no indeterminate and HF
        "patient_group__non_HF_control",  # Flag for confirmed negative, no ICD, no indeterminate and not HF
    ]
].copy()
# TODO: Add condition to check if full chart review was done
# 1 when the diagnosis label is exactly "POSITIVE" or "NEGATIVE", otherwise 0.
labels["label__definitive"] = (
    labels["label__amyloid_diagnosis"].isin(["POSITIVE", "NEGATIVE"]).astype(int)
)


In [None]:
# Number of distinct ir_id values in the label table.
labels["ir_id"].nunique()

In [None]:
# Rows per chart-review flag value (value_counts omits missing values by default).
labels["label__chart_review"].value_counts()

In [None]:
# Headline count of distinct ir_id values in the label table.
# NOTE(review): this counts every ir_id in `labels`, not only patients with a
# definitive label -- confirm the message wording is what is intended.
print(
    f'We have {labels["ir_id"].nunique(dropna=False)} patients with a gold standard amyloid diagnosis.'
)

# Overall diagnosis label distribution, missing values included.
print(labels["label__amyloid_diagnosis"].value_counts(dropna=False))

# Distribution of the derived definitive / non-definitive flag.
print(labels["label__definitive"].value_counts(dropna=False))

# Amyloid subtype distribution among rows labelled POSITIVE.
print(
    labels.loc[
        labels["label__amyloid_diagnosis"] == "POSITIVE",
        "label__amyloid_subtype_diagnosis",
    ].value_counts(dropna=False)
)

# TTR subtype distribution among rows whose amyloid subtype is TTR.
print(
    labels.loc[
        labels["label__amyloid_subtype_diagnosis"] == "TTR",
        "label__ttr_amyloid_subtype_diagnosis",
    ].value_counts(dropna=False)
)

In [None]:
# Distinct patients flagged by Tafamidis_cohort_entry.
# (The drug name was misspelled as "Tafimifis" in the printed messages.)
print(
    f'We have {len(labels[labels["Tafamidis_cohort_entry"] == 1]["ir_id"].unique())} patients with Amyloid because they take Tafamidis medication.'
)
# Distinct patients flagged by pyp_or_tafamidis_only.
print(
    f'We have {len(labels[labels["pyp_or_tafamidis_only"] == 1]["ir_id"].unique())} patients with Amyloid because they take Tafamidis medication or have positive PYP only.'
)
# Distinct patients in the whole label table.
print(
    f'We have {len(labels["ir_id"].unique())} patients in the cohort.'
)

In [None]:
# Distinct patients whose Amyloidosis (ICD code) flag equals 1.
print(
    f'We have {labels.loc[labels["Amyloidosis"] == 1, "ir_id"].nunique(dropna=False)} patients with an Amyloid ICD code.'
)



In [None]:
# Distribution of the Amyloidosis ICD-code flag, missing values included.
labels.loc[:, "Amyloidosis"].value_counts(dropna=False)

In [None]:
# Distribution of the "ICD code but no diagnosis" flag, missing values included.
labels.loc[:, "label__missing_diagnosis"].value_counts(dropna=False)

### Load Echos Data

In [None]:
# Load the echo study table.
echos = load_echomaster()

# Filter the echos to keep studies that are not limited and have a desired type.
keep_echo_types = [
    "Transthoracic",
    "Exercise Stress",
    "Pharmacological Stress",
    "Stress Type Unknown",
]
echos = echos.loc[
    echos["echo_type"].isin(keep_echo_types) & (echos["limited_echo"] == 0)
]
# Drop records that have an echo extractor id (keep rows where it is missing).
echos = echos[echos["echo_extractor_id"].isna()]
# Sort chronologically so that the aggregation below yields each patient's
# echo information in chronological order. The result is reassigned instead of
# sorting in place: in-place mutation of a frame derived from a filter is what
# pandas' SettingWithCopyWarning guards against.
echos = echos.sort_values(by=["echo_date"])

# One row per ir_id; every other column becomes a list of that patient's values.
echos_by_patient = echos.groupby("ir_id").agg(list).reset_index()

In [None]:
# Number of echo rows remaining after filtering.
print(
    f'We have {len(echos)} echos.'
)
# Number of distinct patients that have at least one remaining echo.
print(
    f'We have {len(echos_by_patient["ir_id"].unique())} patients with echos.'
)

# Count distinct ir_id values rather than rows, so that the figure really is a
# number of patients, like the other patient counts in this notebook.
print(
    f'We have {len(labels[labels["echos_cohort_entry"] == 1]["ir_id"].unique())} patients with echos in the cohort.'
)

### Load Notes Data

In [None]:
# Overall diagnosis label distribution, missing values included.
labels["label__amyloid_diagnosis"].value_counts(dropna=False)

In [None]:
# Amyloid subtype distribution among rows labelled POSITIVE.
labels.loc[
    labels["label__amyloid_diagnosis"] == "POSITIVE",
    "label__amyloid_subtype_diagnosis",
].value_counts(dropna=False)

In [None]:
# TTR subtype distribution among rows whose amyloid subtype is TTR.
labels.loc[
    labels["label__amyloid_subtype_diagnosis"] == "TTR",
    "label__ttr_amyloid_subtype_diagnosis",
].value_counts(dropna=False)

In [None]:
# Diagnosis label distribution restricted to rows flagged as in the echos cohort.
labels.loc[
    labels["echos_cohort_entry"] == 1,
    "label__amyloid_diagnosis",
].value_counts(dropna=False)

In [None]:
# Diagnosis label distribution restricted to rows flagged as in the notes cohort.
labels.loc[
    labels["notes_cohort_entry"] == 1,
    "label__amyloid_diagnosis",
].value_counts(dropna=False)

In [None]:
# Distribution of the Amyloidosis ICD-code flag, missing values included.
labels["Amyloidosis"].value_counts(dropna=False)

In [None]:
# Distribution of the "pyp or tafamidis only" flag, missing values included.
labels["pyp_or_tafamidis_only"].value_counts(dropna=False)

In [None]:
# Distribution of the amyloid-cases patient group flag, missing values included.
labels["patient_group__amyloid_cases"].value_counts(dropna=False)


In [None]:
# Distribution of the HF-control patient group flag, missing values included.
labels["patient_group__HF_control"].value_counts(dropna=False)

In [None]:
# Distribution of the non-HF-control patient group flag, missing values included.
labels["patient_group__non_HF_control"].value_counts(dropna=False)