## Notebook purpose:
In this notebook we take patient level amyloid diagnosis data for our cohort and merge it with chart review data.

### Datasets:
1. **cohort patient labels** is the dataframe of labels for patients in our cohort that is output from the notebook `merge__labels_clinical_chart_review_tafamidis_2023.ipynb`
2. Labels from clinical chart review following an analysis of clashes in 1, and labeling of notes and mri reports of patients with amyloid ICD codes. **These labels replace the labels from source 1**.
  - Gold Standard
  - Classes: 
    - AL
      - mapped to (POSITIVE, AL, NaN)
    - hTTR
      - mapped to (POSITIVE, TTR, HTTR)
    - wTTR
      - mapped to (POSITIVE, TTR, INDETERMINATE)
    - TTR - subtype pending
      - mapped to (POSITIVE, TTR, TTR - w/u pending)
    - positive - subtype pending
      - mapped to (POSITIVE, INDETERMINATE, NaN)
    - negative
      - mapped to (NEGATIVE, NaN, NaN)
    - other
      - mapped to (INDETERMINATE, NaN, NaN)
    - other - unknown
      - mapped to (INDETERMINATE, NaN, NaN)


### Output
**cohort_amyloid_labels__chart_reviewed_2023.csv** is the dataframe of labels for patients in our cohort. The `label__amyloid_diagnosis`, `label__amyloid_subtype_diagnosis`, `label__ttr_amyloid_subtype_diagnosis`, `label__amyloid_diagnosis_date` columns hold the labels obtained by merging the previous labels with the chart review data. 


In [34]:
import numpy as np
import pandas as pd
from pathlib import Path


# Lift pandas' display truncation: show full cell contents and every row.
for display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(display_option, None)


In [69]:
# Root directory of the Amyloid data
DATASET_PATH = Path("/data/datasets/Amyloidosis/")

# New chart review workbook
cohort_chart_reviews_path = DATASET_PATH.joinpath(
    "patient_amyloid_diagnosis", "cohort_chart_reviews_simons_copy.xlsx"
)

# 2023 chart review workbook
chart_reviews_2023_path = DATASET_PATH.joinpath("2023 pull", "Amyloid_Clinic_Anna.xlsx")

# Patient level diagnoses from cardiac path reports, pyp reports, and mayo labs,
# merged with clinic chart review labels.
# This yields the cohort amyloid labels, which are GOLD STANDARD for amyloid diagnosis.
cohort_amyloid_labels_path = DATASET_PATH.joinpath(
    "2023 pull", "patient_amyloid_diagnosis", "cohort_amyloid_labels 2023.csv"
)


### Load Cohort Labels Data

In [70]:
# Read the cohort amyloid labels CSV (path configured above) into a dataframe.
cohort_labels = pd.read_csv(cohort_amyloid_labels_path)

In [74]:
# Convert the label diagnosis date column from its CSV representation to datetimes.
cohort_labels["label__amyloid_diagnosis_date"] = pd.to_datetime(
    cohort_labels["label__amyloid_diagnosis_date"]
)

In [None]:
# Distinct patient ids in the labels frame (a NaN id, if present, counts as one value).
gold_standard_patient_count = cohort_labels["ir_id"].nunique(dropna=False)
print(f'We have {gold_standard_patient_count} patients with a gold standard amyloid diagnosis.')

# Distribution (including missing values) of each pre-chart-review "final" label.
for final_label_column in (
    "final__amyloid_diagnosis",
    "final__amyloid_subtype_diagnosis",
    "final__ttr_amyloid_subtype_diagnosis",
):
    print(cohort_labels[final_label_column].value_counts(dropna=False))

### Load Cohort Chart Review Data

In [77]:
# Load the chart review workbook and keep only rows that were given an amyloid type.
chart_reviews = pd.read_excel(cohort_chart_reviews_path)
chart_reviews = chart_reviews.loc[chart_reviews["amyloid_type"].notna()]

chart_reviews["ir_id"] = chart_reviews["ir_id"].astype(int)

# Free-text columns: cast to the nullable string dtype and trim surrounding whitespace.
for text_column in (
    "label",
    "document_description",
    "diagnosis_method",
    "AL_cardiac_test",
    "amyloid_type",
):
    chart_reviews[text_column] = chart_reviews[text_column].astype("string").str.strip()

chart_reviews["priority"] = chart_reviews["priority"].astype(int)

# Dates that cannot be parsed become NaT instead of raising.
chart_reviews["diagnosis_date_norm"] = pd.to_datetime(
    chart_reviews["diagnosis_date_norm"], errors="coerce"
)

# Remove the "Unnamed: 0" column when the workbook contains one.
chart_reviews = chart_reviews.drop(columns=["Unnamed: 0"], errors="ignore")

In [78]:
# Guard: every chart review row must belong to a distinct patient (ir_id).
assert chart_reviews.shape[0] == chart_reviews["ir_id"].nunique(), "Some patients have more than one record"

# Add the prefix 'full_chart_reviews__' to every column except the 'ir_id' key.
# Renaming by column name (instead of by position) keeps 'ir_id' unprefixed even
# when it is not the first column of the sheet; column order is left unchanged.
chart_reviews = chart_reviews.rename(
    columns=lambda column: column if column == "ir_id" else f"full_chart_reviews__{column}"
)

In [80]:
# Discard the un-normalised date columns and expose the reviewer's original
# label under the name 'full_chart_reviews__suggested_label'.
chart_reviews = chart_reviews.drop(
    columns=["full_chart_reviews__diagnosis_date", "full_chart_reviews__label_date"]
).rename(columns={"full_chart_reviews__label": "full_chart_reviews__suggested_label"})

In [None]:
# Number of distinct patients in the chart review sheet.
chart_review_patient_count = chart_reviews["ir_id"].nunique()
print(f'We have {chart_review_patient_count} patients Chart review.')

# Distribution of chart review amyloid types (missing values included).
chart_reviews["full_chart_reviews__amyloid_type"].value_counts(dropna=False)


### Read Chart Review 2023 Data

In [83]:
# Load the 2023 chart review workbook and bring it to the same schema as
# `chart_reviews` (ir_id + 'full_chart_reviews__*' columns).
chart_reviews_2023 = pd.read_excel(chart_reviews_2023_path)

chart_reviews_2023["ir_id"] = chart_reviews_2023["ir_id"].astype(int)
chart_reviews_2023 = chart_reviews_2023.drop(columns=["DOB", "Unnamed: 7", "Genetics"])
# Trim whitespace so that labels such as "AL " still match the mapping below
# (the other chart review sheet's text columns are stripped the same way).
chart_reviews_2023["Label"] = chart_reviews_2023["Label"].astype("string").str.strip()
chart_reviews_2023["Date Of Diagnosis"] = pd.to_datetime(chart_reviews_2023["Date Of Diagnosis"])
chart_reviews_2023["Column2"] = chart_reviews_2023["Column2"].astype("string")
chart_reviews_2023["Column3"] = chart_reviews_2023["Column3"].astype("string")

chart_reviews_2023 = chart_reviews_2023.rename(columns={
    "Label": "full_chart_reviews__amyloid_type",
    "Date Of Diagnosis": "full_chart_reviews__diagnosis_date_norm",
    "Column2": "full_chart_reviews__diagnosis_method",
    "Column3": "full_chart_reviews__Notes",
})

# Translate the 2023 sheet's label vocabulary into the amyloid types used by
# the other chart review sheet.
label_normalization_2023 = {
    "wtATTR": "wTTR",
    "hATTR": "hTTR",
    "wtTTR": "wTTR",
    "ATTR": "TTR - subtype pending",
    "AL": "AL",
}

raw_labels_2023 = chart_reviews_2023["full_chart_reviews__amyloid_type"]

# `.map` turns every label missing from the dictionary into NaN. Report such
# labels instead of losing them silently.
unmapped_labels_2023 = sorted(set(raw_labels_2023.dropna()) - set(label_normalization_2023))
if unmapped_labels_2023:
    print(f"WARNING: 2023 chart review labels without a mapping (set to NaN): {unmapped_labels_2023}")

chart_reviews_2023["full_chart_reviews__amyloid_type"] = raw_labels_2023.map(label_normalization_2023)

In [None]:
# Number of distinct patients in the 2023 chart review sheet.
chart_review_2023_patient_count = chart_reviews_2023["ir_id"].nunique()
print(f'We have {chart_review_2023_patient_count} patients Chart review.')

# Distribution of the normalised 2023 amyloid types (missing values included).
chart_reviews_2023["full_chart_reviews__amyloid_type"].value_counts(dropna=False)


In [87]:
# Stack both chart review sources row-wise into a single frame.
# NOTE(review): nothing here checks whether a patient (ir_id) appears in both
# sources; such a patient would get two rows after the merge on ir_id — confirm.
final_chart_reviews = pd.concat(objs=[chart_reviews, chart_reviews_2023], axis=0)

In [None]:
# Distribution of amyloid types across both chart review sources (missing values included).
final_chart_reviews["full_chart_reviews__amyloid_type"].value_counts(dropna=False)

### Merge Labels with Full Chart Review and Clinic Cohort

We use an outer merge here because some patients are present in the chart review cohort and not in the labels

In [89]:
# Outer join on the patient id keeps patients present in only one of the two frames.
new_cohort_labels = pd.merge(cohort_labels, final_chart_reviews, how="outer", on="ir_id")

In [None]:
# Print the distinct values of the chart review type and of each label column,
# each followed by a blank line.
for heading, column_name in (
    ("chart review values:\n", "full_chart_reviews__amyloid_type"),
    ("amyloid labels:\n ", "label__amyloid_diagnosis"),
    ("amyloid subtype labels:\n ", "label__amyloid_subtype_diagnosis"),
    ("ttr amyloid subtype labels:\n ", "label__ttr_amyloid_subtype_diagnosis"),
):
    print(f"{heading}{new_cohort_labels[column_name].unique()}")
    print()


In [91]:
# One row per chart review amyloid type:
#   (amyloid diagnosis, amyloid subtype diagnosis, TTR amyloid subtype diagnosis)
_chart_review_label_table = {
    "AL": ("POSITIVE", "AL", np.nan),
    "negative": ("NEGATIVE", np.nan, np.nan),
    "hTTR": ("POSITIVE", "TTR", "HTTR"),
    "wTTR": ("POSITIVE", "TTR", "INDETERMINATE"),
    "other - unknown": ("INDETERMINATE", np.nan, np.nan),
    "other": ("INDETERMINATE", np.nan, np.nan),
    "positive - subtype pending": ("POSITIVE", "INDETERMINATE", np.nan),
    "TTR - subtype pending": ("POSITIVE", "TTR", "TTR - w/u pending"),
}

# Per-column lookup dictionaries derived from the table above.
amyloid_map = {
    amyloid_type: labels[0] for amyloid_type, labels in _chart_review_label_table.items()
}
amyloid_subtype_map = {
    amyloid_type: labels[1] for amyloid_type, labels in _chart_review_label_table.items()
}
ttr_amyloid_subtype_map = {
    amyloid_type: labels[2] for amyloid_type, labels in _chart_review_label_table.items()
}

In [92]:
# Flag rows that carry a chart review amyloid type.
new_cohort_labels["full_chart_review"] = ~new_cohort_labels["full_chart_reviews__amyloid_type"].isna()

In [93]:
# Split the merged frame into independent copies: rows with a chart review
# amyloid type and rows without one.
has_full_chart_review = new_cohort_labels["full_chart_review"]
chart_reviewed_records = new_cohort_labels.loc[has_full_chart_review].copy(deep=True)
other_records = new_cohort_labels.loc[~has_full_chart_review].copy(deep=True)

In [94]:
# For chart-reviewed patients, the chart review replaces the existing labels.
reviewed_amyloid_types = chart_reviewed_records["full_chart_reviews__amyloid_type"]

# `.map` yields NaN for any amyloid type that is not a key of the mapping
# dictionaries. Report such types so a chart-reviewed patient does not end up
# with empty labels unnoticed.
unmapped_amyloid_types = sorted(set(reviewed_amyloid_types.dropna()) - set(amyloid_map))
if unmapped_amyloid_types:
    print(f"WARNING: chart review amyloid types without a label mapping (labels set to NaN): {unmapped_amyloid_types}")

# The diagnosis date is taken from the chart review as-is.
chart_reviewed_records["label__amyloid_diagnosis_date"] = chart_reviewed_records[
    "full_chart_reviews__diagnosis_date_norm"
]

# Each label column is derived from the amyloid type through its own mapping.
for label_column, label_mapping in (
    ("label__amyloid_diagnosis", amyloid_map),
    ("label__amyloid_subtype_diagnosis", amyloid_subtype_map),
    ("label__ttr_amyloid_subtype_diagnosis", ttr_amyloid_subtype_map),
):
    chart_reviewed_records[label_column] = reviewed_amyloid_types.map(label_mapping)


In [96]:
# Recombine both groups, order rows by patient id and rebuild a clean 0..n-1 index.
chart_reviewed_cohort_labels = (
    pd.concat([chart_reviewed_records, other_records])
    .sort_values(by="ir_id")
    .reset_index(drop=True)
)



In [97]:
# Display the column index of the recombined labels frame.
chart_reviewed_cohort_labels.columns

Index(['ir_id', 'cardiac_path__amyloid_diagnosis', 'pyp__amyloid_diagnosis',
       'mayo__amyloid_diagnosis', 'mayo__amyloid_subtype_diagnosis',
       'mayo__ttr_amyloid_subtype_diagnosis', 'final__amyloid_diagnosis',
       'final__amyloid_diagnosis_date', 'final__amyloid_subtype_diagnosis',
       'final__ttr_amyloid_subtype_diagnosis', 'chart_reviews__Amyloid_type',
       'chart_reviews__Method_of_diagnosis',
       'chart_reviews__amyloid_diagnosis_date',
       'chart_reviews__Age_at_Diagnosis', 'chart_reviews__amyloid_diagnosis',
       'chart_reviews__amyloid_subtype_diagnosis',
       'chart_reviews__ttr_amyloid_subtype_diagnosis',
       'merge_chart_reviews_consistency',
       'merge_chart_reviews_consistency_description', 'Tafamidis_cohort_entry',
       'Tafamidis_cohort_entry_date', 'merge_tafamidis_consistency',
       'merge_tafamidis_consistency_description', 'label__amyloid_diagnosis',
       'label__amyloid_subtype_diagnosis',
       'label__ttr_amyloid_subtype_di

In [98]:
# Final column order of the output file: patient id, merged labels, provenance
# flags, then the source-specific columns.
reordered_columns = [
    "ir_id",
    "label__amyloid_diagnosis",
    "label__amyloid_subtype_diagnosis",
    "label__ttr_amyloid_subtype_diagnosis",
    "label__amyloid_diagnosis_date",
    "full_chart_review",
    "pyp_or_tafamidis_only",
    "label__chart_review",
    "diagnosis__chart_review",
    "cardiac_path__amyloid_diagnosis",
    "pyp__amyloid_diagnosis",
    "mayo__amyloid_diagnosis",
    "mayo__amyloid_subtype_diagnosis",
    "mayo__ttr_amyloid_subtype_diagnosis",
    "final__amyloid_diagnosis",
    "final__amyloid_diagnosis_date",
    "final__amyloid_subtype_diagnosis",
    "final__ttr_amyloid_subtype_diagnosis",
    "chart_reviews__Amyloid_type",
    "chart_reviews__Method_of_diagnosis",
    "chart_reviews__amyloid_diagnosis_date",
    "chart_reviews__Age_at_Diagnosis",
    "chart_reviews__amyloid_diagnosis",
    "chart_reviews__amyloid_subtype_diagnosis",
    "chart_reviews__ttr_amyloid_subtype_diagnosis",
    "merge_chart_reviews_consistency",
    "merge_chart_reviews_consistency_description",
    "Tafamidis_cohort_entry",
    "Tafamidis_cohort_entry_date",
    "merge_tafamidis_consistency",
    "merge_tafamidis_consistency_description",
    "full_chart_reviews__suggested_label",
    "full_chart_reviews__document_description",
    "full_chart_reviews__priority",
    "full_chart_reviews__amyloid_type",
    "full_chart_reviews__diagnosis_method",
    "full_chart_reviews__diagnosis_date_norm",
    "full_chart_reviews__AL_cardiac_test",
    "full_chart_reviews__Notes",
]

# The list must name exactly the columns of the dataframe, so the reordering
# below neither drops a column nor fails on a missing one. On mismatch, say
# which columns differ instead of failing with a bare AssertionError.
actual_columns = list(chart_reviewed_cohort_labels.columns)
missing_columns = sorted(set(reordered_columns) - set(actual_columns))
unexpected_columns = sorted(set(actual_columns) - set(reordered_columns))
assert sorted(actual_columns) == sorted(reordered_columns), (
    "Column mismatch between the dataframe and `reordered_columns`: "
    f"listed but not in dataframe={missing_columns}, "
    f"in dataframe but not listed={unexpected_columns}"
)

chart_reviewed_cohort_labels = chart_reviewed_cohort_labels[reordered_columns]



#### Counting the patients before and after this merge

In [None]:
# Amyloid diagnosis label distribution before vs. after applying the chart reviews.
diagnosis_counts_before = cohort_labels["label__amyloid_diagnosis"].value_counts(dropna=False)
diagnosis_counts_after = chart_reviewed_cohort_labels["label__amyloid_diagnosis"].value_counts(dropna=False)
print(f"amyloid cases before:\n{diagnosis_counts_before}")
print()
print(f"amyloid cases after:\n{diagnosis_counts_after}")


In [None]:
# Amyloid subtype label distribution before vs. after applying the chart reviews.
subtype_counts_before = cohort_labels["label__amyloid_subtype_diagnosis"].value_counts(dropna=False)
subtype_counts_after = chart_reviewed_cohort_labels["label__amyloid_subtype_diagnosis"].value_counts(dropna=False)
print(f"amyloid subtype before:\n{subtype_counts_before}")
print()
print(f"amyloid subtype after:\n{subtype_counts_after}")


In [None]:
# TTR amyloid subtype label distribution before vs. after applying the chart reviews.
ttr_subtype_counts_before = cohort_labels["label__ttr_amyloid_subtype_diagnosis"].value_counts(dropna=False)
ttr_subtype_counts_after = chart_reviewed_cohort_labels["label__ttr_amyloid_subtype_diagnosis"].value_counts(dropna=False)
print(f"ttr amyloid subtype before:\n{ttr_subtype_counts_before}")
print()
print(f"ttr amyloid subtype after:\n{ttr_subtype_counts_after}")


### Saving the data.
We save 1 file:
- updated labels with new chart review data

The next cell writes the merged, chart-reviewed labels dataframe to a CSV file.

In [103]:
# Patient level diagnoses from cardiac path reports, pyp reports, mayo labs,
# tafamidis, and clinical chart review, merged with the new chart review labels.
# The result, cohort_amyloid_labels__chart_reviewed, is the GOLD STANDARD for amyloid diagnosis.
chart_reviewed_cohort_amyloid_labels_path = DATASET_PATH.joinpath(
    "2023 pull",
    "patient_amyloid_diagnosis",
    "cohort_amyloid_labels__chart_reviewed_2023.csv",
)

# Write the labels without the dataframe index.
chart_reviewed_cohort_labels.to_csv(chart_reviewed_cohort_amyloid_labels_path, index=False)
