In [1]:
import os

import livingpark_utils


# Set PIP_REQUIRE_VIRTUALENV to "False" for this process before the notebook is initialised.
# NOTE(review): presumably this lets pip install packages outside a virtualenv — confirm.
os.environ["PIP_REQUIRE_VIRTUALENV"] = "False"
# Shared LivingPark helper object; later cells use it to download study files
# and to locate them via `utils.study_files_dir`.
utils = livingpark_utils.LivingParkUtils()
# NOTE(review): appears to install the notebook dependencies and print the run
# timestamp (see the cell output) — confirm against the livingpark_utils docs.
utils.notebook_init()

removing link inputs
removing link outputs
Installing notebook dependencies (see log in install.log)... 
This notebook was run on 2022-10-05 17:52:16 UTC +0000


In [2]:
# PPMI study files compared in this notebook: cohort assignment vs. primary diagnosis.
required_files = ["Participant_Status.csv", "Primary_Clinical_Diagnosis.csv"]

# NOTE(review): seems to fetch only the files that are missing locally
# (the output reports the download as skipped when none are) — confirm.
utils.download_ppmi_metadata(required_files, headless=False)

Download skipped: No missing files!


In [3]:
import numpy as np
import pandas as pd

# Read both study files and keep only the columns used in the comparison.
status = pd.read_csv(
    os.path.join(utils.study_files_dir, "Participant_Status.csv")
)[["PATNO", "STATUS_DATE", "COHORT"]]
prim_diag = pd.read_csv(
    os.path.join(utils.study_files_dir, "Primary_Clinical_Diagnosis.csv")
)[["PATNO", "EVENT_ID", "PRIMDIAG"]]

# Participant identifiers that appear in both tables (sorted, de-duplicated).
common_patno = np.intersect1d(
    prim_diag["PATNO"].unique(),
    status["PATNO"].unique(),
)

print(f"Status participants: {status['PATNO'].nunique()}")
print(f"Primary diagnosis participants: {prim_diag['PATNO'].nunique()}")
print(f"Number of common patients: {common_patno.size}")

Status participants: 2374
Primary diagnosis participants: 2022
Number of common patients: 2022


In [4]:
# Number of participants that are assigned to more than one distinct cohort.
int((status.groupby("PATNO")["COHORT"].nunique() > 1).sum())

0

In [5]:
# Number of participants that have more than one distinct primary diagnosis.
int((prim_diag.groupby("PATNO")["PRIMDIAG"].nunique() > 1).sum())

334

All patients have a single cohort definition, but some have multiple primary diagnoses.

In [6]:
# Participants whose cohort code is 2.
# NOTE(review): the `_HC` suffix suggests code 2 means healthy control — confirm against the PPMI code list.
status_HC = status.loc[status["COHORT"] == 2, "PATNO"].unique()

# Number of status rows per cohort code.
status.groupby(by="COHORT").size()

COHORT
1    1164
2     311
3      81
4     799
9      19
dtype: int64

# Unique diagnosis

## Get mismatch

In [7]:
# Keep the diagnosis rows of participants who only ever received one distinct PRIMDIAG value.
single_diag = prim_diag[
    prim_diag.groupby("PATNO")["PRIMDIAG"].transform("nunique") == 1
]

# How many participants that is.
single_diag["PATNO"].nunique()

1688

In [8]:
# Single-diagnosis participants whose diagnosis code is 17.
# NOTE(review): the `_HC` suffix suggests code 17 means healthy control — confirm against the PPMI code list.
single_diag_HC = single_diag.loc[single_diag["PRIMDIAG"] == 17, "PATNO"].unique()

# Participants per diagnosis code (one row per participant/diagnosis pair).
(
    single_diag
    .drop_duplicates(subset=["PATNO", "PRIMDIAG"])
    .groupby("PRIMDIAG")
    .size()
)

PRIMDIAG
1     968
5       2
7       5
8       1
15      1
16      1
17    593
23     77
24      1
25     33
97      6
dtype: int64

In [9]:
# A participant is a mismatch when exactly one of the two tables flags them
# (symmetric difference of the two `_HC` sets), restricted to participants
# that are present in both the status table and the single-diagnosis table.
mismatch = np.intersect1d(
    np.intersect1d(status["PATNO"].unique(), single_diag["PATNO"].unique()),
    np.setxor1d(status_HC, single_diag_HC),
)
print(f"Number of mismatch: {mismatch.size}")

# Cohort breakdown of the mismatched participants; the trailing semicolon hides the output.
status.loc[status["PATNO"].isin(mismatch)].groupby("COHORT")["PATNO"].nunique();

Number of mismatch: 356


In [10]:
# Diagnosis breakdown of the mismatched participants; the trailing semicolon hides the output.
prim_diag.loc[prim_diag["PATNO"].isin(mismatch)].groupby("PRIMDIAG")["PATNO"].nunique();

## Results

In [11]:
# Cross-tabulate cohort against primary diagnosis for the mismatched participants:
# join both tables on PATNO, then count distinct participants per (COHORT, PRIMDIAG) pair.
(
    status.loc[status["PATNO"].isin(mismatch)]
    .merge(prim_diag.loc[prim_diag["PATNO"].isin(mismatch)], on="PATNO")
    .drop_duplicates(subset=["PATNO", "COHORT", "PRIMDIAG"])
    .groupby(["COHORT", "PRIMDIAG"])["PATNO"]
    .nunique()
    .sort_values(ascending=False)
)

COHORT  PRIMDIAG
4       17          354
2       97            1
3       17            1
Name: PATNO, dtype: int64

Lots of mismatches between COHORT=Prodromal and PRIMDIAG=HC (354 of the 356 mismatched participants).

# Multiple diagnosis

## Get mismatch

In [12]:
# Keep the diagnosis rows of participants who received more than one distinct PRIMDIAG value.
multi_diag = prim_diag[
    prim_diag.groupby("PATNO")["PRIMDIAG"].transform("nunique") > 1
]

# How many participants that is.
multi_diag["PATNO"].nunique()

334

In [13]:
# Multi-diagnosis participants with at least one diagnosis row coded 17.
# NOTE(review): the `_HC` suffix suggests code 17 means healthy control — confirm against the PPMI code list.
multi_diag_HC = multi_diag.loc[multi_diag["PRIMDIAG"] == 17, "PATNO"].unique()

# Participants per diagnosis code (one row per participant/diagnosis pair,
# so a participant is counted once under each of their diagnoses).
(
    multi_diag
    .drop_duplicates(subset=["PATNO", "PRIMDIAG"])
    .groupby("PRIMDIAG")
    .size()
)

PRIMDIAG
1     144
2       2
4       2
5       9
6       1
7      54
9       1
10      1
11     13
12      2
14      2
15     15
16      3
17    229
18      3
23     60
24     72
25     74
97     96
dtype: int64

In [14]:
# A participant is a mismatch when exactly one of the two tables flags them
# (symmetric difference of the two `_HC` sets), restricted to participants
# that are present in both the status table and the multi-diagnosis table.
mismatch_ = np.intersect1d(
    np.intersect1d(status["PATNO"].unique(), multi_diag["PATNO"].unique()),
    np.setxor1d(status_HC, multi_diag_HC),
)
print(f"Number of mismatch: {mismatch_.size}")

# Cohort breakdown of the mismatched participants; the trailing semicolon hides the output.
status.loc[status["PATNO"].isin(mismatch_)].groupby("COHORT")["PATNO"].nunique();

Number of mismatch: 193


In [15]:
# Diagnosis breakdown of the mismatched participants; the trailing semicolon hides the output.
prim_diag.loc[prim_diag["PATNO"].isin(mismatch_)].groupby("PRIMDIAG")["PATNO"].nunique();

## Results

In [16]:
# Cross-tabulate cohort against primary diagnosis for the mismatched participants:
# join both tables on PATNO, then count distinct participants per (COHORT, PRIMDIAG) pair.
# The join brings in every diagnosis of each mismatched participant.
(
    status.loc[status["PATNO"].isin(mismatch_)]
    .merge(prim_diag.loc[prim_diag["PATNO"].isin(mismatch_)], on="PATNO")
    .drop_duplicates(subset=["PATNO", "COHORT", "PRIMDIAG"])
    .groupby(["COHORT", "PRIMDIAG"])["PATNO"]
    .nunique()
    .sort_values(ascending=False)
)

COHORT  PRIMDIAG
4       17          172
        25           65
        24           47
        1            31
        97           28
        23           27
        7            22
3       17           16
        1            16
1       17            5
3       97            5
1       1             4
3       15            3
4       14            2
        15            2
        12            2
        11            2
        16            1
1       25            1
        15            1
        24            1
3       7             1
1       97            1
Name: PATNO, dtype: int64