# 04_summary

In [None]:
# 1) total number of patients in the cohort

In [None]:
import os
import sys
import json
import pathlib
sys.path.append("..")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import timedelta
import traceback

# Resolve the working directory and its parent, then read the JSON settings:
# `config.json` sits in the parent directory, `demographics.json` next to
# this notebook.
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent
cfg = json.loads((parent_dir / "config.json").read_text())
dg_cfg = json.loads((current_dir / "demographics.json").read_text())

In [None]:
import pathlib
# All inputs and outputs of this notebook live under ./result/psm.
result_dir = current_dir / "result" / "psm"
# Patient counts as parsed JSON (used below to build `n_patients_df`).
summary = json.loads((result_dir / "number_of_patients.json").read_text())

In [None]:
# Take the cohort-wide count out of `summary` so that every remaining entry
# becomes one row of the table. The membership check keeps the cell safe to
# re-run after the key has already been popped.
if "n_total_patients" in summary:
    n_total_patients = summary.pop("n_total_patients")

# One row per remaining `summary` key; the JSON keys "False"/"True" are
# relabelled as the control/case columns.
n_patients_df = pd.DataFrame.from_dict(summary, orient="index").rename(
    columns={"False": "control", "True": "case"}
)
# Append a "total" row holding the column-wise sums.
n_patients_df.loc["total"] = n_patients_df.sum()
n_patients_df

In [None]:
# Load the condition-occurrence demographics and build one DataFrame per
# condition (rows: demographic keys, columns: control/case).
with open(result_dir / "demographics_for_condition_occurrence.json") as file:
    demographics_condition = json.load(file)

df_dict = {}
for condition, per_demographic in demographics_condition.items():
    cond_df = pd.DataFrame.from_dict(per_demographic, orient="index")
    cond_df = cond_df.rename(columns={"False": "control", "True": "case"})
    df_dict[condition] = cond_df
df_dict

In [None]:
# Stack the per-condition frames into one long table with explicit
# `condition` and `demographic` columns.
concat_df = (
    pd.concat(df_dict, axis=0)
    .reset_index()
    .rename(columns={"level_0": "condition", "level_1": "demographic"})
)

# `n_patients_df` is looked up by the lower-cased condition name; compute the
# lookup keys once and reuse them for both groups.
condition_keys = concat_df["condition"].apply(str.lower)
concat_df["n_case"] = n_patients_df.loc[condition_keys, "case"].values
concat_df["n_control"] = n_patients_df.loc[condition_keys, "control"].values

# Percentage of each group's patients falling into the demographic row.
for group in ("case", "control"):
    concat_df[f"{group}_rate"] = concat_df[group] / concat_df[f"n_{group}"] * 100
concat_df.head()

In [None]:
# Collapse the per-condition rows into one row per demographic.
# Only the count columns are summed: on pandas >= 2.0 a bare `.sum()` would
# also concatenate the string `condition` column, and per-condition
# percentages are not additive, so the rates are recomputed from the pooled
# counts instead.
count_columns = [
    column
    for column in concat_df.columns
    if column in ("case", "control", "n_case", "n_control")
]
concat_df = concat_df.groupby("demographic")[count_columns].sum()

# Pooled percentages, rounded to two decimals for reporting.
concat_df['case_rate'] = (concat_df['case'] / concat_df['n_case'] * 100).round(2)
concat_df['control_rate'] = (concat_df['control'] / concat_df['n_control'] * 100).round(2)

# Display strings of the form "<count> (<rate>%)".
concat_df['str_control'] = concat_df['control'].astype(str) + " (" + concat_df['control_rate'].astype(str) + "%)"
concat_df['str_case'] = concat_df['case'].astype(str) + " (" + concat_df['case_rate'].astype(str) + "%)"

concat_df.to_csv(result_dir.joinpath("demographics_for_condition_occurrence.csv"))
concat_df

In [None]:
# Load the measurement demographics and build one DataFrame per measurement
# (rows: demographic keys, columns: control/case).
with open(result_dir / "demographics_for_measurement.json") as file:
    demographics_measurement = json.load(file)

df_dict = {}
for measurement, per_demographic in demographics_measurement.items():
    meas_df = pd.DataFrame.from_dict(per_demographic, orient="index")
    meas_df = meas_df.rename(columns={"False": "control", "True": "case"})
    df_dict[measurement] = meas_df
df_dict

In [None]:
def _without_nan(values):
    """Return `values` as a float32 array with NaN entries removed."""
    as_float = np.array(values, dtype=np.float32)
    return as_float[~np.isnan(as_float)]


def _nan_free_mean(values):
    """Mean of the non-NaN entries of `values`."""
    return _without_nan(values).mean()


def _nan_free_std(values):
    """Standard deviation of the non-NaN entries of `values`."""
    return _without_nan(values).std()


def _mean_pm_std(row, group):
    """Format the row's `<group>_mean` and `<group>_std` as "mean ± std" (2 decimals)."""
    return str(round(row[group + "_mean"], 2)) + " ± " + str(round(row[group + "_std"], 2))


# Long table: one row per (measurement, demographic) pair.
concat_df = (
    pd.concat(df_dict, axis=0)
    .reset_index()
    .rename(columns={"level_0": "measurement", "level_1": "demographic"})
)

# Per-row summary statistics of the case and control value collections.
concat_df['case_mean'] = concat_df['case'].apply(_nan_free_mean)
concat_df['case_std'] = concat_df['case'].apply(_nan_free_std)
concat_df['control_std'] = concat_df['control'].apply(_nan_free_std)
concat_df['control_mean'] = concat_df['control'].apply(_nan_free_mean)

concat_df['str_case'] = concat_df.apply(lambda row: _mean_pm_std(row, "case"), axis=1)
concat_df['str_control'] = concat_df.apply(lambda row: _mean_pm_std(row, "control"), axis=1)

# Show the formatted strings with measurements as columns.
display_columns = ['measurement', 'demographic', 'str_case', 'str_control']
concat_df[display_columns].pivot(index='measurement', columns='demographic', values=['str_case', 'str_control']).T

In [None]:
def _without_nan(values):
    """Return `values` as a float32 array with NaN entries removed."""
    as_float = np.array(values, dtype=np.float32)
    return as_float[~np.isnan(as_float)]


def _mean_pm_std(row, group):
    """Format the row's `<group>_mean` and `<group>_std` as "mean ± std" (2 decimals)."""
    return str(round(row[group + "_mean"], 2)) + " ± " + str(round(row[group + "_std"], 2))


# Long table: one row per (measurement, demographic) pair, built from the
# measurement frames in `df_dict`.
concat_df = pd.concat(df_dict, axis=0)
concat_df.reset_index(inplace=True)
concat_df.rename(columns={"level_0": "measurement", "level_1": "demographic"}, inplace=True)

# Extend the case/control lists across measurements for each demographic.
# The two value columns are selected explicitly so the result does not depend
# on whether `groupby.apply` passes the grouping/label columns to the function
# (deprecated in pandas 2.2), which the former `drop(columns=...)` relied on.
concat_df = concat_df.groupby('demographic')[['case', 'control']].apply(lambda group: group.sum())

# Summary statistics of the pooled values, ignoring NaN entries.
concat_df['case_mean'] = concat_df['case'].apply(lambda values: _without_nan(values).mean())
concat_df['case_std'] = concat_df['case'].apply(lambda values: _without_nan(values).std())
concat_df['control_std'] = concat_df['control'].apply(lambda values: _without_nan(values).std())
concat_df['control_mean'] = concat_df['control'].apply(lambda values: _without_nan(values).mean())

concat_df['str_case'] = concat_df.apply(lambda row: _mean_pm_std(row, "case"), axis=1)
concat_df['str_control'] = concat_df.apply(lambda row: _mean_pm_std(row, "control"), axis=1)

concat_df.to_csv(result_dir.joinpath("demographics_for_measurement.csv"))
concat_df

In [None]:
# Only the "patient_info" section of the file is used from here on.
with open(result_dir / "demographics_for_patient_info.json") as file:
    demographics_patient_info = json.load(file)["patient_info"]

In [None]:
# Build one frame per `patient_info` entry, tag its rows with the entry key in
# a "drug" column, and stack them. The frames are collected first and
# concatenated once instead of growing `concat_df` inside the loop.
frames = []
for key in demographics_patient_info:
    df = pd.DataFrame.from_dict(demographics_patient_info[key], orient="index")
    df.rename(columns={"False": "control", "True": "case"}, inplace=True)
    df["drug"] = key
    frames.append(df)

# `pd.concat` raises on an empty list, so fall back to an empty frame.
concat_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
def _without_nan(values):
    """Return `values` as a float32 array with NaN entries removed."""
    as_float = np.array(values, dtype=np.float32)
    return as_float[~np.isnan(as_float)]


def calc_mean(x):
    """Mean of the non-NaN entries of `x`."""
    return _without_nan(x).mean()


def calc_std(x):
    """Standard deviation of the non-NaN entries of `x`."""
    return _without_nan(x).std()


# Keep only the rows whose index label contains "age".
age_df = concat_df.loc[concat_df.index.str.contains("age")].copy()

# Mean and standard deviation per group.
for group in ("case", "control"):
    age_df[f"{group}_mean"] = age_df[group].apply(calc_mean)
    age_df[f"{group}_std"] = age_df[group].apply(calc_std)

# "mean ± std" display strings, rounded to two decimals.
for group in ("case", "control"):
    age_df[f"str_{group}"] = age_df.apply(
        lambda row, group=group: str(round(row[f"{group}_mean"], 2)) + " ± " + str(round(row[f"{group}_std"], 2)),
        axis=1,
    )
age_df.loc[:, ["drug", "str_case", "str_control"]]

In [None]:
def _m_f_counts(genders):
    """Return "<#M> / <#F>" using `genders.count` for the "M" and "F" labels."""
    return str(genders.count("M")) + " / " + str(genders.count("F"))


# Keep only the rows whose index label contains "gender".
gender_df = concat_df.loc[concat_df.index.str.contains("gender")].copy()
gender_df["str_case"] = gender_df["case"].apply(_m_f_counts)
gender_df["str_control"] = gender_df["control"].apply(_m_f_counts)
gender_df.loc[:, ["drug", "str_case", "str_control"]]