# Report inclusion/exclusion numbers

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

In [None]:
# Excel workbook of MIMIC notes with labels; loaded further down with pd.read_excel.
mimic_notes_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/mimic_data/combined_notes_labels_v2.xlsx'
# CSV holding the outcome table; loaded further down with pd.read_csv.
outcomes_path = '/Users/jk1/temp/mimic/extraction/outcome_df.csv'

In [None]:
# Information used to create a database connection
# NOTE(review): user and password are hard-coded here; fine for a local
# throwaway instance, but consider reading them from the environment.
sqluser, sqlpassword = 'postgres', 'postgres'
dbname, schema_name = 'mimic', 'mimiciii'

# Connect to the database server on localhost, port 5000.
con = psycopg2.connect(
    dbname=dbname,
    user=sqluser,
    password=sqlpassword,
    port=5000,
    host='localhost',
)

# Prepended to every query so that unqualified table names resolve
# inside the chosen schema.
query_schema = f'set search_path to {schema_name};'

In [None]:
# The patient-selection SQL lives in its own file; read its full text so it
# can be combined with the schema prefix and a final SELECT.
selection_query_path = './data_extraction/patient_selection_query.sql'
with open(selection_query_path) as fp:
    patient_selection_query = fp.read()

In [None]:
# Final SELECT: the three identifiers plus every exclusion_* column exposed
# by the `selection` relation.
exclusion_columns_select = """
            SELECT selection.subject_id, selection.hadm_id, selection.icustay_id,
                    selection.exclusion_los, selection.exclusion_age, selection.exclusion_first_stay,
                    selection.exclusion_discharge_diagnosis, selection.exclusion_non_urgent, selection.exclusion_admission_diagnosis
            FROM selection
            """

# Full statement = schema prefix + selection query text + final SELECT.
query = ''.join((query_schema, patient_selection_query, exclusion_columns_select))

In [None]:
# Execute the assembled statement on the open connection and keep the
# result set as a DataFrame.
included_patients_df = pd.read_sql_query(sql=query, con=con)

## Extract exclusion criteria

In [None]:
# Overall report: total number of distinct admissions (hadm_id), then for each
# exclusion_* column the number and percentage of admissions that have no row
# where that column equals 0.
n_records = included_patients_df.hadm_id.nunique()
all_hadm_ids = set(included_patients_df.hadm_id.unique())
print('{:20s} {:5d}'.format('Number of Patient records', n_records))

for col in included_patients_df.columns:
    if "exclusion_" not in col:
        continue
    # Admissions with at least one row where this column is 0.
    ids_with_zero = set(included_patients_df.loc[included_patients_df[col] == 0, 'hadm_id'].unique())
    n_excluded = len(all_hadm_ids - ids_with_zero)
    print('{:20s} {:5d} ({:2.2f}%)'.format(col, n_excluded, n_excluded * 100.0 / n_records))
In [None]:
# Rows where all three diagnosis/urgency exclusion columns equal 0; an
# admission is counted as "not admitted for acute ischemic stroke" when none
# of its rows satisfies this.
ais_row_mask = (
    (included_patients_df["exclusion_discharge_diagnosis"] == 0)
    & (included_patients_df["exclusion_admission_diagnosis"] == 0)
    & (included_patients_df["exclusion_non_urgent"] == 0)
)
ais_hadm_ids = set(included_patients_df[ais_row_mask].hadm_id.unique())
non_ais_hadm_ids = set(included_patients_df.hadm_id.unique()) - ais_hadm_ids
print(f'Not admitted for acute ischemic stroke: {len(non_ais_hadm_ids)}')

Within patients admitted for acute ischemic stroke:

In [None]:
# Restrict to admissions having at least one row where all three
# diagnosis/urgency exclusion columns equal 0, then repeat the per-column
# exclusion report with that subset as the denominator.
ais_row_mask = (
    (included_patients_df["exclusion_discharge_diagnosis"] == 0)
    & (included_patients_df["exclusion_admission_diagnosis"] == 0)
    & (included_patients_df["exclusion_non_urgent"] == 0)
)
ais_hadm_ids = set(included_patients_df[ais_row_mask].hadm_id.unique())
n_patients_with_ais = len(ais_hadm_ids)

print('{:20s} {:5d}'.format('Number of Patient records admitted for acute ischemic stroke', n_patients_with_ais))

for col in included_patients_df.columns:
    if "exclusion_" not in col:
        continue
    # Admissions (from the whole table) with at least one row where this column is 0.
    ids_with_zero = set(included_patients_df[included_patients_df[col] == 0].hadm_id.unique())
    n_excluded = len(ais_hadm_ids - ids_with_zero)
    print('{:20s} {:5d} ({:2.2f}%)'.format(col, n_excluded, n_excluded * 100.0 / n_patients_with_ais))

## Retained patients after primary exclusion

In [None]:
# Load the outcome table from the CSV at outcomes_path.
outcome_df = pd.read_csv(filepath_or_buffer=outcomes_path)

In [None]:
# Count distinct hadm_id values present in the outcome table.
n_after_primary_exclusion = outcome_df.hadm_id.nunique()
print('Number of patients after primary exclusion:', n_after_primary_exclusion)

## Retained with detailed note
- admission note
- discharge note with detailed neuro admission exam

In [None]:
# Load the labelled notes workbook from mimic_notes_path.
mimic_notes_df = pd.read_excel(io=mimic_notes_path)

In [None]:
# Count distinct hadm_id values present in the notes table.
n_with_detailed_notes = mimic_notes_df.hadm_id.nunique()
print('Number of patients with detailed notes:', n_with_detailed_notes)