In [None]:
import pandas as pd

In [None]:
# Inputs for the target-vs-state comparison: the table of patients to extract and
# the snapshot of what has been extracted so far. The verification report and the
# refined target are later written next to `extraction_state_path`.
# NOTE(review): these are absolute, machine-specific paths - adjust before running elsewhere.
target_path = '/Users/jk1/temp/opsum_end/imaging_extraction/imaging_extraction_target.csv'
extraction_state_path = '/Users/jk1/temp/opsum_end/imaging_extraction/already_extracted_012026.csv'

# Inputs for the registry comparison: the stroke registry workbook and the patient selection.
registry_path = '/Users/jk1/stroke_datasets/stroke_registry_post_hoc_modified.xlsx'
patient_selection_path = '/Users/jk1/temp/opsum_end/gsu_extraction_09052025_204357/high_frequency_data_patient_selection_with_details.csv'

# Additional missing cases that get merged into the refined extraction target at the end.
missing_from_comparison_with_manual_data_path = '/Users/jk1/temp/opsum_end/imaging_extraction/test_extraction/missing_from_extractions.csv'

Compare extraction state with extraction target

In [None]:
# Load the extraction target and the current extraction state.
# patient_id is read as text: every comparison below casts it with astype(str),
# and letting pandas infer a numeric dtype would drop leading zeros (or yield
# '123.0' when the column contains missing values), which breaks matching against
# the registry IDs that are sliced out of the 'Case ID' string.
extraction_target_df = pd.read_csv(target_path, dtype={'patient_id': str})
extraction_state_df = pd.read_csv(extraction_state_path, dtype={'patient_id': str})

In [None]:
# (rows, columns) of the target table and of the extraction-state table.
extraction_target_df.shape, extraction_state_df.shape

In [None]:
# Distinct patients per table (rows may repeat a patient, so this can differ from the row count)
n_target_patient_ids = extraction_target_df["patient_id"].nunique()
n_state_patient_ids = extraction_state_df["patient_id"].nunique()
print(f'Number of unique patient IDs in extraction target: {n_target_patient_ids}')
print(f'Number of unique patient IDs in extraction state: {n_state_patient_ids}')

In [None]:
# Patients that are in the target but have no row in the extraction state.
# IDs are compared as strings so both tables use the same representation.
target_ids_as_str = set(extraction_target_df['patient_id'].astype(str))
state_ids_as_str = set(extraction_state_df['patient_id'].astype(str))
missing_patient_ids = target_ids_as_str.difference(state_ids_as_str)
n_missing = len(missing_patient_ids)
print(f'Number of missing patient IDs: {n_missing}')

In [None]:
# Patients that were extracted although they are not part of the target.
state_patient_ids_str = extraction_state_df['patient_id'].astype(str)
target_patient_ids_str = extraction_target_df['patient_id'].astype(str)
supplemental_patient_ids = set(state_patient_ids_str) - set(target_patient_ids_str)
n_supplemental = len(supplemental_patient_ids)
print(f'Number of supplemental patient IDs: {n_supplemental}')
print('Supplemental patient IDs:', supplemental_patient_ids)

# Keep only extraction-state rows whose patient belongs to the target.
# NOTE: extraction_state_df is overwritten here, so re-running this cell on the
# already-filtered frame reports 0 supplemental patients.
is_supplemental_row = state_patient_ids_str.isin(supplemental_patient_ids)
extraction_state_df = extraction_state_df[~is_supplemental_row]


In [None]:
# How many patient IDs occur on more than one extraction-state row
duplicate_counts = extraction_state_df['patient_id'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts > 1]
print(f'Number of duplicate patient IDs in extraction state: {len(duplicates)}')

# Show the repeated rows (the first occurrence of each ID is not flagged by duplicated())
is_repeated_row = extraction_state_df['patient_id'].duplicated()
extraction_state_df[is_repeated_row]

In [None]:
# Number of extraction-state rows per RAPID_found value.
extraction_state_df['RAPID_found'].value_counts()

Compare extraction state with stroke registry

In [None]:
# Load the stroke registry workbook and the patient selection.
stroke_registry_df = pd.read_excel(registry_path)
# Read the selection as text, consistent with how it is re-read further down:
# with inferred dtypes, numeric-looking ID columns lose their leading zeros and
# the string operations used to build case_admission_id fail on them.
patient_selection_df = pd.read_csv(patient_selection_path, dtype=str)

In [None]:
def create_registry_case_identification_column(df):
    """Build the per-admission identifier ``<patient_id>_<EDS_last_4_digits>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain either the columns ``patient_id`` and ``EDS_last_4_digits``
        or a string column ``Case ID`` from which the missing ones are derived
        (``patient_id`` = characters 8 up to the last four, ``EDS_last_4_digits``
        = the last four characters). The frame is not modified.

    Returns
    -------
    pandas.Series
        One string per row, aligned with ``df.index``; the EDS part is
        left-padded with zeros to four characters.
    """
    # Work on a copy so the derived helper columns never leak into the caller's frame
    df = df.copy()
    if 'patient_id' not in df.columns:
        # NOTE(review): assumes a fixed 8-character prefix in 'Case ID' - confirm against the registry format
        df['patient_id'] = df['Case ID'].apply(lambda case_id: case_id[8:-4]).astype(str)
    if 'EDS_last_4_digits' not in df.columns:
        df['EDS_last_4_digits'] = df['Case ID'].apply(lambda case_id: case_id[-4:]).astype(str)

    # Cast to str BEFORE using the .str accessor: a numeric EDS column (e.g. a CSV
    # read without dtype=str) would otherwise raise AttributeError. Missing values
    # are masked out again so they are not turned into a zero-padded 'nan'.
    eds_raw = df['EDS_last_4_digits']
    eds_padded = eds_raw.astype(str).str.zfill(4).where(eds_raw.notna())

    case_identification_column = df['patient_id'].astype(str) + '_' + eds_padded.astype(str)
    return case_identification_column

In [None]:
# Split the registry 'Case ID' into its patient and EDS parts, then build the admission key
registry_case_ids = stroke_registry_df['Case ID']
stroke_registry_df['patient_id'] = registry_case_ids.apply(lambda case_id: case_id[8:-4])
stroke_registry_df['EDS_last_4_digits'] = registry_case_ids.apply(lambda case_id: case_id[-4:])
stroke_registry_df['case_admission_id'] = create_registry_case_identification_column(stroke_registry_df)

# Re-read the patient selection with every column as text, then build the same admission key
patient_selection_df = pd.read_csv(patient_selection_path, dtype=str)
patient_selection_df['case_admission_id'] = create_registry_case_identification_column(patient_selection_df)

# Registry rows whose admission key appears in the patient selection
is_selected_case = stroke_registry_df['case_admission_id'].isin(patient_selection_df['case_admission_id'])
restricted_to_selection_registry_df = stroke_registry_df[is_selected_case]

In [None]:
# Selected registry cases whose acute perfusion imaging type mentions 'Perfusion CT'
# (rows without an imaging type are treated as non-matching)
has_perfusion_ct = restricted_to_selection_registry_df['Acute perf. imaging type'].str.contains(
    'Perfusion CT', na=False
)
selected_with_ctp = restricted_to_selection_registry_df[has_perfusion_ct]
len(selected_with_ctp)

In [None]:
# Preview the selected registry cases that have a 'Perfusion CT' imaging entry.
selected_with_ctp.head()

In [None]:
# Preview the extraction-state table (patient_id and RAPID_found are used in the next comparison).
extraction_state_df.head()

In [None]:
# Misclassified patients: registry says CTP was performed, but the extraction state
# has no row with RAPID_found == 1 for that patient_id
rapid_found_patient_ids = extraction_state_df.loc[
    extraction_state_df['RAPID_found'] == 1, 'patient_id'
].astype(str)
has_rapid_extraction = selected_with_ctp['patient_id'].astype(str).isin(rapid_found_patient_ids)
missing_from_extraction_state = selected_with_ctp[~has_rapid_extraction]

In [None]:
# Preview the registry CTP cases that have no RAPID_found == 1 row in the extraction state.
missing_from_extraction_state.head()

In [None]:
# Keep the row count of the missing cases; it is written to the verification report later
n_missing_from_extraction_state = len(missing_from_extraction_state)
print(
    f'Number of patients missing from extraction state but with CTP in registry: {n_missing_from_extraction_state}'
)


Save results

In [None]:
from pathlib import Path
from datetime import datetime

# Table sizes and distinct-patient counts
target_rows, target_cols = extraction_target_df.shape
state_rows, state_cols = extraction_state_df.shape
unique_target = extraction_target_df['patient_id'].nunique()
unique_extracted = extraction_state_df['patient_id'].nunique()

# Share of target patients without an extraction-state row, as a fraction
# (falls back to 0 when the target is empty to avoid dividing by zero)
if unique_target:
    missing_pct = n_missing / unique_target
else:
    missing_pct = 0

# Rows per RAPID_found value, ordered by the flag value
rapid_counts = extraction_state_df['RAPID_found'].value_counts().sort_index()

# ID lists are stored as ';'-separated, sorted strings so they fit into one CSV field
missing_ids_joined = ';'.join(sorted(str(pid) for pid in missing_patient_ids))
supplemental_ids_joined = ';'.join(sorted(str(pid) for pid in supplemental_patient_ids))

summary = {
    'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M'),
    'extraction_target_rows': target_rows,
    'extraction_target_cols': target_cols,
    'extraction_state_rows': state_rows,
    'extraction_state_cols': state_cols,
    'unique_patients_target': unique_target,
    'unique_patients_extracted': unique_extracted,
    'missing_patients_count': n_missing,
    'missing_patients_pct': round(missing_pct, 4),
    'supplemental_patients_removed': n_supplemental,
    'duplicate_patients_in_extracted': len(duplicates),
    'rapid_found_0': int(rapid_counts.get(0, 0)),
    'rapid_found_1': int(rapid_counts.get(1, 0)),
    'missing_patient_ids': missing_ids_joined,
    'supplemental_patient_ids': supplemental_ids_joined,
    'missing_from_extraction_state_count_but_with_ctp_in_registry': n_missing_from_extraction_state,
}

# The report is written next to the extraction-state file, one metric per row
report_dir = Path(extraction_state_path).parent
csv_path = report_dir / 'verification_report.csv'

report_df = pd.DataFrame([summary]).T
report_df.to_csv(csv_path, index=True)
print(f'Report saved to {csv_path}')

In [None]:
# Build the refined extraction target: the registry CTP cases that are missing from
# the extraction state, reduced to the identifying and timing columns listed below
refined_target_columns = [
    'Case ID',
    'case_admission_id',
    'patient_id',
    'EDS_last_4_digits',
    'DOB',
    'Arrival at hospital',
    'Arrival time',
    '1st brain imaging date',
    '1st brain imaging time',
]
refined_extraction_target_df = missing_from_extraction_state[refined_target_columns]



In [None]:
# Load the additional missing cases. Every column is read as text, matching how the
# patient selection CSV is read (dtype=str): with inferred dtypes, numeric-looking ID
# columns would lose their zero-padding and become mixed-type once concatenated
# with the registry-derived rows.
# NOTE(review): assumes this file carries ID columns such as patient_id /
# EDS_last_4_digits alongside case_admission_id - confirm against the file.
missing_from_comparison_with_manual_data_df = pd.read_csv(
    missing_from_comparison_with_manual_data_path, dtype=str
)
missing_from_comparison_with_manual_data_df.head()

In [None]:
# Stack the registry-derived missing cases on top of the manually identified ones and
# keep one row per case_admission_id (the first occurrence, i.e. the registry-derived row, wins)
combined_missing_df = pd.concat(
    [refined_extraction_target_df, missing_from_comparison_with_manual_data_df],
    ignore_index=True,
).drop_duplicates(subset=['case_admission_id'])

In [None]:
# From here on the refined target is the combined, de-duplicated table.
refined_extraction_target_df = combined_missing_df

In [None]:
# Write the refined target into the report directory, without the DataFrame index.
refined_extraction_target_path = report_dir / 'refined_extraction_target_missing_ctp_patients.csv'
refined_extraction_target_df.to_csv(refined_extraction_target_path, index=False)