In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd

In [2]:
# Input CSVs for the imaging-extraction verification.
# NOTE(review): both are absolute paths on one machine; adjust them before
# running this notebook anywhere else.
# Extraction-state export (which patients were already extracted).
extraction_state_path = '/Users/jk1/temp/opsum_end/imaging_extraction/already_extracted_012026.csv'
# Target list of patients whose imaging should be extracted.
target_path = '/Users/jk1/temp/opsum_end/imaging_extraction/imaging_extraction_target.csv'

In [3]:
# Load both tables with pandas' default parsing: the extraction target list
# first, then the extraction-state export.
extraction_target_df, extraction_state_df = (
    pd.read_csv(csv_file) for csv_file in (target_path, extraction_state_path)
)

In [4]:
# (rows, columns) of the target table followed by the extraction-state table.
tuple(frame.shape for frame in (extraction_target_df, extraction_state_df))

((2642, 9), (2283, 2))

In [5]:
# Report how many distinct patient ids each table contains
# (nunique() does not count missing values).
n_target_ids = extraction_target_df["patient_id"].nunique()
n_state_ids = extraction_state_df["patient_id"].nunique()
print(f'Number of unique patient IDs in extraction target: {n_target_ids}')
print(f'Number of unique patient IDs in extraction state: {n_state_ids}')

Number of unique patient IDs in extraction target: 2346
Number of unique patient IDs in extraction state: 2283


In [6]:
# Patients on the target list that have no row in the extraction state.
# Ids are compared as strings so that a column parsed as numeric on one side
# still matches its text form on the other.
# NaN ids are dropped before the cast: astype(str) would turn them into the
# literal 'nan', which would then be counted as a (bogus) missing patient,
# while nunique() -- used for the totals -- ignores NaN.
target_ids = set(extraction_target_df['patient_id'].dropna().astype(str))
extracted_ids = set(extraction_state_df['patient_id'].dropna().astype(str))
missing_patient_ids = target_ids - extracted_ids
n_missing = len(missing_patient_ids)
print(f'Number of missing patient IDs: {n_missing}')

Number of missing patient IDs: 70


In [7]:
# Ids present in the extraction state but absent from the target list.
# Both sides are compared as strings.
state_id_str = extraction_state_df['patient_id'].astype(str)
target_id_pool = set(extraction_target_df['patient_id'].astype(str))
supplemental_patient_ids = set(state_id_str) - target_id_pool
n_supplemental = len(supplemental_patient_ids)
print(f'Number of supplemental patient IDs: {n_supplemental}')
print('Supplemental patient IDs:', supplemental_patient_ids)

# Keep only the extraction-state rows whose id is on the target list.
# NOTE: this rebinds extraction_state_df, so re-running the cell without
# reloading the CSVs will find no supplemental ids the second time.
extraction_state_df = extraction_state_df[~state_id_str.isin(supplemental_patient_ids)]


Number of supplemental patient IDs: 7
Supplemental patient IDs: {'MASTER-HUG-007', 'MASTER-HUG-020', 'MASTER-HUG-018', 'MASTER-HUG-019', 'MASTER-HUG-002-v2', '2025.12.23-13:58:23-STD-1.3.12.2.1107.5.99.3', 'MASTER-HUG-012'}


In [8]:
# Check whether any patient id occurs more than once in the extraction state.
# duplicate_counts: rows per patient id; duplicates: only the ids seen more than once.
duplicate_counts = extraction_state_df['patient_id'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts.gt(1)]
print(f'Number of duplicate patient IDs in extraction state: {len(duplicates)}')
# Show the repeated rows themselves (each occurrence after the first).
extraction_state_df.loc[extraction_state_df['patient_id'].duplicated()]

Number of duplicate patient IDs in extraction state: 0


Unnamed: 0,patient_id,RAPID_found


In [9]:
# Number of extraction-state rows per RAPID_found value.
extraction_state_df.loc[:, 'RAPID_found'].value_counts()

RAPID_found
1    1470
0     806
Name: count, dtype: int64

In [16]:
# Write a one-column verification report (metric -> value) next to the
# extraction-state CSV.
# This cell reads n_missing, n_supplemental, duplicates, missing_patient_ids and
# supplemental_patient_ids from the cells above, so run the notebook top to
# bottom; the extraction_state_* figures describe extraction_state_df as it is
# at this point in the notebook.

# compute summary metrics
unique_target = extraction_target_df['patient_id'].nunique()
unique_extracted = extraction_state_df['patient_id'].nunique()
# Fraction (0-1) of target patients without an extraction; 0 for an empty target.
missing_pct = n_missing / unique_target if unique_target else 0
# Sorted by flag value so the 0/1 lookups below do not depend on frequency order.
rapid_counts = extraction_state_df['RAPID_found'].value_counts().sort_index()

summary = {
    'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M'),
    'extraction_target_rows': extraction_target_df.shape[0],
    'extraction_target_cols': extraction_target_df.shape[1],
    'extraction_state_rows': extraction_state_df.shape[0],
    'extraction_state_cols': extraction_state_df.shape[1],
    'unique_patients_target': unique_target,
    'unique_patients_extracted': unique_extracted,
    'missing_patients_count': n_missing,
    # Reported as a real percentage (0-100) to match the "pct" label;
    # previously the raw 0-1 fraction was written under this key.
    'missing_patients_pct': round(100 * missing_pct, 2),
    'supplemental_patients_removed': n_supplemental,
    'duplicate_patients_in_extracted': len(duplicates),
    'rapid_found_0': int(rapid_counts.get(0, 0)),
    'rapid_found_1': int(rapid_counts.get(1, 0)),
    # Id lists are sorted so the report is stable from run to run.
    'missing_patient_ids': ';'.join(sorted(map(str, missing_patient_ids))),
    'supplemental_patient_ids': ';'.join(sorted(map(str, supplemental_patient_ids))),
}

report_dir = Path(extraction_state_path).parent
csv_path = report_dir / 'verification_report.csv'

# One row per metric with an explicit "metric,value" header (instead of the
# anonymous ",0" header produced by transposing a single-row DataFrame).
pd.Series(summary, name='value').to_csv(csv_path, index_label='metric')
print(f'Report saved to {csv_path}')


Report saved to /Users/jk1/temp/opsum_end/imaging_extraction/verification_report.csv
