In [1]:
import os, hashlib, re
import pandas as pd
from unidecode import unidecode
import pydicom
from tqdm import tqdm

In [2]:
# Root folder scanned by the cells below: each immediate sub-directory is
# treated as one subject, and the resulting key spreadsheet is written here.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
main_folder_path = '/Users/jk1/stroke_datasets/2018_data/extracted_with_RAPID_part1bis'

In [3]:
def flatten_string(string):
    """Normalise a value into a compact upper-case string.

    The value is converted with ``str()``, every comma and hyphen is
    removed, the remainder is passed through ``unidecode`` and the result
    is upper-cased.
    """
    # Deleting the separator characters is equivalent to splitting on them
    # and gluing the pieces back together with nothing in between.
    without_separators = re.sub(r'[,-]', '', str(string))
    transliterated = unidecode(without_separators)
    return transliterated.upper()

In [4]:
# Each immediate sub-directory of the data folder is treated as one subject;
# its name is expected to equal the pseudonymous id recomputed below.
subjects = [o for o in os.listdir(main_folder_path)
            if os.path.isdir(os.path.join(main_folder_path, o))]

info_df_columns = ['pid', 'hospital_id', 'first_name', 'last_name', 'dob', 'rapid_imaging_date']

# Rows are accumulated in a plain list and converted to a DataFrame once after
# the loop: DataFrame.append was removed in pandas 2.0 and, where it existed,
# copied the whole frame on every iteration.
info_rows = []

# tqdm wraps the list itself (not an enumerate() generator) so that it knows
# the total number of subjects and can render an actual progress bar.
for subject in tqdm(subjects):
    # Patient metadata is read from the header of the first .dcm file listed
    # in the subject's pCT/Tmax_<subject> folder.
    subject_tmax_path = os.path.join(main_folder_path, subject, 'pCT', f'Tmax_{subject}')
    img0 = [o for o in os.listdir(subject_tmax_path) if o.endswith('.dcm')][0]
    dcm = pydicom.dcmread(os.path.join(subject_tmax_path, img0))

    # PatientName is split on '^'; the first component is used as the last
    # name and the second as the first name. A name with fewer than two
    # components raises IndexError here.
    name_components = str(dcm.PatientName).split('^')
    last_name = unidecode(name_components[0].upper())
    first_name = unidecode(name_components[1].upper())

    patient_birth_date = dcm.PatientBirthDate

    # The pseudonymous id is 'subj-' followed by the first 8 hex characters of
    # the SHA-256 digest of "LAST^FIRST^birthdate".
    patient_identifier = last_name + '^' + first_name + '^' + patient_birth_date
    ID = hashlib.sha256(patient_identifier.encode('utf-8')).hexdigest()[:8]
    pid = 'subj-' + ID

    # Consistency check: the folder name must match the id derived from the
    # DICOM header, otherwise the key table would map to the wrong patient.
    assert subject == pid, 'subject id is not matching'

    hospitalPatientId = dcm.PatientID
    rapid_imaging_date = dcm.StudyDate

    info_rows.append([pid, hospitalPatientId, first_name, last_name,
                      patient_birth_date, rapid_imaging_date])

# One row per subject, in the order the subject folders were listed.
info_df = pd.DataFrame(info_rows, columns=info_df_columns)


13it [00:00, 286.53it/s]


In [5]:
# Save the id key table (pseudonymous id alongside hospital id, names, birth
# date and imaging date) as a spreadsheet inside the main data folder.
key_file_path = os.path.join(main_folder_path, 'patient_ids_key.xlsx')
info_df.to_excel(key_file_path)