In [1]:
import pandas as pd
import numpy as np
import hashlib

In [2]:
# name of excel file in current directory containing phi mapping data
phi_mapping_path = 'phi_mapping-deid example.csv'

In [3]:
# files to be saved: 

# phi mapping data augmented with anonymous ids
phi_mapping_withanon_path = 'CONTAINS_PHI_phi_mapping-deid example with anonymous patient ids.csv'

# Patient-id anonymous patient id lookup table
patient_id_lut_path = 'CONTAINS_PHI_patient_id_lookup_table.csv'

# anonymous patient id - Hashed_PatientID lookup table
hashed_patient_id_lut_path = 'NO_PHI_hashed_patient_id_lookup_table.csv'

In [4]:
phi_mapping = pd.read_csv(phi_mapping_path)
phi_mapping

Unnamed: 0,PatientName,PatientID,Hashed_PatientID,StudyID,Hashed_StudyID,StudyInstanceUID,Hashed_Study_UID
0,NAME_5,5,f1a239968416da4a,193271793.0,0e8d0f624a469310,1.2.840.114350.2.307.2.798268.2.193271793.1,1.2.840.PDA.2
1,NAME_5,5,918c0a054d2d48b2,192912979.0,2594097f16c54782,1.2.840.114350.2.307.2.798268.2.192912979.1,1.2.840.PDA.3
2,NAME_9,9,6bc2832e6102899c,194894315.0,975d6db4c545a647,1.2.840.114350.2.307.2.798268.2.194894315.1,1.2.840.PDA.4
3,NAME_7,7,ba88e355709936db,190796873.0,4404c637d837303d,1.2.840.114350.2.307.2.798268.2.190796873.1,1.2.840.PDA.5
4,NAME_0,0,1,,,,
5,NAME_2,2,2,,,,
6,NAME_5,5,3,,,,
7,NAME_8,8,4,,,,
8,NAME_2,2,5,,,,
9,NAME_7,7,6,,,,


## Create the anonymous Patient ID

In [5]:
# Create a consistent anonymous patient id for each unique value of PatientID
phi_mapping['PatientID_anonymous'] = phi_mapping.PatientID.apply(lambda x: hashlib.sha1(str(x).encode()).hexdigest())
phi_mapping

Unnamed: 0,PatientName,PatientID,Hashed_PatientID,StudyID,Hashed_StudyID,StudyInstanceUID,Hashed_Study_UID,PatientID_anonymous
0,NAME_5,5,f1a239968416da4a,193271793.0,0e8d0f624a469310,1.2.840.114350.2.307.2.798268.2.193271793.1,1.2.840.PDA.2,ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4
1,NAME_5,5,918c0a054d2d48b2,192912979.0,2594097f16c54782,1.2.840.114350.2.307.2.798268.2.192912979.1,1.2.840.PDA.3,ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4
2,NAME_9,9,6bc2832e6102899c,194894315.0,975d6db4c545a647,1.2.840.114350.2.307.2.798268.2.194894315.1,1.2.840.PDA.4,0ade7c2cf97f75d009975f4d720d1fa6c19f4897
3,NAME_7,7,ba88e355709936db,190796873.0,4404c637d837303d,1.2.840.114350.2.307.2.798268.2.190796873.1,1.2.840.PDA.5,902ba3cda1883801594b6e1b452790cc53948fda
4,NAME_0,0,1,,,,,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c
5,NAME_2,2,2,,,,,da4b9237bacccdf19c0760cab7aec4a8359010b0
6,NAME_5,5,3,,,,,ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4
7,NAME_8,8,4,,,,,fe5dbbcea5ce7e2988b8c69bcfdfde8904aabc1f
8,NAME_2,2,5,,,,,da4b9237bacccdf19c0760cab7aec4a8359010b0
9,NAME_7,7,6,,,,,902ba3cda1883801594b6e1b452790cc53948fda


## Validate the PatientID - Anonymous PatientID mapping

In [6]:
# confirm that the PatientID, PatientID_anonymous is one to one
is_121 = False
num_unique_anonymous_ids = phi_mapping.groupby('PatientID').agg({'PatientID_anonymous': 'nunique'}).rename(columns={'PatientID_anonymous': 'num_anon'})
try:
    # 1-1 requires that all original PatientIDs are present
    if not (np.array(num_unique_anonymous_ids.index.values) == phi_mapping.sort_values('PatientID').PatientID.unique()).all():
        raise ValueError("Some Patient Ids are missing from the mapping table")
    
    # 1-1 also requires that each PatientID have exactly 1 anonymous version
    if not (num_unique_anonymous_ids.values==1).all():
        raise ValueError("Some Patient IDs do not have exactly 1 anonymous versions.")
        
    print("Mapping between Patient IDs and Anonymous Patient Ids is one-to-one.")
    is_121 = True

except ValueError as err:
    print("Error: Patient IDs and Anonymous Patient IDs are not one-to-one. If unfixed, this will result in data loss or incorrect grouping of data.")
    print("Error type:", err)
          
num_unique_anonymous_ids

Mapping between Patient IDs and Anonymous Patient Ids is one-to-one.


Unnamed: 0_level_0,num_anon
PatientID,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
5,1
6,1
7,1
8,1
9,1
10,1


## Create needed lookup taables

In [7]:
patient_id_lut = \
    phi_mapping.\
    groupby(['PatientID', 'PatientID_anonymous']).\
    count().\
    reset_index()[['PatientID', 'PatientID_anonymous']]
patient_id_lut

Unnamed: 0,PatientID,PatientID_anonymous
0,0,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c
1,1,356a192b7913b04c54574d18c28d46e6395428ab
2,2,da4b9237bacccdf19c0760cab7aec4a8359010b0
3,3,77de68daecd823babbb58edb1c8e14d7106e83bb
4,5,ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4
5,6,c1dfd96eea8cc2b62785275bca38ac261256e278
6,7,902ba3cda1883801594b6e1b452790cc53948fda
7,8,fe5dbbcea5ce7e2988b8c69bcfdfde8904aabc1f
8,9,0ade7c2cf97f75d009975f4d720d1fa6c19f4897
9,10,b1d5781111d84f7b3fe45a0852e59758cd7a87e5


In [8]:
hashed_patient_id_lut = \
    phi_mapping.\
    groupby(['Hashed_PatientID', 'PatientID_anonymous']).\
    count().\
    reset_index()[['Hashed_PatientID', 'PatientID_anonymous']]
hashed_patient_id_lut

Unnamed: 0,Hashed_PatientID,PatientID_anonymous
0,1,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c
1,10,c1dfd96eea8cc2b62785275bca38ac261256e278
2,11,77de68daecd823babbb58edb1c8e14d7106e83bb
3,12,c1dfd96eea8cc2b62785275bca38ac261256e278
4,13,c1dfd96eea8cc2b62785275bca38ac261256e278
5,14,77de68daecd823babbb58edb1c8e14d7106e83bb
6,15,fe5dbbcea5ce7e2988b8c69bcfdfde8904aabc1f
7,16,b6589fc6ab0dc82cf12099d1c2d40ab994e8410c
8,17,356a192b7913b04c54574d18c28d46e6395428ab
9,18,902ba3cda1883801594b6e1b452790cc53948fda


## Save the data

In [9]:
if is_121:
    # save the augmented phi_mapping table
    phi_mapping.to_csv(phi_mapping_withanon_path, index=None)

    # save the patient id lookup table
    patient_id_lut.to_csv(patient_id_lut_path, index=None)
    
    # save the hashed patient id lookup table
    hashed_patient_id_lut.to_csv(hashed_patient_id_lut_path, index=None)