## SECTION 4: Anonymisation & Ethics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import spacy
from spacy.matcher import PhraseMatcher
import re
from typing import List, Dict, Any, Tuple
import warnings
# Session-wide configuration.
# NOTE(review): this silences *every* warning category for the whole kernel,
# including deprecation and pandas warnings that may point at real problems.
warnings.filterwarnings(action='ignore')
# Remove pandas' display truncation so all columns and all rows are rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
def read_data(path_to_csv_file, delimiter=None):
    """
    Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    path_to_csv_file : str or path-like
        Location of the CSV file, forwarded unchanged to ``pd.read_csv``.
    delimiter : str, optional
        Field separator forwarded to ``pd.read_csv``. ``None`` leaves
        pandas' default separator in effect.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(path_to_csv_file, delimiter=delimiter)

# Load the semicolon-delimited patient export and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
df_patient = read_data(
    path_to_csv_file='/Users/mncedisimncwabe/Downloads/patient_behavior_data.csv',
    delimiter=";",
)
df_patient.head()

Unnamed: 0,patient_id,gender,medication,dose,name,surname,bmi,weight,height,systolic,diastolic,concentration,distractibility,impulsivity,hyperactivity,sleep,mood,appetite,doctor_notes
0,1,Other,Adderall,15mg,Ashley,Merritt,26.8,82.9,1.76,113,88,1,0,-2,0,2,2,0,Needs review of medication due to side effects...
1,2,Other,,,Joseph,Salazar,17.3,52.3,1.74,136,72,-2,1,-2,1,1,2,1,No significant behavioral changes observed. Ye...
2,3,Male,Vyvanse,15mg,Barbara,Roberts,35.5,114.9,1.8,128,77,0,2,-1,-2,2,2,-2,Mood swings noted during follow-up. Both set T...
3,4,Female,,5mg,Dawn,Brown,36.5,91.2,1.58,101,78,0,-2,-1,2,1,0,-2,Increased talkativeness and interrupting behav...
4,5,Female,Vyvanse,30mg,Robert,Warner,17.7,40.4,1.51,111,81,1,0,1,1,2,-1,0,Significant improvement in appetite. Large sig...


#### 1. Mask PII: hash/remove name, surname, patient_id.

In [3]:
def mask_pii(df, salt=""):
    """
    Masks Personally Identifiable Information (PII).

    Returns a copy of ``df`` in which the ``patient_id``, ``name`` and
    ``surname`` columns (whichever are present) are replaced by the
    hexadecimal SHA-256 digest of ``salt + str(value)``. All other columns
    are left untouched and the input DataFrame is not modified.

    Missing values (NaN / None) are kept as missing rather than being
    hashed as the literal string "nan"/"None", so missingness stays
    distinguishable from a real value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that may contain the PII columns listed above.
    salt : str, optional
        Secret string prepended to every value before hashing. The default
        (empty string) yields plain SHA-256 digests, which are trivially
        reversible for low-entropy values such as sequential integer IDs or
        common first names; pass a secret salt and keep it out of the
        notebook whenever the output leaves a trusted environment.

    Returns
    -------
    pandas.DataFrame
        A masked copy of ``df``.
    """
    # Work on a copy so the caller's DataFrame is never mutated
    df_masked = df.copy()
    salt_text = str(salt)

    def hash_value(value):
        # Preserve missing values instead of hashing their string repr
        if pd.isna(value):
            return value
        return hashlib.sha256((salt_text + str(value)).encode()).hexdigest()

    pii_columns = ['patient_id', 'name', 'surname']
    for col in pii_columns:
        # Tolerate frames that only carry a subset of the PII columns
        if col in df_masked.columns:
            df_masked[col] = df_masked[col].apply(hash_value)

    return df_masked

# Hash the identifier columns of the full patient table, then preview them
# next to the (still unredacted) free-text notes.
# NOTE(review): the digests shown are plain SHA-256 of the raw values; for
# sequential integer patient IDs these can be recovered by enumeration.
df_masked = mask_pii(df=df_patient)
df_masked.loc[:, ['patient_id', 'name', 'surname', 'doctor_notes']].head()

Unnamed: 0,patient_id,name,surname,doctor_notes
0,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,b74f836916b6b3d61bed6cb0cc69d43dec124eeb444974...,f93e7a84dedd65652a1cf3f7e6b1235deb72e64c26e317...,Needs review of medication due to side effects...
1,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,78a1a1dae4e56fb0aba67dadc5ab5ab808f505137efede...,0aaf1123652ea930a33216066b006e6817f670405579d8...,No significant behavioral changes observed. Ye...
2,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,d01926adf94597454516e441789dba45148c4360a90c6e...,ce5f866533e35b55a39194c0a36c80cde4d4d9fe3d0c20...,Mood swings noted during follow-up. Both set T...
3,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,0dfd8feb99c0951900053a0b0556faa95c0527314aecce...,18f285592896ec8ead1b69d90ee4080300ba8dc31665a6...,Increased talkativeness and interrupting behav...
4,ef2d127de37b942baad06145e54b0c619a1f22327b2ebb...,2238dd61a1bf83816b40ad894518814b8edf7221d84d89...,dc008d0efa5a10ab7b1a67e4a4fc05c6a623ea554fc683...,Significant improvement in appetite. Large sig...


#### 2. Redact PII in doctor_notes using spaCy NER (names, dates, locations → [REDACTED]).

In [4]:
# Install the spaCy en_core_web_lg model - used for redaction; better accuracy than the other spaCy models
# Uncomment to install
# pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl

In [5]:
nlp = spacy.load("en_core_web_lg")

# Entity labels that are treated as PII and replaced in the notes
_PII_ENTITY_LABELS = frozenset({'PERSON', 'DATE', 'GPE', 'LOC'})

# Generic role words that the NER model may tag as an entity but that do not
# identify anyone. Built once at module level instead of on every call.
_ROLE_WORDS = frozenset(
    {"parent", "mother", "father", "guardian", "patient", "doctor", "nurse"}
)


def redact_pii_text(text):
    """
    Redact PII entities in a free-text note.

    Runs the spaCy pipeline ``nlp`` over ``text`` and replaces every entity
    labelled PERSON, DATE, GPE or LOC with the literal ``[REDACTED]``.

    An entity is left in place only when it consists solely of a generic
    role word (e.g. "Nurse", "doctor"), compared case-insensitively. An
    entity that merely *contains* a role word, such as "Nurse Jane Doe", is
    still redacted so the accompanying name does not leak.

    Parameters
    ----------
    text : Any
        The note to redact. Missing values and empty / whitespace-only
        strings are returned unchanged; anything else is converted with
        ``str`` before processing.

    Returns
    -------
    Any
        The redacted string, or the original input when it was missing or
        blank.
    """
    if pd.isna(text) or not str(text).strip():
        return text

    text = str(text)
    doc = nlp(text)

    # Replace from the end of the string backwards so that the character
    # offsets of the entities still to be processed remain valid.
    for ent in sorted(doc.ents, key=lambda x: x.start_char, reverse=True):
        if ent.label_ not in _PII_ENTITY_LABELS:
            continue

        # Skip medical roles that aren't actual names, e.g. Nurse, Doctor
        if ent.text.strip().lower() in _ROLE_WORDS:
            continue

        text = text[:ent.start_char] + '[REDACTED]' + text[ent.end_char:]

    return text

# Redact every note row by row and store the result in a new column beside
# the original text, then preview both columns.
df_masked['doctor_notes_redacted'] = (
    df_masked['doctor_notes'].apply(redact_pii_text)
)
df_masked.loc[:, ['patient_id', 'doctor_notes', 'doctor_notes_redacted']].head()

Unnamed: 0,patient_id,doctor_notes,doctor_notes_redacted
0,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,Needs review of medication due to side effects...,Needs review of medication due to side effects...
1,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,No significant behavioral changes observed. Ye...,No significant behavioral changes observed. Ye...
2,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,Mood swings noted during follow-up. Both set T...,Mood swings noted during follow-up. Both set T...
3,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,Increased talkativeness and interrupting behav...,Increased talkativeness and interrupting behav...
4,ef2d127de37b942baad06145e54b0c619a1f22327b2ebb...,Significant improvement in appetite. Large sig...,Significant improvement in appetite. Large sig...


In [8]:
# Verify if redaction was applied: select the rows whose redacted note
# contains the placeholder and show original vs. redacted text side by side.
# The placeholder is matched literally (regex=False), which avoids the invalid
# "\[" escape sequences of a non-raw regex string.
redacted_mask = df_masked['doctor_notes_redacted'].str.contains(
    '[REDACTED]', regex=False, na=False
)
df_comparison = pd.DataFrame({
    'Original Notes': df_masked.loc[redacted_mask, 'doctor_notes'],
    'Redacted Notes': df_masked.loc[redacted_mask, 'doctor_notes_redacted'],
    'Patient ID': df_masked.loc[redacted_mask, 'patient_id']
})

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 10)
# Fixed seed so the preview is the same on every Restart & Run All;
# min() guards against fewer than 10 redacted rows.
display(df_comparison.sample(min(10, len(df_comparison)), random_state=42))

Unnamed: 0,Original Notes,Redacted Notes,Patient ID
10554,Increased talkativeness and interrupting behavior. Relationship build year result according full...,Increased talkativeness and interrupting behavior. Relationship build [REDACTED] result accordin...,3d396c2b66d43a7d5e6d52ba7a61f73220921f5070a78df5fcf377ac198b9991
3220,No significant behavioral changes observed. Mouth know about also but TV term program establish.,No significant behavioral changes observed. [REDACTED] know about also but TV term program estab...,a762afde67e034ebaa109a994329ccd09efa8676e4fab6ceabb99e119d4d725f
5037,Needs review of medication due to side effects. Even speak hair rise write success reality seaso...,Needs review of medication due to side effects. Even speak hair rise write success reality [REDA...,672ec4dda4a89456ef2c4e2bc1893399a0f04b61d55d9bfb0cd72960bf860558
7886,Low energy levels and persistent fatigue observed. Really trade move tonight in week include sam...,Low energy levels and persistent fatigue observed. Really trade move tonight in [REDACTED] inclu...,cb2cf4bf26a836aa5a877b7b1be46f9cdbb16fd95db2b306699acb743846d7ee
11652,"Struggling with sleep, often restless at night. Reveal opportunity fill morning national forward...","Struggling with sleep, often restless at night. Reveal opportunity fill morning national forward...",65d769c3b509edc5330e43dccf7c46898d1b450cbc3f0ada65245b2853e2f0ef
65,Complains of frequent headaches and irritability. Game them determine month heart wide decade hi...,Complains of frequent headaches and irritability. Game them determine [REDACTED] heart wide deca...,3ada92f28b4ceda38562ebf047c6ff05400d4c572352a1142eedfef67d21e662
669,Mood swings noted during follow-up. Big store name year with note win food.,Mood swings noted during follow-up. Big store [REDACTED] with note win food.,172e1676eda470ede17e9d491554bcbe97ba4691f92880064c8cb29ec35a467e
9614,Needs review of medication due to side effects. Wind moment today case strong like hundred surfa...,Needs review of medication due to side effects. Wind moment [REDACTED] case strong like hundred ...,3deb322a10088c08d1904c583dee01d22928853b968fd3e142274dfa511fb8bb
11149,Increased talkativeness and interrupting behavior. Though trouble traditional early decade.,Increased talkativeness and interrupting behavior. Though trouble traditional [REDACTED].,7157d0b037372c81ddb191177779ef20ac38a4f93e83b40fefa87cfaf972f74f
17950,Complains of frequent headaches and irritability. Time true agreement adult house year TV.,Complains of frequent headaches and irritability. Time true agreement adult house [REDACTED].,4b9d476385096c42149ad8a1b35edf317f99c9d61ffc348d8950ab0c31cf543f


#### 3. Export anonymized_patients.csv

In [9]:
# Export only the anonymised view. The raw `doctor_notes` column still holds
# the unredacted free text, so it is dropped before writing; the redacted
# version stays in `doctor_notes_redacted`. errors='ignore' keeps the cell
# re-runnable if the raw column has already been removed.
df_masked.drop(columns=['doctor_notes'], errors='ignore').to_csv(
    'anonymized_patients.csv', index=False
)