In [None]:
import pandas as pd
import numpy as np
import hashlib
import datetime
from dateutil.relativedelta import relativedelta

def _shift_time_columns(df):
    """Shift, in place, every string column of ``df`` named like a date or time.

    A column qualifies when its name contains ``'time'`` or ``'date'`` and its
    dtype is ``object``. Columns whose sampled values contain a ``:`` are passed
    through ``shift_datetime``; all others through ``shift_date``.
    """
    for col in [c for c in df.columns if 'time' in c or 'date' in c]:
        if df[col].dtype != 'object':
            continue
        # Sample non-null values only: if the first rows are blank, a timestamp
        # column would otherwise be misclassified as date-only.
        sample = df[col].dropna().head()
        has_time_part = any(':' in str(value) for value in sample)
        df[col] = df[col].apply(shift_datetime if has_time_part else shift_date)


def anonymize_mimic3(data_dir, output_dir):
    """Pseudonymize the MIMIC-III CSV tables in ``data_dir`` into ``output_dir``.

    Identifier columns are replaced via ``hash_id``, date/time columns are
    shifted via ``shift_date`` / ``shift_datetime``, selected ADMISSIONS columns
    are dropped, note text goes through ``deidentify_text``, and the ``d_*``
    dictionary tables are copied unchanged.

    Args:
        data_dir: Directory containing the original CSV files.
        output_dir: Directory that receives the processed CSV files. It is
            created (including parents) when it does not exist yet.
    """
    from pathlib import Path

    # DataFrame.to_csv does not create missing directories.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # 1. PATIENTS - carries the most sensitive per-patient information.
    #    'gender' and 'expire_flag' are kept as they are.
    patients = pd.read_csv(f"{data_dir}/PATIENTS.csv")
    patients['subject_id'] = patients['subject_id'].apply(hash_id)
    for col in ('dob', 'dod', 'dod_hosp', 'dod_ssn'):
        patients[col] = patients[col].apply(shift_date)
    patients.to_csv(f"{output_dir}/PATIENTS.csv", index=False)

    # 2. ADMISSIONS
    admissions = pd.read_csv(f"{data_dir}/ADMISSIONS.csv")
    for col in ('subject_id', 'hadm_id'):
        admissions[col] = admissions[col].apply(hash_id)
    for col in ('admittime', 'dischtime', 'deathtime'):
        admissions[col] = admissions[col].apply(shift_datetime)
    # Drop fields that may carry identifying information.
    admissions = admissions.drop(columns=['admission_location', 'discharge_location', 'insurance',
                                          'language', 'religion', 'marital_status', 'ethnicity',
                                          'edregtime', 'edouttime', 'diagnosis'])
    admissions.to_csv(f"{output_dir}/ADMISSIONS.csv", index=False)

    # 3. NOTEEVENTS - clinical notes need the free text handled as well.
    notes = pd.read_csv(f"{data_dir}/NOTEEVENTS.csv")
    for col in ('subject_id', 'hadm_id'):
        notes[col] = notes[col].apply(hash_id)
    notes['chartdate'] = notes['chartdate'].apply(shift_date)
    for col in ('charttime', 'storetime'):
        notes[col] = notes[col].apply(shift_datetime)
    notes['text'] = notes['text'].apply(deidentify_text)
    # NOTE(review): the output name is lowercase although the input is
    # upper-case; kept as-is so existing consumers keep working.
    notes.to_csv(f"{output_dir}/noteevents.csv", index=False)

    # 4. Remaining event tables - generic handling of ID and time columns.
    tables_to_process = [
        'callout', 'caregivers', 'chartevents', 'cptevents',
        'datetimeevents', 'diagnoses_icd', 'drgcodes', 'icustays',
        'inputevents_cv', 'inputevents_mv', 'labevents', 'microbiologyevents',
        'outputevents', 'prescriptions', 'procedureevents_mv',
        'procedures_icd', 'services', 'transfers'
    ]

    for table in tables_to_process:
        df = pd.read_csv(f"{data_dir}/{table}.csv")

        for id_col in ('subject_id', 'hadm_id', 'icustay_id'):
            if id_col in df.columns:
                df[id_col] = df[id_col].apply(hash_id)

        _shift_time_columns(df)

        df.to_csv(f"{output_dir}/{table}.csv", index=False)

    # 5. Dictionary tables - copied through a pandas read/write round trip.
    desc_tables = ['d_cpt', 'd_icd_diagnoses', 'd_icd_procedures',
                   'd_items', 'd_labitems']

    for table in desc_tables:
        df = pd.read_csv(f"{data_dir}/{table}.csv")
        df.to_csv(f"{output_dir}/{table}.csv", index=False)

def hash_id(original_id, salt=""):
    """Map an identifier to a deterministic integer pseudonym.

    The identifier is converted to text, prefixed with ``salt``, hashed with
    SHA-256, and the first 8 hex digits of the digest are returned as an int.

    Args:
        original_id: Identifier to pseudonymize. Missing values (as judged by
            ``pd.isna``) are passed through as ``np.nan``.
        salt: Optional text mixed into the hash input. The default empty
            string reproduces the unsalted result.

    Returns:
        ``np.nan`` for missing input, otherwise an int in ``[0, 2**32)``.

    Note:
        Only 32 bits of the digest are kept, so distinct identifiers can map
        to the same pseudonym.
    """
    if pd.isna(original_id):
        return np.nan

    # pandas reads an integer column that contains blanks as float64, so the
    # same ID shows up as 123 in one table and 123.0 in another. Normalize
    # integral floats so both hash to the same pseudonym.
    if isinstance(original_id, (float, np.floating)) and float(original_id).is_integer():
        original_id = int(original_id)

    digest = hashlib.sha256((salt + str(original_id)).encode()).hexdigest()
    return int(digest[:8], 16)

def shift_date(original_date, days=365):
    """Shift a date string forward by a fixed number of days.

    Accepts ``YYYY-MM-DD`` and, as a fallback, ``YYYY-MM-DD HH:MM:SS``; the
    result is formatted in the same layout as the input. The fallback matters
    because a value carrying a time component would otherwise fail to parse
    and be returned unshifted.

    Args:
        original_date: Date string, or a missing value.
        days: Offset in days (default 365). The offset is fixed, not random.

    Returns:
        ``np.nan`` for missing input, the shifted string when the input
        parses, and the input unchanged when it matches neither layout.
    """
    if pd.isna(original_date):
        return np.nan

    offset = datetime.timedelta(days=days)
    for fmt in ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S'):
        try:
            parsed = datetime.datetime.strptime(original_date, fmt)
            return (parsed + offset).strftime(fmt)
        except (ValueError, TypeError, OverflowError):
            # Wrong layout, non-string input, or a result outside the
            # supported year range: try the next layout.
            continue

    # Best effort: leave values that cannot be interpreted untouched.
    return original_date

def shift_datetime(original_datetime, days=365):
    """Shift a timestamp string forward by a fixed number of days.

    Accepts ``YYYY-MM-DD HH:MM:SS`` and, as a fallback, ``YYYY-MM-DD``; the
    result is formatted in the same layout as the input. The fallback keeps a
    date-only value from being returned unshifted.

    Args:
        original_datetime: Timestamp string, or a missing value.
        days: Offset in days (default 365). The offset is fixed, not random.

    Returns:
        ``np.nan`` for missing input, the shifted string when the input
        parses, and the input unchanged when it matches neither layout.
    """
    if pd.isna(original_datetime):
        return np.nan

    offset = datetime.timedelta(days=days)
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            parsed = datetime.datetime.strptime(original_datetime, fmt)
            return (parsed + offset).strftime(fmt)
        except (ValueError, TypeError, OverflowError):
            # Wrong layout, non-string input, or a result outside the
            # supported year range: try the next layout.
            continue

    # Best effort: leave values that cannot be interpreted untouched.
    return original_datetime

def deidentify_text(text):
    """Strip the ``[**`` and ``**]`` placeholder markers from a clinical note.

    Only the marker characters are removed; whatever sits between them stays
    in the text. Missing values are returned as ``np.nan`` and any other
    input is converted with ``str`` first.
    """
    if pd.isna(text):
        return np.nan

    cleaned = str(text)
    # Opening marker first, then the closing one.
    for marker in ('[**', '**]'):
        cleaned = cleaned.replace(marker, '')
    return cleaned

# Usage example
if __name__ == "__main__":
    # data_dir:   directory holding the original data
    # output_dir: directory receiving the anonymized data
    root_dir = r"C:\Users\YourUsername\Documents"  # replace with your actual path
    paths = {
        "data_dir": fr"{root_dir}\mimic-iii-clinical-database-demo-1.4",
        "output_dir": fr"{root_dir}\anonymized",
    }
    anonymize_mimic3(**paths)

  df = pd.read_csv(f"{data_dir}/{table}.csv")
  df = pd.read_csv(f"{data_dir}/{table}.csv")
