In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.21.0 (from pandas)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tzdata, numpy, pandas
Successfully installed numpy-1.24.3 pandas-2.0.2 tzdata-2023.3


In [33]:
import pandas as pd

# Load the enrollment CSV into a DataFrame and display it.
# NOTE(review): the displayed rows include real birth dates and consent dates —
# clear this output before sharing the notebook.
df = pd.read_csv("/data/enroll_data.csv")
df

Unnamed: 0,site ID,date of consent,cohort,birth date
0,BWH,1/1/2020,CHR,1990-01-01
1,BWH,1/2/2020,CHR,1989-01-02
2,BWH,1/2/2020,HC,1998-01-03
3,BWH,1/2/2020,HC,1987-01-04
4,BWH,1/2/2020,CHR,1986-01-05
...,...,...,...,...
7941,PNC,12/31/2020,CHR,1999-10-04
7942,PNC,12/31/2020,HC,1988-10-04
7943,PNC,12/31/2020,CHR,1987-10-06
7944,PNC,12/31/2020,CHR,1996-10-06


### Add Age as a Column

In [34]:
from datetime import datetime

def calculate_age(dob, today=None) -> int:
    """Return the age in completed years for a date of birth.

    Args:
        dob: Date of birth; any object exposing ``year``, ``month`` and ``day``
            (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).
        today: Optional reference date with the same attributes. Defaults to
            the current local date; pass a fixed date for reproducible results.

    Returns:
        Number of full years between ``dob`` and ``today``. A Feb 29 birthday
        counts as reached on Mar 1 in non-leap years.
    """
    # Local import: elsewhere in this notebook the global name ``datetime`` is bound
    # both to the class and to the module, so the body does not rely on it.
    import datetime as _dt

    if today is None:
        today = _dt.date.today()

    # Comparing (month, day) tuples avoids building a "birthday this year" date,
    # which does not exist for Feb 29 in a non-leap year.
    birthday_reached = (today.month, today.day) >= (dob.month, dob.day)

    # Subtract one year when this year's birthday is still in the future.
    return today.year - dob.year - (not birthday_reached)

# Turn the birth-date strings into datetimes, then derive one age per row from them.
df["birth date"] = pd.to_datetime(df["birth date"])
df["age"] = df["birth date"].map(calculate_age)

df

Unnamed: 0,site ID,date of consent,cohort,birth date,age
0,BWH,1/1/2020,CHR,1990-01-01,33
1,BWH,1/2/2020,CHR,1989-01-02,34
2,BWH,1/2/2020,HC,1998-01-03,25
3,BWH,1/2/2020,HC,1987-01-04,36
4,BWH,1/2/2020,CHR,1986-01-05,37
...,...,...,...,...,...
7941,PNC,12/31/2020,CHR,1999-10-04,23
7942,PNC,12/31/2020,HC,1988-10-04,34
7943,PNC,12/31/2020,CHR,1987-10-06,35
7944,PNC,12/31/2020,CHR,1996-10-06,26


### Anonymize Date of Consent

#### Requirements
- Randomize 'date of consent'
- Dates must be earlier than 1925
- Save the offset separately so it's retrievable

In [35]:
# Parse the consent dates with an explicit month/day/year format instead of relying
# on format inference: values such as "1/2/2020" are ambiguous on their own, while
# "12/31/2020" in the same column shows the data is month-first.
df["date of consent"] = pd.to_datetime(df["date of consent"], format="%m/%d/%Y")
df

Unnamed: 0,site ID,date of consent,cohort,birth date,age
0,BWH,2020-01-01,CHR,1990-01-01,33
1,BWH,2020-01-02,CHR,1989-01-02,34
2,BWH,2020-01-02,HC,1998-01-03,25
3,BWH,2020-01-02,HC,1987-01-04,36
4,BWH,2020-01-02,CHR,1986-01-05,37
...,...,...,...,...,...
7941,PNC,2020-12-31,CHR,1999-10-04,23
7942,PNC,2020-12-31,HC,1988-10-04,34
7943,PNC,2020-12-31,CHR,1987-10-06,35
7944,PNC,2020-12-31,CHR,1996-10-06,26


In [36]:
import random
import datetime

def days_offset(date1: datetime.datetime, date2: datetime.datetime) -> int:
    """Return the number of whole days from ``date1`` to ``date2``.

    Args:
        date1: Start of the interval (the value that is subtracted).
        date2: End of the interval.

    Returns:
        ``(date2 - date1).days`` — positive when ``date2`` is later than
        ``date1``, negative when it is earlier. Adding this many days to
        ``date1`` gives back the calendar day of ``date2``.
    """
    return (date2 - date1).days

def anonymize_date(date: datetime.date, rng=None) -> datetime.date:
    """Return a random replacement date that is strictly earlier than 1925.

    The replacement is drawn independently of ``date``; the argument is accepted
    only so the function can be used directly with ``Series.apply``.

    Args:
        date: The original date (its value is not used).
        rng: Optional object providing ``randint(a, b)``, e.g.
            ``random.Random(seed)`` for reproducible draws. Defaults to the
            module-level ``random`` functions.

    Returns:
        A ``datetime.date`` with year in 1800..1924, month in 1..12 and
        day in 1..27.
    """
    if rng is None:
        rng = random

    # randint is inclusive on both ends, so the upper bound must be 1924 to keep
    # every generated date earlier than 1925.
    new_year = rng.randint(1800, 1924)
    new_month = rng.randint(1, 12)
    # Days are capped at 27, below the shortest month length, so every
    # (month, day) combination is a valid calendar date.
    new_day = rng.randint(1, 27)

    return datetime.date(new_year, new_month, new_day)

# Draw an independent random replacement date for every row and store it as datetime64.
df["date of consent - anonymized"] = pd.to_datetime(
    df["date of consent"].apply(anonymize_date)
)

# Offset = real consent date minus anonymized date, in whole days; adding it back to
# the anonymized date recovers the real one. The subtraction is done on whole columns
# rather than row by row.
df["offset"] = (df["date of consent"] - df["date of consent - anonymized"]).dt.days

In [37]:
# Show the frame now that the anonymized consent date and day-offset columns are attached.
# NOTE(review): this output lists real and anonymized consent dates side by side —
# clear it before sharing the notebook.
df

Unnamed: 0,site ID,date of consent,cohort,birth date,age,date of consent - anonymized,offset
0,BWH,2020-01-01,CHR,1990-01-01,33,1872-07-15,53860
1,BWH,2020-01-02,CHR,1989-01-02,34,1872-03-24,53974
2,BWH,2020-01-02,HC,1998-01-03,25,1837-02-15,66795
3,BWH,2020-01-02,HC,1987-01-04,36,1816-02-07,74474
4,BWH,2020-01-02,CHR,1986-01-05,37,1826-09-13,70603
...,...,...,...,...,...,...,...
7941,PNC,2020-12-31,CHR,1999-10-04,23,1855-03-10,60562
7942,PNC,2020-12-31,HC,1988-10-04,34,1823-06-15,72153
7943,PNC,2020-12-31,CHR,1987-10-06,35,1819-10-22,73485
7944,PNC,2020-12-31,CHR,1996-10-06,26,1887-06-02,48790


#### Export Relevant Columns to their respective files

In [38]:
# Build the shareable table: strip the real dates and the offsets, then expose the
# randomized consent date under the original column name.
anonymized_df = (
    df
    .drop(columns=['date of consent', 'birth date', 'offset'])
    .rename(columns={'date of consent - anonymized': 'date of consent'})
)

anonymized_df.to_csv('/data/enroll_data_anon_DM.csv')

In [39]:
# Preview the table that was written to the anonymized CSV.
anonymized_df

Unnamed: 0,site ID,cohort,age,date of consent
0,BWH,CHR,33,1872-07-15
1,BWH,CHR,34,1872-03-24
2,BWH,HC,25,1837-02-15
3,BWH,HC,36,1816-02-07
4,BWH,CHR,37,1826-09-13
...,...,...,...,...
7941,PNC,CHR,23,1855-03-10
7942,PNC,HC,34,1823-06-15
7943,PNC,CHR,35,1819-10-22
7944,PNC,CHR,26,1887-06-02


In [40]:
# Pull out the per-row day offsets (a Series carrying the frame's row index) and
# write them to their own CSV, apart from the anonymized table.
offset_df = df.loc[:, "offset"]

offset_df.to_csv('/data/enroll_data_offset_DM.csv')

In [41]:
# Preview the offsets that were written to the offset CSV.
# NOTE(review): together with the anonymized dates displayed earlier in this notebook,
# these offsets are enough to recover the real consent dates — clear outputs before sharing.
offset_df

0       53860
1       53974
2       66795
3       74474
4       70603
        ...  
7941    60562
7942    72153
7943    73485
7944    48790
7945    66489
Name: offset, Length: 7946, dtype: int64