In [1]:
import polars as pl
from src.paths import FPATH

# Static time tokens

In [2]:
lifelines = pl.read_parquet(
    FPATH.NETWORK_DATA / "destiny" / "cohort" / "lifelines.parquet", 
    columns=["person_id", "birthday", "event_final_date"]
).sort("event_final_date")

In [3]:
start_end_df = lifelines.group_by("person_id").last().sort("event_final_date")

### Birthday (age) tokens

In [4]:
birthday_df = start_end_df.with_columns(
    date_col=pl.date_ranges(
        pl.col("birthday").dt.offset_by("1y"), # Only start with AGE_1
        pl.col("event_final_date"),
        "1y"
    ),
).with_columns(
    age=pl.int_ranges(1, pl.col("date_col").list.len()+1, 1)
).drop("birthday", "event_final_date")

In [5]:
att_birthday = birthday_df.explode("date_col", "age").with_columns(
    age=pl.format("ATT_age{}", pl.col("age"))
)
att_birthday

person_id,date_col,age
i64,date,str
7973198,1941-09-27,"""ATT_age1"""
7973198,1942-09-27,"""ATT_age2"""
7973198,1943-09-27,"""ATT_age3"""
7973198,1944-09-27,"""ATT_age4"""
7973198,1945-09-27,"""ATT_age5"""
…,…,…
5899115,2023-01-29,"""ATT_age67"""
72078879,2020-02-10,"""ATT_age1"""
72078879,2021-02-10,"""ATT_age2"""
72078879,2022-02-10,"""ATT_age3"""


In [10]:
att_birthday.write_parquet(FPATH.NETWORK_DATA / "destiny" / "att_birthday.parquet")

### Calendar year tokens

In [6]:
calendar_df = start_end_df.with_columns(
    pl.col("birthday").dt.offset_by("1y").dt.truncate("1y"),
).with_columns(
    date_col=pl.date_ranges(
        pl.col("birthday"), pl.col("event_final_date"), "1y"
    ),
    calendar_years=pl.int_ranges(
        pl.col("birthday").dt.year(), pl.col("event_final_date").dt.year() + 1
    )
).drop("birthday", "event_final_date")

In [7]:
att_calendar = calendar_df.explode("date_col", "calendar_years").with_columns(
    calendar_years=pl.format("ATT_year{}", pl.col("calendar_years"))
)
att_calendar

person_id,calendar_years,date_col
i64,str,date
7973198,"""ATT_year1941""",1941-01-01
7973198,"""ATT_year1942""",1942-01-01
7973198,"""ATT_year1943""",1943-01-01
7973198,"""ATT_year1944""",1944-01-01
7973198,"""ATT_year1945""",1945-01-01
…,…,…
5899115,"""ATT_year2023""",2023-01-01
72078879,"""ATT_year2020""",2020-01-01
72078879,"""ATT_year2021""",2021-01-01
72078879,"""ATT_year2022""",2022-01-01


In [11]:
att_calendar.write_parquet(FPATH.NETWORK_DATA / "destiny" / "att_calendar.parquet")

# Time tokens between events

In [2]:
lpr = pl.read_parquet(FPATH.NETWORK_DATA / "destiny" / "lpr.parquet")#.head(10_000_000)#.sort("date_col")#.head(1_000_000)

In [3]:
lpr1 = lpr.sort("date_col", maintain_order=True).with_columns(
    diff=pl.col("date_col").diff().over("person_id")
)
lpr2 = lpr.sort("person_id", "date_col", maintain_order=True).with_columns(
    diff=pl.col("date_col").diff().over("person_id")
)

In [None]:
lpr1.filter(pl.col("person_id") == 147267591)

In [4]:
lpr1.sort("person_id", "date_col").equals(lpr2)

False

In [5]:
lpr1.sort("person_id", "date_col").equals(lpr2.sort("person_id", "date_col"))

False

In [6]:
lpr1_sort = lpr1.sort("person_id", "date_col")

In [7]:
n = 10
lpr1_sort.tail(n).equals(lpr2.tail(n))

False

In [8]:
lpr1_sort.tail(11)

person_id,date_col,urgency,patienttype,aktionsdiagnose_adaptrunc,diff
i64,datetime[μs],str,str,str,duration[μs]
147267591,2019-08-02 08:44:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",0µs
147267591,2019-08-02 08:44:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",
147267591,2019-08-16 12:35:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",14d 3h 51m
147267591,2019-10-25 10:12:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",69d 21h 37m
147267591,2019-10-25 12:01:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",1h 49m
…,…,…,…,…,…
147500138,2022-08-02 21:24:00,"""HEA_urgency_ATA1""",,"""HEA_ICD10_DP073""",
147500138,2022-08-22 09:42:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DP073""",19d 12h 18m
147500138,2022-09-05 09:46:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DP073""",14d 4m
147500138,2022-12-19 10:13:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DD180F""",105d 27m


In [9]:
lpr2.tail(11)

person_id,date_col,urgency,patienttype,aktionsdiagnose_adaptrunc,diff
i64,datetime[μs],str,str,str,duration[μs]
147267591,2019-08-02 08:44:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",
147267591,2019-08-02 08:44:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",0µs
147267591,2019-08-16 12:35:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",14d 3h 51m
147267591,2019-10-25 10:12:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",69d 21h 37m
147267591,2019-10-25 12:01:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DQ120""",1h 49m
…,…,…,…,…,…
147500138,2022-08-02 21:24:00,"""HEA_urgency_ATA1""",,"""HEA_ICD10_DP073""",
147500138,2022-08-22 09:42:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DP073""",19d 12h 18m
147500138,2022-09-05 09:46:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DP073""",14d 4m
147500138,2022-12-19 10:13:00,"""HEA_urgency_ATA3""",,"""HEA_ICD10_DD180F""",105d 27m


In [96]:
edges = (
    [hour for hour in range(1, 24)] +
    [day*24 for day in range (1, 30)] +
    [month*24*30 for month in range(1, 13)] +
    [year*24*365 for year in range(1, 101)]
)
labels = (
    [f"[ATT_<{hour}HOUR]" for hour in range(1, 24)] +
    [f"[ATT_<{day}DAY]" for day in range (1, 30)] +
    [f"[ATT_<{month}MONTH]" for month in range(1, 13)] +
    [f"[ATT_<{year}YEAR]" for year in range(1, 101)]
    + ["[ATT_UNKNOWN]"]
)

In [97]:
foo = lpr.with_columns(
    diff2=pl.col("diff").dt.total_hours()
).with_columns(
    pl.col("diff2").cut(breaks=edges, labels=labels, left_closed=True)
)