In [13]:
import logging
import multiprocessing
from datetime import timedelta, datetime
from pprint import pprint
import pyedflib
import pandas as pd
from pandas import DataFrame, Timestamp
from pyedflib import highlevel

from config.paths import PATHS, Dataset, PatientDir, Paths

In [10]:
ptnt_dir = PatientDir(PATHS.for_mayo_dir / 'B52K3P3G')
# ptnt_dir = PatientDir(PATHS.competition_dir / 'P1')

edf_path = next(ptnt_dir.edf_dir.iterdir())  # get the first edf
edf_path

PosixPath('/data/home/webb/UNEEG_data/20240201_UNEEG_ForMayo/B52K3P3G/edf_data/B52K3P3G_2021-10-24_21-56-04.edf')

In [18]:
# Start date time has no tz info
edf = pyedflib.EdfReader(str(edf_path))
start = edf.getStartdatetime()
print(start.tzinfo)
print(start)

None
2021-10-24 21:56:04


In [29]:
# Simply read a timezone
def timezone_from_edf_header(header: dict) -> str:
    # This returns the format "UTC+02h" for 2 hour offset
    tz = header['annotations'][0][2].removeprefix('LOCAL TIME = ')
    # Modifications to make it work with pandas Timestamp init
    tz = tz.removesuffix('h') + ':00'
    return tz


header = highlevel.read_edf_header(str(edf_path))
timezone = timezone_from_edf_header(header)
print(timezone)

UTC+02:00


In [33]:
start_ts = Timestamp(start, tz=timezone)
print(start_ts)
print(start_ts.tz_convert('UTC'))

2021-10-24 21:56:04+02:00
2021-10-24 19:56:04+00:00


# Find all time zones per patient

In [11]:
def find_ptnt_timezones(ptnt_dir: PatientDir) -> dict:
    logging.info(f"Processing {ptnt_dir.name}")
    problematic_files = []

    edf_paths = list(ptnt_dir.edf_dir.iterdir())
    edf_paths.sort()
    edf_names = [edf_path.name for edf_path in edf_paths]
    edfs = DataFrame(index=edf_names, columns=['start', 'end', 'duration_hours', 'timezone'])

    for edf_path in edf_paths:
        logging.debug(f"Processing {edf_path.name}")
        try:
            header = highlevel.read_edf_header(str(edf_path))
            timezone: str = header['annotations'][0][2]
            timezone = timezone.removeprefix('LOCAL TIME = ')

            edf = pyedflib.EdfReader(str(edf_path))
            start = edf.getStartdatetime()
            duration = timedelta(seconds=edf.getFileDuration())
            end = start + duration

            edfs.loc[edf_path.name, 'start'] = start
            edfs.loc[edf_path.name, 'end'] = end
            edfs.loc[edf_path.name, 'duration_hours'] = duration
            edfs.loc[edf_path.name, 'timezone'] = timezone
        except:
            problematic_files.append(edf_path)

    timezones = list(edfs['timezone'].unique())
    timezones.sort()
    logging.info(f"Finished {ptnt_dir.name}")
    return {'timezones': timezones, 'problematic files': problematic_files, 'edfs': edfs}

In [14]:
logging.basicConfig(level=logging.INFO, format='[%(levelname)s]: %(message)s')

# for pd in PATHS.patient_dirs(include_invalid_ptnts=True):
#     print('=== Processing', pd.name)
#     ptnt_timezones[pd.name] = find_ptnt_timezones(pd)

paths = Paths('/data/home/webb/UNEEG_data_2025-12-09_before_timezone_adj')
ptnt_dirs = paths.patient_dirs([Dataset.for_mayo, Dataset.uneeg_extended], include_invalid_ptnts=True)

# ptnt_dirs = PATHS.patient_dirs([Dataset.for_mayo, Dataset.uneeg_extended], include_invalid_ptnts=True)
with multiprocessing.Pool() as pool:
    ptnt_timezones = pool.map(find_ptnt_timezones, ptnt_dirs)
ptnt_timezones = {pd.name: tz for pd, tz in zip(ptnt_dirs, ptnt_timezones)}

[INFO]: Processing M39K4B3C
[INFO]: Processing B52K3P3G
[INFO]: Processing F5TW95P3X
[INFO]: Processing K37N36L4D
[INFO]: Processing G39B4L9E
[INFO]: Processing D63Q51K2N
[INFO]: Processing P73M2F6H
[INFO]: Processing A4RW34Z5B
[INFO]: Processing P4Hk23M7L
[INFO]: Processing E85L95P2H
[INFO]: Processing E15T65H3Z
[INFO]: Processing K53T36N7F
[INFO]: Processing L3GS57K2T
[INFO]: Finished K53T36N7F
[INFO]: Finished P73M2F6H
[INFO]: Finished K37N36L4D
[INFO]: Finished E85L95P2H
[INFO]: Finished P4Hk23M7L
[INFO]: Finished B52K3P3G
[INFO]: Finished A4RW34Z5B
[INFO]: Finished L3GS57K2T
[INFO]: Finished F5TW95P3X
[INFO]: Finished D63Q51K2N
[INFO]: Finished G39B4L9E
[INFO]: Finished M39K4B3C
[INFO]: Finished E15T65H3Z


In [24]:
ptnt_tzs = {}
for ptnt, info in ptnt_timezones.items():
    ptnt_tzs[ptnt] = info['timezones']
    print(f"{ptnt}: {info['timezones']}")

K37N36L4D: ['UTC+01h', 'UTC+02h']
G39B4L9E: ['UTC+01h', 'UTC+02h']
M39K4B3C: ['UTC+01h', 'UTC+02h']
B52K3P3G: ['UTC+01h', 'UTC+02h']
F5TW95P3X: ['UTC+01h', 'UTC+02h']
D63Q51K2N: ['UTC+01h', 'UTC+02h']
P4Hk23M7L: ['UTC+01h', 'UTC+02h']
P73M2F6H: ['UTC+01h', 'UTC+02h']
E85L95P2H: ['UTC+01h', 'UTC+02h']
A4RW34Z5B: ['UTC+01h', 'UTC+02h']
L3GS57K2T: ['UTC+01h', 'UTC+02h']
K53T36N7F: ['UTC+01h', 'UTC+02h']
E15T65H3Z: ['UTC+01h', 'UTC+02h']


In [25]:
DataFrame(ptnt_tzs).to_csv(r'/data/home/webb/other_UNEEG_stuff/patient_timezones.csv')

In [None]:
transitions = []

for ptnt, info in ptnt_timezones.items():
    edfs = info['edfs']
    mask = edfs['timezone'].ne(edfs['timezone'].shift())
    mask.iloc[0] = False  # first row is always True
    change_rows = edfs[mask].copy()

    for curr_pos in mask[mask].index:
        pos = edfs.index.get_loc(curr_pos)
        if pos == 0:
            continue
        prev_idx = edfs.index[pos - 1]
        transitions.append(edfs.loc[[prev_idx, curr_pos]])

# Pandas Timezone aware Timestamp test

In [2]:
from pandas import Timestamp, Timedelta

timezone = 'UTC+01:00'

t1 = Timestamp("2000-01-01 12:00:00", tz=timezone)
t2 = Timestamp("2000-06-01 12:00:00", tz=timezone)
print(t1)
print(t2)

2000-01-01 12:00:00+01:00
2000-06-01 12:00:00+01:00


In [23]:
t1.tz_convert('UTC')

Timestamp('2000-01-01 11:00:00+0000', tz='UTC')

In [24]:
t2.tz_convert('UTC')

Timestamp('2000-06-01 11:00:00+0000', tz='UTC')

In [28]:
# If timezone is created with a location, DST is automatically inferred!
timezone = 'Europe/Berlin'
tA = Timestamp("2000-01-01 12:00:00", tz=timezone)
tB = Timestamp("2000-06-01 12:00:00", tz=timezone)
print(tA)
print(tB)

2000-01-01 12:00:00+01:00
2000-06-01 12:00:00+02:00


In [34]:
# Sorting will occur with a uniform time!
ts = [
    Timestamp("2000-01-01 14:00:00", tz='UTC+02:00'),
    Timestamp("2000-01-01 15:00:00", tz='UTC+02:00'),
    Timestamp("2000-01-01 12:00:00", tz='UTC'),
    Timestamp("2000-01-01 13:00:00", tz='UTC+01:00'),
]

ts.sort()
ts

[Timestamp('2000-01-01 14:00:00+0200', tz='UTC+02:00'),
 Timestamp('2000-01-01 12:00:00+0000', tz='UTC'),
 Timestamp('2000-01-01 13:00:00+0100', tz='UTC+01:00'),
 Timestamp('2000-01-01 15:00:00+0200', tz='UTC+02:00')]

In [37]:
tz = 'Europe/Berlin'
start = Timestamp("2021-10-31 01:00:00", tz=tz)
end = Timestamp("2021-10-31 04:00:00", tz=tz)
daterange = pd.date_range(start, end, freq=Timedelta(minutes=30), tz=tz)

for date in daterange:
    print(date)

2021-10-31 01:00:00+02:00
2021-10-31 01:30:00+02:00
2021-10-31 02:00:00+02:00
2021-10-31 02:30:00+02:00
2021-10-31 02:00:00+01:00
2021-10-31 02:30:00+01:00
2021-10-31 03:00:00+01:00
2021-10-31 03:30:00+01:00
2021-10-31 04:00:00+01:00


In [40]:
for date in daterange.tz_convert('UTC'):
    print(date)

2021-10-30 23:00:00+00:00
2021-10-30 23:30:00+00:00
2021-10-31 00:00:00+00:00
2021-10-31 00:30:00+00:00
2021-10-31 01:00:00+00:00
2021-10-31 01:30:00+00:00
2021-10-31 02:00:00+00:00
2021-10-31 02:30:00+00:00
2021-10-31 03:00:00+00:00


In [43]:
# If you try to parse an ambiguous time, it will raise an error
t = Timestamp("2021-10-31 01:30:00", tz='Europe/London', )
t

AmbiguousTimeError: Cannot infer dst time from 2021-10-31 01:30:00, try using the 'ambiguous' argument

## Convert existing Timestamp
We have an already existing timestamp, which is already local time, and want to add the timezone

In [31]:
t = Timestamp('2000-01-01 12:00:00')
t_new = t.tz_localize('Europe/Berlin')
print(t)
print(t_new)

2000-01-01 12:00:00
2000-01-01 12:00:00+01:00


In [86]:
# For columns with missing values, these are just ignored :)
ts = [
    Timestamp('2020-01-01 12:00:00'),
    Timestamp('2020-10-25 01:00:00'),
    Timestamp('2020-10-25 02:00:00'),
    Timestamp('2020-10-25 02:30:00'),
    Timestamp('2020-10-25 02:00:01'),
    Timestamp('2020-10-25 04:00:00'),
    None,
    None,
]
ts = pd.Series(ts, name='time')
ts.notna().any()

np.True_

In [87]:
ts = ts.dt.tz_localize('Europe/Berlin',
                       # ambiguous='raise',
                       ambiguous='infer',
                       )
ts

0   2020-01-01 12:00:00+01:00
1   2020-10-25 01:00:00+02:00
2   2020-10-25 02:00:00+02:00
3   2020-10-25 02:30:00+02:00
4   2020-10-25 02:00:01+01:00
5   2020-10-25 04:00:00+01:00
6                         NaT
7                         NaT
Name: time, dtype: datetime64[ns, Europe/Berlin]

In [None]:
# If they are already tz-aware, it will raise an error
ts.dt.tz_localize('Europe/Berlin', ambiguous='infer')

# Reading a Timestamp

In [5]:
# It's possible to simply read the str times that get saved! :-)
t = Timestamp('2021-10-08 12:31:42+02:00')
t

Timestamp('2021-10-08 12:31:42+0200', tz='UTC+02:00')

In [6]:
t.tz_convert('UTC')

Timestamp('2021-10-08 10:31:42+0000', tz='UTC')

# Other

In [92]:
t = Timestamp('2021-10-08 12:31:42', tz='Europe/Berlin')
t

Timestamp('2021-10-08 12:31:42+0200', tz='Europe/Berlin')

In [94]:
# For columns with missing values, these are just ignored :)
ts = [
    '2020-11-25 02:00:01',
    '2020-10-25 04:00:00',
]
ts = pd.Series(ts, name='time')

In [96]:
pd.to_datetime(ts, timezone='Europe/Berlin')

TypeError: to_datetime() got an unexpected keyword argument 'timezone'