In [41]:
import logging
from pathlib import Path
import pandas as pd

from data_cleaning.file_correction import clean_mac_files

In [42]:
def combine_annotation_files(paths: list[Path]):
    """Combine multiple annotation files into one. Annotations may be duplicate and single markers will be combined
    with user seizure markers with a start and end."""
    # Procedure:
    # 1. Load the seizures from one file
    # 2. Go through the files and add rows if they contain a start or ends (they should contain both or none)
    #   * assert that they don't exist yet
    # 3. Add the single markers between the corresponding start and end
    #   * assert that they don't exist yet
    # 4. Sort by datetime and save
    # * append comments when adding

    # get seizure dataframes from all files
    szr_dfs = [pd.read_csv(file_path).fillna('') for file_path in paths]
    szr_dfs = pd.concat(szr_dfs)

    start_marker_end_rows = pd.DataFrame(columns= szr_dfs[0].columns)

    # First add all rows with a start, single_marker, and end
    for df in szr_dfs:
        for row in df.iterrows():
            if row['start'] and row['single_marker'] and row['end']:
                if not row['start'] in seizures['start'].values:
                    # the row isn't contained and should be added
                    ...




                # assert row['end'], f'The row contains a start but not an end: {row}'
                # if row['start'] in seizures['start']:
                #     # The row is already contained and should be skipped. However, the single marker should be added if it exists
                #     assert row['end'] in seizures['end'], f'The start is contained but the end is not: {row}'


In [43]:
patient_folder = Path('/data/home/webb/UNEEG_data/20250217_UNEEG_Extended/E15T65H3Z')
clean_mac_files(patient_folder)
annotation_files = [file for file in patient_folder.iterdir() if (file.suffix == '.csv' and not 'all automatic detections' in file.name)]
annotation_files

[PosixPath('/data/home/webb/UNEEG_data/20250217_UNEEG_Extended/E15T65H3Z/E15T65H3Z_EMU_SUBQ_CONSENSUS.csv'),
 PosixPath('/data/home/webb/UNEEG_data/20250217_UNEEG_Extended/E15T65H3Z/E15T65H3Z_OUTPT_SUBQ_SeizureStartEnd.csv'),
 PosixPath('/data/home/webb/UNEEG_data/20250217_UNEEG_Extended/E15T65H3Z/E15T65H3Z_OUTPT_SUBQ_CONSENSUS.csv')]

In [44]:
szr_dfs = [pd.read_csv(file_path).fillna('') for file_path in annotation_files]

In [45]:
szr_dfs[0]

Unnamed: 0,type,start,single_marker,end,comment
0,User seizure marker,2024-03-19 16:36:20.488,,2024-03-19 16:37:09.873,
1,User seizure marker,2024-03-19 17:43:00.101,,2024-03-19 17:43:56.056,
2,User seizure marker,2024-03-20 07:29:49.683,,2024-03-20 07:30:35.047,


In [46]:
szr_dfs[1]

Unnamed: 0,type,start,single_marker,end,comment
0,Seizure-rhythmic +,2023-11-30 04:13:49.771,2023-11-30 04:13:57.669,2023-11-30 04:15:42.658,"end 5a, start 5a"
1,Seizure-rhythmic +,2024-01-30 16:45:45.835,2024-01-30 16:46:36.016,2024-01-30 16:48:43.257,start/end 5c


In [47]:
szr_dfs[2]

Unnamed: 0,type,start,single_marker,end,comment
0,Seizure-rhythmic +,,2023-11-30 04:13:57.669,,start 5a
1,Seizure-rhythmic,,2023-11-30 04:15:42.658,,end 5a
2,Seizure-rhythmic +,,2024-01-30 16:46:36.016,,start/end 5c
3,Seizure-rhythmic +,,2024-03-02 02:26:00.131,,
4,Seizure-rhythmic,,2024-03-02 02:28:27.819,,


In [48]:
szr_dfs = [pd.read_csv(file_path).fillna('') for file_path in annotation_files]
szr_dfs = pd.concat(szr_dfs, ignore_index=True)
szr_dfs


Unnamed: 0,type,start,single_marker,end,comment
0,User seizure marker,2024-03-19 16:36:20.488,,2024-03-19 16:37:09.873,
1,User seizure marker,2024-03-19 17:43:00.101,,2024-03-19 17:43:56.056,
2,User seizure marker,2024-03-20 07:29:49.683,,2024-03-20 07:30:35.047,
3,Seizure-rhythmic +,2023-11-30 04:13:49.771,2023-11-30 04:13:57.669,2023-11-30 04:15:42.658,"end 5a, start 5a"
4,Seizure-rhythmic +,2024-01-30 16:45:45.835,2024-01-30 16:46:36.016,2024-01-30 16:48:43.257,start/end 5c
5,Seizure-rhythmic +,,2023-11-30 04:13:57.669,,start 5a
6,Seizure-rhythmic,,2023-11-30 04:15:42.658,,end 5a
7,Seizure-rhythmic +,,2024-01-30 16:46:36.016,,start/end 5c
8,Seizure-rhythmic +,,2024-03-02 02:26:00.131,,
9,Seizure-rhythmic,,2024-03-02 02:28:27.819,,


In [49]:
# rows with start, marker, and end
start_marker_end_rows = szr_dfs[(
    (szr_dfs['start'] != '') &
    (szr_dfs['single_marker'] != '') &
    (szr_dfs['end'] != '')
)]
start_marker_end_rows

Unnamed: 0,type,start,single_marker,end,comment
3,Seizure-rhythmic +,2023-11-30 04:13:49.771,2023-11-30 04:13:57.669,2023-11-30 04:15:42.658,"end 5a, start 5a"
4,Seizure-rhythmic +,2024-01-30 16:45:45.835,2024-01-30 16:46:36.016,2024-01-30 16:48:43.257,start/end 5c


In [50]:
# rows with just start and end
start_end_rows = szr_dfs[(
    (szr_dfs['start'] != '') &
    (szr_dfs['single_marker'] == '') &
    (szr_dfs['end'] != '')
)]
start_end_rows

Unnamed: 0,type,start,single_marker,end,comment
0,User seizure marker,2024-03-19 16:36:20.488,,2024-03-19 16:37:09.873,
1,User seizure marker,2024-03-19 17:43:00.101,,2024-03-19 17:43:56.056,
2,User seizure marker,2024-03-20 07:29:49.683,,2024-03-20 07:30:35.047,


In [51]:
# rows with just a marker
marker_rows = szr_dfs[(
    (szr_dfs['start'] == '') &
    (szr_dfs['single_marker'] != '') &
    (szr_dfs['end'] == '')
)]
marker_rows

Unnamed: 0,type,start,single_marker,end,comment
5,Seizure-rhythmic +,,2023-11-30 04:13:57.669,,start 5a
6,Seizure-rhythmic,,2023-11-30 04:15:42.658,,end 5a
7,Seizure-rhythmic +,,2024-01-30 16:46:36.016,,start/end 5c
8,Seizure-rhythmic +,,2024-03-02 02:26:00.131,,
9,Seizure-rhythmic,,2024-03-02 02:28:27.819,,


In [52]:
seizures = start_marker_end_rows.copy(deep=True)

In [58]:
# add rows that have just a start and end
for _, row in start_end_rows.iterrows():
    if not row['start'] in seizures['start'].values:
        # the seizure should be added
        assert row['end'] not in seizures['end'].values, f"The start is not contained but the end is: {row}"
        pd.concat([seizures, pd.DataFrame(row).T], ignore_index=True)
        logging.info(f"Added row: {row}")
    else:
        # row is duplicate
        logging.info(f"Duplicate row: {row}")

In [59]:
seizures

Unnamed: 0,type,start,single_marker,end,comment
3,Seizure-rhythmic +,2023-11-30 04:13:49.771,2023-11-30 04:13:57.669,2023-11-30 04:15:42.658,"end 5a, start 5a"
4,Seizure-rhythmic +,2024-01-30 16:45:45.835,2024-01-30 16:46:36.016,2024-01-30 16:48:43.257,start/end 5c


In [55]:
pd.DataFrame(row).T

Unnamed: 0,type,start,single_marker,end,comment
2,User seizure marker,2024-03-20 07:29:49.683,,2024-03-20 07:30:35.047,


In [56]:
start_end_rows

Unnamed: 0,type,start,single_marker,end,comment
0,User seizure marker,2024-03-19 16:36:20.488,,2024-03-19 16:37:09.873,
1,User seizure marker,2024-03-19 17:43:00.101,,2024-03-19 17:43:56.056,
2,User seizure marker,2024-03-20 07:29:49.683,,2024-03-20 07:30:35.047,
