# Libraries

In [1]:
import pyedflib
import plotly.express as px
from pathlib import Path
import os
import pandas as pd
import numpy as np
import EDF_wrapper
import filters
import re

In [2]:
# Directory that holds the EDF/BDF recordings, relative to the notebook.
directory = Path("data/edf/")
# Read every recording found in `directory`; load_files=True asks the wrapper
# to load the file contents as well (see the 'signals' entries in the output).
files = EDF_wrapper.read_files_from_dir(directory, load_files=True)
# NOTE(review): this displays the whole list, including the signal arrays.
files

[{'filepath': 'data/edf/1-1-Bewegung_edited.bdf',
  'signals': array([[-2.65347050e+00, -4.09816558e+00, -4.09922416e+00, ...,
           1.10268599e-05,  1.10268599e-05,  1.10268599e-05],
         [-1.54233793e+00, -1.44602933e+00, -1.44344905e+00, ...,
           1.10268599e-05,  1.10268599e-05,  1.10268599e-05],
         [-3.29887007e+00,  4.98973220e+00,  4.99647129e+00, ...,
           7.58485959e-06,  7.58485959e-06,  7.58485959e-06],
         [-4.99413858e+00, -5.02268751e-02, -4.44226541e-02, ...,
           6.99207944e-06,  6.99207944e-06,  6.99207944e-06],
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]),
  'signal_headers': [{'label': 'EMG 1',
    'dimension': 'mV',
    'sample_rate': 4000.0,
    'sample_frequency': 4000.0,
    'physical_max': 185.0,
    'physical_min': -185.0,
    'digital_max': 8388607,
    'digital_min': -8388608,
    'prefilter': '',
    'transducer': 'transkutan'},
   {'labe

In [3]:
def compute_time(sampling_frequency, signal_array):
    """Return the time stamp of every sample in ``signal_array``.

    Sample ``i`` is assigned the time ``i / sampling_frequency``, so the
    result starts at 0 and has as many entries as ``signal_array``.

    Args:
        sampling_frequency: Number of samples per unit of time.
        signal_array: Any sized sequence; only its length is used.

    Returns:
        numpy.ndarray with ``len(signal_array)`` time stamps.
    """
    sample_indices = np.arange(len(signal_array))
    return sample_indices / sampling_frequency

In [4]:
def create_swallows_df(file, fileList=False):
    """Build a table with one row per annotated swallow interval.

    Annotations whose description contains ``"S_"`` are consumed in
    consecutive pairs: the first record of each pair supplies the start time
    and the sample name, the second one supplies the stop time.

    Args:
        file: Mapping with a ``"filepath"`` entry and a ``"header"`` entry
            whose ``"annotations"`` value is a sequence of 3-item records
            with the time first and the description last.
        fileList: When truthy no rows are collected and an empty frame is
            returned.

    Returns:
        pandas.DataFrame with the columns ``set``, ``filepath``, ``category``,
        ``sample_name``, ``start_time``, ``stop_time`` and ``duration``
        (``stop_time - start_time``).

    Raises:
        ValueError: If the number of swallow annotations is odd, i.e. the
            last start annotation has no partner.
    """
    swallow_annotations = [
        ann for ann in file["header"]["annotations"] if "S_" in ann[-1]
    ]

    # Pairing relies on strict start/stop alternation; an odd count used to
    # surface as a bare StopIteration from next() inside the loop.
    if len(swallow_annotations) % 2:
        raise ValueError(
            "Unpaired swallow annotation (no matching stop): "
            f"{swallow_annotations[-1]!r}"
        )

    rows = {"set": [], "filepath": [], "category": [], "sample_name": [], "start_time": [], "stop_time": []}

    if not fileList:
        filename = Path(file["filepath"]).name
        starts = swallow_annotations[0::2]
        stops = swallow_annotations[1::2]
        for (start_time, _, desc), (stop_time, _, _) in zip(starts, stops):
            rows["set"].append(1)
            rows["filepath"].append(filename)
            rows["category"].append(None)
            rows["sample_name"].append(desc)
            rows["start_time"].append(start_time)
            rows["stop_time"].append(stop_time)

    df = pd.DataFrame(rows)
    df['duration'] = df["stop_time"] - df["start_time"]

    return df

In [21]:
def create_general_df(file, fileList=False):
    """Build a long-format table of the samples inside every annotated interval.

    Annotation descriptions containing ``"_"`` are split on ``"_"``:

    * ``C_<category>_start`` sets the category attached to the intervals that
      follow; any other ``C_...`` annotation resets it to ``'-'``.
    * ``<marker>_<sample>_start`` opens an interval that ends at the first
      later annotation named ``<marker>_<sample>_stop``.

    For each interval every channel of ``file["signals"]`` is cropped to the
    interval and emitted as one row per sample.

    Args:
        file: Mapping with ``"filepath"``, ``"signals"`` (one sequence per
            channel), ``"signal_headers"`` (one dict per channel providing
            ``"sample_rate"`` and ``"label"``) and ``"header"``, whose
            ``"annotations"`` entry is a sequence of 3-item records with the
            time first and the description last.
        fileList: When truthy, intervals are still matched but no rows are
            produced.

    Returns:
        pandas.DataFrame with the columns ``set``, ``subject``, ``category``,
        ``sample_name``, ``start_time``, ``stop_time``, ``header``,
        ``data_label``, ``time``, ``signal`` and ``id``. ``data_label`` and
        ``id`` hold integer category codes.

    Raises:
        ValueError: If a start annotation has no matching stop annotation.
    """
    def crop_signals_array(start_time, stop_time, file):
        """Return ``[(header, (time_window, signal_window)), ...]`` per channel."""
        cropped_signals = []
        for channel, signal in enumerate(file["signals"]):
            sr = file['signal_headers'][channel]['sample_rate']
            start_idx = round(start_time * sr)
            # Inclusive stop sample, clipped to the channel length exactly as
            # slicing would clip it.
            stop_idx = min(round(stop_time * sr) + 1, len(signal))
            # Only the cropped window is generated instead of a full-length
            # time vector that is sliced afterwards; the values are the same.
            window_time = np.arange(start_idx, stop_idx) / sr
            cropped_signals.append((window_time, np.array(signal[start_idx: stop_idx])))
        # Kept as a plain list: channels with different rates or lengths give
        # windows of different sizes, which cannot be stacked into one array.
        return list(zip(file['signal_headers'], cropped_signals))

    general = [ann for ann in file["header"]["annotations"] if "_" in ann[-1]]

    rows = {"set": [], "subject": [], "category": [], "sample_name": [],
            "start_time": [], "stop_time": [],
            "header": [], "data_label": [], "time": [], "signal": []}

    # Same placeholder the loop uses once a category is closed; it also covers
    # intervals that start before any C_ annotation was seen.
    cat = '-'

    for i, (ann_time, _, desc) in enumerate(general):
        s = desc.split("_")

        if s[0] == "C":
            if s[-1] == "start":
                _, cat, _ = s
            else:
                cat = '-'
            continue

        m, sample, event = s
        if event != "start":
            continue

        start_time = ann_time
        stop_label = f"{m}_{sample}_stop"
        stop_row = next((row for row in general[i:] if row[-1] == stop_label), None)
        if stop_row is None:
            raise ValueError(
                f"Annotation {desc!r} at {start_time} has no matching {stop_label!r}"
            )
        stop_time, _, _ = stop_row

        if fileList:
            continue

        subject = Path(file["filepath"]).stem
        for h, (time_window, signal_window) in crop_signals_array(start_time, stop_time, file):
            rows["set"].append(1)
            rows["subject"].append(subject)
            rows["category"].append(cat)
            rows["sample_name"].append(sample)
            rows["start_time"].append(start_time)
            rows["stop_time"].append(stop_time)
            rows["header"].append(h)
            rows["data_label"].append(h['label'])
            rows["time"].append(time_window)
            rows["signal"].append(signal_window)

    df = pd.DataFrame(rows)

    # One row per sample: the per-interval arrays are unrolled in lockstep.
    df = df.explode(['time', 'signal']).reset_index(drop=True)

    df["id"] = (df["subject"] + df["category"] + df["sample_name"]).astype("category")
    df["data_label"] = df["data_label"].astype("category")

    df["time"] = df["time"].astype(float)
    df["signal"] = df["signal"].astype(float)

    # Replace the categorical columns by their integer codes.
    cat_columns = df.select_dtypes(['category']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

    return df

In [53]:
# Pick one loaded recording for the cells below.
# NOTE(review): the index is hard-coded, so the recording it selects depends on
# the order in which read_files_from_dir returned the files.
file = files[4]

In [54]:
from annotations_validation import check_T_annotations, check_C_annotations

def check_annotations(file):
    """Validate the annotation list of a loaded recording.

    The checks run in this order:

    1. Every description must match ``\\w_\\w+_(start|stop)`` from its first
       character. Offending entries are printed with their position.
    2. The annotation list must not be empty.
    3. The annotations containing ``"T_"`` and those containing ``"C_"`` are
       handed to ``check_T_annotations`` and ``check_C_annotations``.

    Args:
        file: Mapping whose ``["header"]["annotations"]`` entry is a sequence
            of records with the description as last item.

    Returns:
        bool: ``True`` only when all checks pass, ``False`` otherwise.
    """
    annotations = file["header"]["annotations"]

    # NOTE: re.match anchors only at the start, so text following
    # "start"/"stop" is accepted.
    pattern_mismatches = [
        idx
        for idx, ann in enumerate(annotations)
        if not re.match(r"\w_\w+_(start|stop)", ann[-1])
    ]
    if pattern_mismatches:
        print(f"There are {len(pattern_mismatches)} annotations out of the pattern.")
        for idx in pattern_mismatches:
            print(idx, annotations[idx])
        return False

    if annotations == []:
        print("There are no annotations in the file.")
        return False

    t_annotations = [ann for ann in annotations if "T_" in ann[-1]]
    c_annotations = [ann for ann in annotations if "C_" in ann[-1]]
    # Explicit bool so a failed T/C check yields False rather than None;
    # `and` keeps the short-circuit (C is only checked when T passes).
    return bool(check_T_annotations(t_annotations) and check_C_annotations(c_annotations))
    

In [55]:
# Validate the selected recording's annotations; offending entries are printed.
check_annotations(file)

There are 44 annotations out of the pattern.
0 [2.8425, -1.0, 'Schlucken normal']
8 [8.5357, -1.0, 'stop']
9 [11.0007, -1.0, 'Schlucken normal']
16 [18.3997, -1.0, 'stop']
18 [20.3732, -1.0, 'Schlucken hoch']
19 [26.7485, -1.0, 'stop']
20 [28.8607, -1.0, 'Schlucken hoch']
21 [33.0822, -1.0, 'stop']
22 [34.8947, -1.0, 'Schlucken tief']
23 [40.2047, -1.0, 'stop']
24 [41.9152, -1.0, 'Schlucken tief']
25 [45.999, -1.0, 'stop']
26 [48.576, -1.0, 'Schlucken Knie re']
27 [56.2337, -1.0, 'stop']
28 [58.7792, -1.0, 'Schlucken Knie li']
29 [64.0495, -1.0, 'stop']
30 [67.543, -1.0, 'Mendelson']
31 [79.005, -1.0, 'stop']
32 [81.05, -1.0, 'Mendelson']
35 [86.7565, -1.0, 'stop']
36 [92.9237, -1.0, 'Sprechen']
37 [97.8187, -1.0, 'stop']
38 [100.56, -1.0, 'Zunge']
39 [106.7885, -1.0, 'stop']
40 [108.7557, -1.0, 'Z..hne']
41 [114.7772, -1.0, 'stop']
42 [118.2972, -1.0, 'Blick auf Knie re']
43 [123.7447, -1.0, 'stop']
44 [125.433, -1.0, 'Blick auf Knie li']
45 [128.539, -1.0, 'stop']
46 [134.1335, -1.0,

False

In [35]:
# Positions of the annotations whose description does not match the
# "<x>_<name>_(start|stop)" naming pattern.
pattern_mismatches = [
    position
    for position, annotation in enumerate(file["header"]["annotations"])
    if re.match(r"\w_\w+_(start|stop)", annotation[-1]) is None
]
pattern_mismatches

[0,
 8,
 9,
 16,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 55,
 56,
 59,
 60,
 61,
 62,
 63]

In [None]:
# Table of the swallow ("S_") intervals annotated in the selected recording.
swallows_df = create_swallows_df(file)
swallows_df

In [23]:
# Long-format table: one row per sample of every annotated interval and channel.
general_df = create_general_df(file)

# `data_label` holds integer category codes of the channel labels, so `== 2`
# keeps a single channel (which label maps to code 2 depends on the labels
# present in the recording).
# .copy() detaches the selection from `general_df`, so later column assignments
# on `final` act on an independent frame instead of a slice.
final = general_df.loc[general_df['data_label'] == 2, ['id', 'time', 'signal']].copy()
final

Unnamed: 0,id,time,signal
0,2,6.68700,-10.326908
1,2,6.68725,-10.325298
2,2,6.68750,-10.335906
3,2,6.68775,-10.338045
4,2,6.68800,-10.349182
...,...,...,...
153836,0,163.88000,-9.764163
153837,0,163.88025,-9.761517
153838,0,163.88050,-9.768949
153839,0,163.88075,-9.768949


## TSFresh tests

In [10]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [26]:
# Scale both columns by 1e6 and store them as 64-bit integers.
# - round() before the cast: a plain astype(int) truncates, so a product such as
#   6687249.999999 would become 6687249 instead of 6687250.
# - assign() builds a new frame instead of writing into what may be a slice of
#   `general_df`.
# NOTE: re-running this cell scales the values again; re-run the cell that
# builds `final` first.
final = final.assign(
    time=(final['time'] * 1e6).round().astype('int64'),
    signal=(final['signal'] * 1e6).round().astype('int64'),
)

In [27]:
# Confirm the column dtypes after the integer conversion.
final.dtypes

id         int8
time      int64
signal    int64
dtype: object

In [33]:
# Feature-calculator settings handed to tsfresh below.
extraction_settings = ComprehensiveFCParameters()

# Disabled: drop the 'sample_entropy' calculation from the extraction settings.
# extraction_settings.pop('sample_entropy', None)

# Extract features from the first 15000 rows of `final`, grouped by 'id' and
# ordered by 'time'. The settings object is passed unmodified, because the
# pop() above is commented out.
X = extract_features(final.head(15000), column_id='id', column_sort='time',
                     # Disabled: explicit kind/value columns.
                     #column_kind='data_label', column_value='signal',
                     default_fc_parameters=extraction_settings,
                     impute_function=impute
                     )

Feature Extraction:   0%|          | 0/3 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [34]:
# Write the extracted feature table to an Excel workbook.
# NOTE(review): `X` is only defined once the extraction cell has finished; if
# that cell was interrupted, `X` is undefined or left over from an earlier run.
X.to_excel('data/xlsx/bewegungs_edited-data-label-2.xlsx')