In [1]:
!pip install pyEDFlib
!pip install mne

Defaulting to user installation because normal site-packages is not writeable
Collecting pyEDFlib
  Obtaining dependency information for pyEDFlib from https://files.pythonhosted.org/packages/c9/21/db3a777e23cf160f9dc20f20a4a09b6414698b3df94977b75e4078da942f/pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl.metadata
  Downloading pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Downloading pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
    --------------------------------------- 0.0/2.2 MB 495.5 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/2.2 MB 1.0 MB/s eta 0:00:03
   ---- ----------------------------------- 0.2/2.2 MB 1.4 MB/s eta 0:00:02
   -------- ------------------------------- 0.5/2.2 MB 2.5 MB/s eta 0:00:01
   ------------------ --------------------- 1.0/2.2 MB 4.1 MB/s eta 0:00:01
   -------------------------------



In [1]:
import numpy as np
import pandas as pd
import pyedflib

In [2]:
def read_dataset(file_path):
    edf_file = pyedflib.EdfReader(file_path)
    n_signals = edf_file.signals_in_file
    signal_labels = edf_file.getSignalLabels()
    signals = [edf_file.readSignal(i) for i in range(n_signals)]
    edf_file.close()
    print('Сигналов обнаружено: ', n_signals)
    return signal_labels, signals

In [3]:
def read_txt_markers(file_path):
    with open(file_path, 'r') as file:
        labels = file.read().splitlines()
    return labels

def convert_to_sec(time: str):
    s = list(map(int, time.split(':')))
    return s[0]*3600 + s[1] * 60 + s[2]

In [4]:
def get_markered_dataset(data_file_path, labels_file_path):
    signal_labels, signals = read_dataset(data_file_path)
    signals = np.array(signals)
    data = pd.DataFrame(signals).T.rename(columns={i: signal_labels[i] for i in range(len(signal_labels))})
    
    markers = read_txt_markers(labels_file_path)[1:]
    markers = [line.split('\t') for line in markers]
    markers = [[line[0], convert_to_sec(line[1])*400, line[2]] for line in markers]
    
    markers_df = pd.DataFrame(markers, columns=['id', 'time', 'marker'])
    markers_df['time'] = markers_df['time'].astype(int)

    data['target'] = None

    for i in range(0, len(markers_df), 2):
        start_marker = markers_df.iloc[i]
        end_marker = markers_df.iloc[i + 1]
        if start_marker['marker'].startswith('ds'):
            mask = list(range(start_marker.time, end_marker.time))
            data.loc[mask, 'target'] = 'ds'
        elif start_marker['marker'].startswith('is'):
            mask = list(range(start_marker.time, end_marker.time))
            data.loc[mask, 'target'] = 'is'
        elif start_marker['marker'].startswith('swd'):
            mask = list(range(start_marker.time, end_marker.time))
            data.loc[mask, 'target'] = 'swd'
    percentage_marked = data[data['target'].notna()].shape[0]/data.shape[0]
    print(f'Размечено {round(percentage_marked, 2)} данных')
    return data

In [5]:
def load_marked_dataset(file, folder='ECoG_fully_marked_(4+2 files, 6 h each)', base_path=r"C:/Users/Артём/PycharmProjects/International_hack/official_data"):
    dataset_file_path = fr"{base_path}/{folder}/{file}.edf"
    markers_file_path = fr"{base_path}/{folder}/{file}.txt"
    dataset = get_markered_dataset(dataset_file_path, markers_file_path)
    return dataset

In [6]:
def add_features(data, window_sizes=[400]):
    # Определяем размеры скользящих окон
    for window in window_sizes:
#         print('Считаем Среднее')
        # Среднее значение
        data[f'mean_window_{window}_FrL'] = data['FrL'].rolling(window=window).mean()
        data[f'mean_window_{window}_FrR'] = data['FrR'].rolling(window=window).mean()
        data[f'mean_window_{window}_OcR'] = data['OcR'].rolling(window=window).mean()
        # Среднеквадратичная ошибка
#         print('Считаем ошибку')
#         data[f'rmse_window_{window}_FrL'] = data['FrL'].rolling(window=window).apply(lambda x: np.sqrt(np.mean((x - x.mean())**2)), raw=False)
#         data[f'rmse_window_{window}_FrR'] = data['FrR'].rolling(window=window).apply(lambda x: np.sqrt(np.mean((x - x.mean())**2)), raw=False)
#         data[f'rmse_window_{window}_OcR'] = data['OcR'].rolling(window=window).apply(lambda x: np.sqrt(np.mean((x - x.mean())**2)), raw=False)
        # Минимум и максимум
#         print('Считаем МинМакс')
        data[f'min_window_{window}_FrL'] = data['FrL'].rolling(window=window).min()
        data[f'max_window_{window}_FrL'] = data['FrL'].rolling(window=window).max()
        data[f'min_window_{window}_FrR'] = data['FrR'].rolling(window=window).min()
        data[f'max_window_{window}_FrR'] = data['FrR'].rolling(window=window).max()
        data[f'min_window_{window}_OcR'] = data['OcR'].rolling(window=window).min()
        data[f'max_window_{window}_OcR'] = data['OcR'].rolling(window=window).max()
        # Разница между первым и последним значением в окне
#         print('Считаем разницу первого и последнего')
#         data[f'diff_window_{window}_FrL'] = data['FrL'].rolling(window=window).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)
#         data[f'diff_window_{window}_FrR'] = data['FrR'].rolling(window=window).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)
#         data[f'diff_window_{window}_OcR'] = data['OcR'].rolling(window=window).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)
        # Корреляция между рядами
#         print('Считаем Корееляцию')
        data[f'corr_window_{window}_FrL_FrR'] = data['FrL'].rolling(window=window).corr(data['FrR'])
        data[f'corr_window_{window}_FrL_OcR'] = data['FrL'].rolling(window=window).corr(data['OcR'])
        data[f'corr_window_{window}_FrR_OcR'] = data['FrR'].rolling(window=window).corr(data['OcR'])
    return data

In [8]:
dataset_names = ['Ati4x1_15m_BL_6h', 
                 'Ati4x1_15m_Dex003(Pharm!)_6h', 
                 'Ati4x1_15m_H2O_6h', 
                 'Ati4x3_9m_Xyl01(Pharm!)_6h', 
                 'Ati4x3_12m_BL_6h',
                 'Ati4x6_14m_BL_6h']

for dataset_name in dataset_names:
    print(f'loading dataset {dataset_name}')
    dataset = load_marked_dataset(dataset_name)
    dataset = dataset[dataset['target'].notna()]
    dataset = add_features(dataset, window_sizes=[100, 400, 600])
    dataset['target'] = dataset['target'].replace({'ds': 0, 'is': 1, 'swd': 2})
    dataset.to_csv(f'preprocessed_data/{dataset_name}.csv')

# print('Connecting datasets')
# all_datas = pd.concat(dataset_list, ignore_index=True)
# all_datas['target'] = all_datas['target'].replace({'ds': 0, 'is': 1, 'swd': 2})
# all_datas

loading dataset Ati4x1_15m_BL_6h
Сигналов обнаружено:  3
Размечено 0.42 данных
loading dataset Ati4x1_15m_Dex003(Pharm!)_6h
Сигналов обнаружено:  3
Размечено 0.25 данных
loading dataset Ati4x1_15m_H2O_6h
Сигналов обнаружено:  3
Размечено 0.22 данных
loading dataset Ati4x3_9m_Xyl01(Pharm!)_6h
Сигналов обнаружено:  3
Размечено 0.59 данных


OSError: [Errno 28] No space left on device

In [23]:
dataset = load_marked_dataset('Ati4x3_9m_Xyl01(Pharm!)_6h')
dataset = dataset[dataset['target'].notna()]
dataset = add_features(dataset, window_sizes=[100, 400, 600])
dataset.columns

Сигналов обнаружено:  3
Размечено 0.59 данных
Считаем Среднее
Считаем МинМакс
Считаем Корееляцию
Считаем Среднее
Считаем МинМакс
Считаем Корееляцию


Index(['FrL', 'FrR', 'OcR', 'target', 'mean_window_200_FrL',
       'mean_window_200_FrR', 'mean_window_200_OcR', 'min_window_200_FrL',
       'max_window_200_FrL', 'min_window_200_FrR', 'max_window_200_FrR',
       'min_window_200_OcR', 'max_window_200_OcR', 'corr_window_200_FrL_FrR',
       'corr_window_200_FrL_OcR', 'corr_window_200_FrR_OcR',
       'mean_window_400_FrL', 'mean_window_400_FrR', 'mean_window_400_OcR',
       'min_window_400_FrL', 'max_window_400_FrL', 'min_window_400_FrR',
       'max_window_400_FrR', 'min_window_400_OcR', 'max_window_400_OcR',
       'corr_window_400_FrL_FrR', 'corr_window_400_FrL_OcR',
       'corr_window_400_FrR_OcR'],
      dtype='object')

In [24]:
dataset.head

Unnamed: 0,FrL,FrR,OcR,target,mean_window_200_FrL,mean_window_200_FrR,mean_window_200_OcR,min_window_200_FrL,max_window_200_FrL,min_window_200_FrR,...,mean_window_400_OcR,min_window_400_FrL,max_window_400_FrL,min_window_400_FrR,max_window_400_FrR,min_window_400_OcR,max_window_400_OcR,corr_window_400_FrL_FrR,corr_window_400_FrL_OcR,corr_window_400_FrR_OcR
26000,0.006125,-0.155688,0.045062,swd,,,,,,,...,,,,,,,,,,
26001,-0.013750,-0.134125,-0.017438,swd,,,,,,,...,,,,,,,,,,
26002,-0.055000,-0.136937,-0.057938,swd,,,,,,,...,,,,,,,,,,
26003,-0.091125,-0.132625,-0.083000,swd,,,,,,,...,,,,,,,,,,
26004,-0.128062,-0.134188,-0.090250,swd,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8639195,-0.059438,-0.052625,-0.074625,swd,-0.043756,-0.033908,-0.011162,-0.147125,0.114187,-0.155188,...,-0.015842,-0.147125,0.114187,-0.155188,0.177875,-0.135938,0.12825,0.357609,0.329484,0.356874
8639196,-0.049563,-0.054312,-0.111813,swd,-0.043965,-0.034220,-0.011478,-0.147125,0.114187,-0.155188,...,-0.016013,-0.147125,0.114187,-0.155188,0.177875,-0.135938,0.12825,0.354045,0.327556,0.356743
8639197,-0.028438,-0.044062,-0.088938,swd,-0.043927,-0.034452,-0.011628,-0.147125,0.114187,-0.155188,...,-0.016024,-0.147125,0.114187,-0.155188,0.177875,-0.135938,0.12825,0.349437,0.321592,0.353117
8639198,-0.049313,-0.049688,-0.061062,swd,-0.043783,-0.034804,-0.011747,-0.147125,0.114187,-0.155188,...,-0.015916,-0.147125,0.114187,-0.155188,0.177875,-0.135938,0.12825,0.344477,0.316473,0.347714


In [None]:
updated_dataset.columns