In [1]:
# Install into the environment of the running kernel (%pip rather than !pip,
# which may resolve to a different Python installation).
# pyEDFlib is pinned to the release shown in this notebook's install log.
%pip install -q pyEDFlib==0.1.38
%pip install -q mne

Defaulting to user installation because normal site-packages is not writeable
Collecting pyEDFlib
  Obtaining dependency information for pyEDFlib from https://files.pythonhosted.org/packages/c9/21/db3a777e23cf160f9dc20f20a4a09b6414698b3df94977b75e4078da942f/pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl.metadata
  Downloading pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Downloading pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
    --------------------------------------- 0.0/2.2 MB 495.5 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/2.2 MB 1.0 MB/s eta 0:00:03
   ---- ----------------------------------- 0.2/2.2 MB 1.4 MB/s eta 0:00:02
   -------- ------------------------------- 0.5/2.2 MB 2.5 MB/s eta 0:00:01
   ------------------ --------------------- 1.0/2.2 MB 4.1 MB/s eta 0:00:01
   -------------------------------



In [3]:
import numpy as np
import pandas as pd
import pyedflib

In [4]:
def read_dataset(file_path):
    """Read every signal stored in an EDF file.

    Parameters
    ----------
    file_path : str
        Path handed to ``pyedflib.EdfReader``.

    Returns
    -------
    tuple
        ``(signal_labels, signals)``: the value returned by
        ``getSignalLabels()`` and a list with the result of ``readSignal(i)``
        for each signal index, in file order.

    Notes
    -----
    Prints the number of signals found. The reader is closed even when
    reading fails, so the file handle is never leaked.
    """
    edf_file = pyedflib.EdfReader(file_path)
    try:
        n_signals = edf_file.signals_in_file
        signal_labels = edf_file.getSignalLabels()
        signals = [edf_file.readSignal(i) for i in range(n_signals)]
    finally:
        # Release the file handle no matter how the reads above ended.
        edf_file.close()

    print('Сигналов обнаружено: ', n_signals)
    return signal_labels, signals

In [5]:
def read_txt_markers(file_path):
    """Return the lines of a text file as a list, without line terminators."""
    with open(file_path, 'r') as marker_file:
        raw_text = marker_file.read()
    return raw_text.splitlines()

def convert_to_sec(time: str):
    """Convert a colon-separated ``H:M:S`` string into a number of seconds.

    Only the first three integer fields are used.
    """
    fields = [int(part) for part in time.split(':')]
    hours, minutes, seconds = fields[0], fields[1], fields[2]
    return hours * 3600 + minutes * 60 + seconds

In [6]:
def get_markered_dataset(data_file_path, labels_file_path, sampling_rate=400):
    """Load a recording and label its samples from a marker file.

    Parameters
    ----------
    data_file_path : str
        Path passed to ``read_dataset``.
    labels_file_path : str
        Path passed to ``read_txt_markers``. The first line is skipped; every
        following non-blank line is split on tabs into ``id``, ``time``
        (``H:M:S``) and ``marker``.
    sampling_rate : int, optional
        Samples per second used to turn marker times into row positions.
        Defaults to 400, the value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per signal (named by its label) plus a ``target`` column
        holding ``'ds'``, ``'is'``, ``'swd'`` or ``None`` for unlabelled rows.

    Notes
    -----
    Markers are consumed in consecutive (start, end) pairs and the label is
    taken from the prefix of the start marker. Rows from the start position
    up to, but not including, the end position are labelled. Intervals that
    run past the end of the recording are clipped, and a trailing marker
    without a partner is reported and ignored.
    """
    signal_labels, signals = read_dataset(data_file_path)
    signals = np.array(signals)
    data = pd.DataFrame(signals).T.rename(columns=dict(enumerate(signal_labels)))

    # Skip the first line and any blank lines (e.g. a trailing empty line).
    marker_rows = [
        line.split('\t')
        for line in read_txt_markers(labels_file_path)[1:]
        if line.strip()
    ]
    # (row position, marker name) for every marker, in file order.
    markers = [
        (int(convert_to_sec(row[1]) * sampling_rate), row[2])
        for row in marker_rows
    ]
    if len(markers) % 2:
        print(f'Warning: marker {markers[-1][1]!r} has no closing marker and is ignored')

    data['target'] = None
    target_position = data.columns.get_loc('target')

    for (start, start_name), (end, _) in zip(markers[0::2], markers[1::2]):
        for label in ('ds', 'is', 'swd'):
            if start_name.startswith(label):
                # Positional slice: clips at the end of the recording
                # instead of raising on out-of-range rows.
                data.iloc[start:end, target_position] = label
                break

    total_rows = len(data)
    marked_rows = int(data['target'].notna().sum())
    percentage_marked = marked_rows / total_rows if total_rows else 0.0
    print(f'Размечено {round(percentage_marked, 2)} данных')
    return data

In [7]:
def load_marked_dataset(file, folder='ECoG_fully_marked_(4+2 files, 6 h each)', base_path=r"C:/Users/Артём/PycharmProjects/International_hack/data"):
    """Load a recording and its markers by base name.

    Builds ``<base_path>/<folder>/<file>.edf`` and the matching ``.txt`` path
    and passes both to ``get_markered_dataset``, returning its result.
    """
    recording_stem = fr"{base_path}/{folder}/{file}"
    return get_markered_dataset(f"{recording_stem}.edf", f"{recording_stem}.txt")

In [8]:
def add_features(data, window_sizes=(400,), channels=('FrL', 'FrR', 'OcR')):
    """Add rolling-window statistics for each channel to ``data``.

    For every window size the following columns are added, in this order:

    * ``mean_window_<w>_<ch>`` for each channel,
    * ``min_window_<w>_<ch>`` and ``max_window_<w>_<ch>`` for each channel,
    * ``corr_window_<w>_<a>_<b>`` for each unordered channel pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing one column per entry of ``channels``.
    window_sizes : iterable of int, optional
        Rolling window lengths in rows. Defaults to ``(400,)``.
    channels : sequence of str, optional
        Column names to compute features for. Defaults to the three channel
        names this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, which is modified in place. Pass a copy if
        the original frame must stay untouched, or if ``data`` is a filtered
        slice of another frame.

    Notes
    -----
    Rows where a window is not yet full are NaN (pandas' default
    ``min_periods``). Rolling RMSE and first/last difference features were
    previously present only as commented-out code and are not computed.
    """
    channels = list(channels)
    for window in window_sizes:
        # Build each rolling view once and reuse it for all statistics.
        rolling = {name: data[name].rolling(window=window) for name in channels}

        # Mean value
        for name in channels:
            data[f'mean_window_{window}_{name}'] = rolling[name].mean()

        # Minimum and maximum
        for name in channels:
            data[f'min_window_{window}_{name}'] = rolling[name].min()
            data[f'max_window_{window}_{name}'] = rolling[name].max()

        # Correlation between each pair of channels
        for position, first in enumerate(channels):
            for second in channels[position + 1:]:
                data[f'corr_window_{window}_{first}_{second}'] = rolling[first].corr(data[second])
    return data

In [9]:
def save_in_blocks(dataframe, num_blocks, output_dir="output_blocks", base_filename="dataset_block"):
    """Write a DataFrame to ``num_blocks`` consecutive CSV files.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to split by row position.
    num_blocks : int
        Number of files to write; must be at least 1.
    output_dir : str, optional
        Directory for the files; created if it does not exist.
    base_filename : str, optional
        Files are named ``<base_filename>_<k>.csv`` with ``k`` starting at 1.

    Raises
    ------
    ValueError
        If ``num_blocks`` is smaller than 1.

    Notes
    -----
    Each block holds ``len(dataframe) // num_blocks`` rows; the last block
    also receives the remainder. The index is not written. If ``num_blocks``
    exceeds the number of rows, all blocks but the last are empty.
    """
    import os

    if num_blocks < 1:
        raise ValueError(f"num_blocks must be a positive integer, got {num_blocks!r}")

    # Create the output directory if it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    total_rows = len(dataframe)
    block_size = total_rows // num_blocks
    for i in range(num_blocks):
        start_row = i * block_size
        # The final block absorbs the rows left over by the integer division.
        end_row = start_row + block_size if i < num_blocks - 1 else total_rows
        block = dataframe.iloc[start_row:end_row]
        filename = os.path.join(output_dir, f"{base_filename}_{i+1}.csv")
        block.to_csv(filename, index=False)
        print(f"Сохранен блок {i+1}: {filename}")

In [None]:
dataset_names = ['Ati4x1_15m_BL_6h',
                 'Ati4x1_15m_Dex003(Pharm!)_6h',
                 'Ati4x1_15m_H2O_6h',
                 'Ati4x3_9m_Xyl01(Pharm!)_6h',
                 'Ati4x3_12m_BL_6h',
                 'Ati4x6_14m_BL_6h']

# Sampling configuration for the class-balanced training set.
SAMPLES_PER_CLASS = 5000  # rows drawn per class and recording (fewer if the class is smaller)
WARMUP_ROWS = 600         # leading rows of each class subset that are skipped; equals the largest window
RANDOM_SEED = 42          # fixed seed so the sample is reproducible across runs
TARGET_CODES = {'ds': 0, 'is': 1, 'swd': 2}

dataset_list = []
for dataset_name in dataset_names:
    print(f'loading dataset {dataset_name}')
    dataset = load_marked_dataset(dataset_name)
    # Keep labelled rows only; .copy() gives add_features an independent
    # frame to write into instead of a slice of the loaded one.
    dataset = dataset[dataset['target'].notna()].copy()
    dataset = add_features(dataset, window_sizes=[100, 400, 600])
    dataset['target'] = dataset['target'].map(TARGET_CODES)

    class_samples = []
    for class_id in dataset['target'].unique():
        class_rows = dataset[dataset['target'] == class_id].iloc[WARMUP_ROWS:]
        # Never request more rows than the class has, which would raise.
        sample_size = min(SAMPLES_PER_CLASS, len(class_rows))
        class_samples.append(class_rows.sample(n=sample_size, random_state=RANDOM_SEED))
    dataset_list.append(pd.concat(class_samples))

# Targets are already numeric here, so no second encoding pass is needed.
all_datas = pd.concat(dataset_list, ignore_index=True)
all_datas

In [10]:
dataset_name = 'Ati4x3_9m_Xyl01(Pharm!)_6h'
dataset = load_marked_dataset(dataset_name, base_path=r'D:\Programming\international_hack_data\data')
# Keep labelled rows only; .copy() gives add_features an independent frame
# to write into instead of a slice of the loaded one.
dataset = dataset[dataset['target'].notna()].copy()
dataset = add_features(dataset, window_sizes=[100, 400, 600])
# map() yields an integer column directly, without relying on replace()
# downcasting an object column.
dataset['target'] = dataset['target'].map({'ds': 0, 'is': 1, 'swd': 2})

# Optional export of the prepared frame in CSV blocks (raw string keeps the
# Windows backslashes literal):
# save_in_blocks(dataset,
#                num_blocks=10,
#                output_dir=r"D:\Programming\international_hack_data\preprocessed_data",
#                base_filename=dataset_name)

Сигналов обнаружено:  3
Размечено 0.59 данных


In [11]:
# Preview the first rows. The rolling-window feature columns are NaN here
# because the windows (100/400/600 rows) are not yet full at the start.
dataset.head()

Unnamed: 0,FrL,FrR,OcR,target,mean_window_100_FrL,mean_window_100_FrR,mean_window_100_OcR,min_window_100_FrL,max_window_100_FrL,min_window_100_FrR,...,mean_window_600_OcR,min_window_600_FrL,max_window_600_FrL,min_window_600_FrR,max_window_600_FrR,min_window_600_OcR,max_window_600_OcR,corr_window_600_FrL_FrR,corr_window_600_FrL_OcR,corr_window_600_FrR_OcR
26000,0.006125,-0.155688,0.045062,2,,,,,,,...,,,,,,,,,,
26001,-0.01375,-0.134125,-0.017438,2,,,,,,,...,,,,,,,,,,
26002,-0.055,-0.136937,-0.057938,2,,,,,,,...,,,,,,,,,,
26003,-0.091125,-0.132625,-0.083,2,,,,,,,...,,,,,,,,,,
26004,-0.128062,-0.134188,-0.09025,2,,,,,,,...,,,,,,,,,,


In [12]:
# Class-balanced sample: for every class, skip the first 600 rows of its
# subset and draw 5000 random rows. No random_state is passed, so the
# selected rows differ between runs.
pd.concat([dataset[dataset.target == i][600:].sample(n=5000) for i in dataset.target.unique()])

Unnamed: 0,FrL,FrR,OcR,target,mean_window_100_FrL,mean_window_100_FrR,mean_window_100_OcR,min_window_100_FrL,max_window_100_FrL,min_window_100_FrR,...,mean_window_600_OcR,min_window_600_FrL,max_window_600_FrL,min_window_600_FrR,max_window_600_FrR,min_window_600_OcR,max_window_600_OcR,corr_window_600_FrL_FrR,corr_window_600_FrL_OcR,corr_window_600_FrR_OcR
5672648,-0.015187,-0.014688,-0.011687,2,-0.010337,0.017248,-0.053663,-0.162812,0.147125,-0.113375,...,-0.002431,-0.338000,0.368125,-0.593750,0.380250,-0.184000,0.215125,0.365518,0.211697,-0.097087
5730208,-0.204063,-0.285625,-0.190812,2,-0.061746,-0.037958,-0.045779,-0.237937,0.452313,-0.285625,...,-0.050666,-0.349313,0.559125,-0.343062,0.426000,-0.253937,0.231500,0.568433,-0.023692,0.191577
6060093,0.022125,-0.022437,0.109188,2,0.019114,-0.033648,0.022587,-0.139625,0.166750,-0.162000,...,0.025532,-0.185500,0.250437,-0.167563,0.208187,-0.249625,0.253625,0.139837,-0.014282,0.530305
5929504,0.145563,0.078438,0.001187,2,0.120156,0.088756,0.110071,-0.011125,0.321250,-0.062750,...,0.083796,-0.224125,0.507500,-0.298812,0.393000,-0.160750,0.573000,0.474374,0.201235,0.340174
8202624,-0.028313,0.095750,-0.026125,2,-0.049374,-0.007486,-0.058437,-0.199438,0.051688,-0.144437,...,-0.028072,-0.205187,0.146563,-0.193750,0.175437,-0.193812,0.119688,0.614604,0.373393,0.376673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4479813,-0.073313,0.017375,0.131938,0,0.112491,0.171654,0.066942,-0.073313,0.405000,0.016625,...,0.034092,-0.231688,0.405000,-0.225062,0.501750,-0.276938,0.342750,0.491510,0.071691,-0.017076
1836126,0.247938,0.118688,0.098437,0,0.014748,-0.056419,-0.021817,-0.216750,0.373000,-0.171187,...,0.022191,-0.216750,0.373000,-0.173563,0.321250,-0.204750,0.190438,0.515406,0.357132,0.664805
2082273,-0.052125,-0.268625,-0.072063,0,-0.147189,-0.197955,-0.062955,-0.427250,-0.007875,-0.277125,...,-0.021783,-0.428125,0.313688,-0.303687,0.469000,-0.262250,0.238813,0.722396,0.386595,0.252452
2141176,-0.040375,-0.018437,-0.027000,0,-0.012223,-0.053334,0.011259,-0.250063,0.126500,-0.171813,...,0.040663,-0.250063,0.361375,-0.171813,0.262063,-0.104688,0.275125,0.566747,0.287443,0.506894


In [13]:
# Encoded classes present in this recording
# (encoding used in this notebook: 'ds' -> 0, 'is' -> 1, 'swd' -> 2).
dataset.target.unique()

array([2, 0], dtype=int64)