In [1]:
!pip install pyEDFlib
!pip install mne

Defaulting to user installation because normal site-packages is not writeable
Collecting pyEDFlib
  Obtaining dependency information for pyEDFlib from https://files.pythonhosted.org/packages/c9/21/db3a777e23cf160f9dc20f20a4a09b6414698b3df94977b75e4078da942f/pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl.metadata
  Downloading pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Downloading pyEDFlib-0.1.38-cp311-cp311-win_amd64.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
    --------------------------------------- 0.0/2.2 MB 495.5 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/2.2 MB 1.0 MB/s eta 0:00:03
   ---- ----------------------------------- 0.2/2.2 MB 1.4 MB/s eta 0:00:02
   -------- ------------------------------- 0.5/2.2 MB 2.5 MB/s eta 0:00:01
   ------------------ --------------------- 1.0/2.2 MB 4.1 MB/s eta 0:00:01
   -------------------------------



In [3]:
import numpy as np
import pandas as pd
import pyedflib

In [4]:
def read_dataset(file_path):
    edf_file = pyedflib.EdfReader(file_path)
    n_signals = edf_file.signals_in_file
    signal_labels = edf_file.getSignalLabels()
    signals = [edf_file.readSignal(i) for i in range(n_signals)]
    edf_file.close()
    
    print('Сигналов обнаружено: ', n_signals)
    return signal_labels, signals

In [5]:
def read_txt_markers(file_path):
    with open(file_path, 'r') as file:
        labels = file.read().splitlines()
    return labels

def convert_to_sec(time: str):
    s = list(map(int, time.split(':')))
    return s[0]*3600 + s[1] * 60 + s[2]

In [6]:
def get_markered_dataset(data_file_path, labels_file_path):
    signal_labels, signals = read_dataset(data_file_path)
    signals = np.array(signals)
    data = pd.DataFrame(signals).T.rename(columns={i: signal_labels[i] for i in range(len(signal_labels))})
    
    markers = read_txt_markers(labels_file_path)[1:]
    markers = [line.split('\t') for line in markers]
    markers = [[line[0], convert_to_sec(line[1])*400, line[2]] for line in markers]
    
    markers_df = pd.DataFrame(markers, columns=['id', 'time', 'marker'])
    markers_df['time'] = markers_df['time'].astype(int)

    data['target'] = None

    for i in range(0, len(markers_df), 2):
        start_marker = markers_df.iloc[i]
        end_marker = markers_df.iloc[i + 1]
        if start_marker['marker'].startswith('ds'):
            mask = list(range(start_marker.time, end_marker.time))
            data.loc[mask, 'target'] = 'ds'
        elif start_marker['marker'].startswith('is'):
            mask = list(range(start_marker.time, end_marker.time))
            data.loc[mask, 'target'] = 'is'
        elif start_marker['marker'].startswith('swd'):
            mask = list(range(start_marker.time, end_marker.time))
            data.loc[mask, 'target'] = 'swd'
    percentage_marked = data[data['target'].notna()].shape[0]/data.shape[0]
    print(f'Размечено {round(percentage_marked, 2)} данных')
    return data

In [7]:
def load_marked_dataset(file, folder='ECoG_fully_marked_(4+2 files, 6 h each)', base_path=r"C:/Users/Артём/PycharmProjects/International_hack/data"):
    dataset_file_path = fr"{base_path}/{folder}/{file}.edf"
    markers_file_path = fr"{base_path}/{folder}/{file}.txt"
    dataset = get_markered_dataset(dataset_file_path, markers_file_path)
    return dataset

In [8]:
def add_features(data, window_sizes=[400]):
    # Определяем размеры скользящих окон
    for window in window_sizes:
#         print('Считаем Среднее')
        # Среднее значение
        data[f'mean_window_{window}_FrL'] = data['FrL'].rolling(window=window).mean()
        data[f'mean_window_{window}_FrR'] = data['FrR'].rolling(window=window).mean()
        data[f'mean_window_{window}_OcR'] = data['OcR'].rolling(window=window).mean()
        # Среднеквадратичная ошибка
#         print('Считаем ошибку')
#         data[f'rmse_window_{window}_FrL'] = data['FrL'].rolling(window=window).apply(lambda x: np.sqrt(np.mean((x - x.mean())**2)), raw=False)
#         data[f'rmse_window_{window}_FrR'] = data['FrR'].rolling(window=window).apply(lambda x: np.sqrt(np.mean((x - x.mean())**2)), raw=False)
#         data[f'rmse_window_{window}_OcR'] = data['OcR'].rolling(window=window).apply(lambda x: np.sqrt(np.mean((x - x.mean())**2)), raw=False)
        # Минимум и максимум
#         print('Считаем МинМакс')
        data[f'min_window_{window}_FrL'] = data['FrL'].rolling(window=window).min()
        data[f'max_window_{window}_FrL'] = data['FrL'].rolling(window=window).max()
        data[f'min_window_{window}_FrR'] = data['FrR'].rolling(window=window).min()
        data[f'max_window_{window}_FrR'] = data['FrR'].rolling(window=window).max()
        data[f'min_window_{window}_OcR'] = data['OcR'].rolling(window=window).min()
        data[f'max_window_{window}_OcR'] = data['OcR'].rolling(window=window).max()
        # Разница между первым и последним значением в окне
#         print('Считаем разницу первого и последнего')
#         data[f'diff_window_{window}_FrL'] = data['FrL'].rolling(window=window).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)
#         data[f'diff_window_{window}_FrR'] = data['FrR'].rolling(window=window).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)
#         data[f'diff_window_{window}_OcR'] = data['OcR'].rolling(window=window).apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)
        # Корреляция между рядами
#         print('Считаем Корееляцию')
        data[f'corr_window_{window}_FrL_FrR'] = data['FrL'].rolling(window=window).corr(data['FrR'])
        data[f'corr_window_{window}_FrL_OcR'] = data['FrL'].rolling(window=window).corr(data['OcR'])
        data[f'corr_window_{window}_FrR_OcR'] = data['FrR'].rolling(window=window).corr(data['OcR'])
    return data

In [9]:
# Функция для сохранения датасета в CSV файлами блоками
def save_in_blocks(dataframe, num_blocks, output_dir="output_blocks", base_filename="dataset_block"):
    # Создаем директорию для сохранения блоков, если она еще не существует
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    block_size = len(dataframe) // num_blocks
    for i in range(num_blocks):
        start_row = i * block_size
        end_row = start_row + block_size if i < num_blocks - 1 else len(dataframe)
        block = dataframe.iloc[start_row:end_row]
        filename = os.path.join(output_dir, f"{base_filename}_{i+1}.csv")
        block.to_csv(filename, index=False)
        print(f"Сохранен блок {i+1}: {filename}")

In [18]:
def extract_sequential_samples(df, target_col, sample_size=5000, exclude_value=None):
    df[target_col] = df[target_col].replace({exclude_value: pd.NA})
    df_filtered = df.dropna(subset=[target_col]).reset_index(drop=True)
    
    # Инициализация переменных
    result_df = pd.DataFrame()
    grouped_samples = {value: 0 for value in df_filtered[target_col].unique()}
    
    current_group = []
    current_value = df_filtered[target_col].iloc[0]
    
    # Проход по строкам для группировки последовательностей
    for index, value in enumerate(df_filtered[target_col]):
        if value == current_value:
            current_group.append(index)
        else:
            # Проверка, не превышен ли лимит строк для текущего класса
            if grouped_samples[current_value] < sample_size:
                selected_rows = current_group[:sample_size - grouped_samples[current_value]]
                result_df = pd.concat([result_df, df_filtered.loc[selected_rows]])
                grouped_samples[current_value] += len(selected_rows)
            
            # Обновляем текущую группу
            current_group = [index]
            current_value = value
    
    # Обработка последней группы
    if len(current_group) > 0 and grouped_samples[current_value] < sample_size:
        selected_rows = current_group[:sample_size - grouped_samples[current_value]]
        result_df = pd.concat([result_df, df_filtered.loc[selected_rows]])
        grouped_samples[current_value] += len(selected_rows)
    
    return result_df.reset_index(drop=True)

extract_sequential_samples(dataset[600:], target_col='target', sample_size=5000, exclude_value=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


Unnamed: 0,FrL,FrR,OcR,target,mean_window_100_FrL,mean_window_100_FrR,mean_window_100_OcR,min_window_100_FrL,max_window_100_FrL,min_window_100_FrR,...,mean_window_600_OcR,min_window_600_FrL,max_window_600_FrL,min_window_600_FrR,max_window_600_FrR,min_window_600_OcR,max_window_600_OcR,corr_window_600_FrL_FrR,corr_window_600_FrL_OcR,corr_window_600_FrR_OcR
0,0.143250,0.299312,0.062500,2,0.010785,0.046769,0.000298,-0.372125,0.843063,-0.388625,...,-0.012970,-0.372125,1.026250,-0.565063,0.964313,-0.356375,0.394062,0.782437,-0.045136,-0.026865
1,0.066687,0.213438,0.066750,2,0.013403,0.049398,0.000432,-0.372125,0.843063,-0.388625,...,-0.012830,-0.372125,1.026250,-0.565063,0.964313,-0.356375,0.394062,0.782204,-0.044916,-0.025772
2,0.017938,0.181875,0.085625,2,0.015637,0.051794,0.000924,-0.372125,0.843063,-0.388625,...,-0.012591,-0.372125,1.026250,-0.565063,0.964313,-0.356375,0.394062,0.781822,-0.045075,-0.024862
3,-0.036250,0.146937,0.090875,2,0.017581,0.053776,0.001529,-0.372125,0.843063,-0.388625,...,-0.012301,-0.372125,1.026250,-0.565063,0.964313,-0.356375,0.394062,0.781315,-0.045780,-0.024254
4,-0.068125,0.134438,0.101312,2,0.019387,0.055821,0.002237,-0.372125,0.843063,-0.388625,...,-0.011982,-0.372125,1.026250,-0.565063,0.964313,-0.356375,0.394062,0.780712,-0.046906,-0.023685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.059875,-0.029250,0.176937,0,0.092012,-0.081329,-0.023463,-0.059875,0.259250,-0.241375,...,-0.001692,-0.285375,0.495188,-0.339688,0.410062,-0.282438,0.370688,0.260864,0.383327,0.410472
9996,-0.054437,-0.038937,0.185250,0,0.091730,-0.081106,-0.021358,-0.059875,0.259250,-0.241375,...,-0.001203,-0.285375,0.495188,-0.339688,0.410062,-0.282438,0.370688,0.262117,0.382167,0.408023
9997,-0.069000,-0.083562,0.187000,0,0.091401,-0.081419,-0.019219,-0.069000,0.259250,-0.241375,...,-0.000733,-0.285375,0.495188,-0.339688,0.410062,-0.282438,0.370688,0.263890,0.380554,0.404560
9998,-0.048688,-0.092625,0.198063,0,0.091323,-0.081959,-0.017052,-0.069000,0.259250,-0.241375,...,-0.000220,-0.285375,0.495188,-0.339688,0.410062,-0.282438,0.370688,0.265546,0.379086,0.400371


In [20]:
dataset_names = ['Ati4x1_15m_BL_6h', 
                 'Ati4x1_15m_Dex003(Pharm!)_6h', 
                 'Ati4x1_15m_H2O_6h', 
                 'Ati4x3_9m_Xyl01(Pharm!)_6h', 
                 'Ati4x3_12m_BL_6h',
                 'Ati4x6_14m_BL_6h']

dataset_list = []
for dataset_name in dataset_names:
    print(f'loading dataset {dataset_name}')
    dataset = load_marked_dataset(dataset_name, base_path=r"D:\Programming\international_hack_data\data")
    dataset = dataset[dataset['target'].notna()]
#     dataset = add_features(dataset, window_sizes=[100, 400, 600])
    dataset['target'] = dataset['target'].replace({'ds': 0, 'is': 1, 'swd': 2})
    dataset_list.append(extract_sequential_samples(dataset[600:], target_col='target', sample_size=5000, exclude_value=None))
#     dataset.to_csv(f'preprocessed_data/{dataset_name}.csv')

# print('Connecting datasets')
all_datas = pd.concat(dataset_list, ignore_index=True)
all_datas['target'] = all_datas['target'].replace({'ds': 0, 'is': 1, 'swd': 2})
print(all_datas.shape)
all_datas.head()

loading dataset Ati4x1_15m_BL_6h
Сигналов обнаружено:  3
Размечено 0.42 данных


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


loading dataset Ati4x1_15m_Dex003(Pharm!)_6h
Сигналов обнаружено:  3
Размечено 0.25 данных


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


loading dataset Ati4x1_15m_H2O_6h
Сигналов обнаружено:  3
Размечено 0.22 данных


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


loading dataset Ati4x3_9m_Xyl01(Pharm!)_6h
Сигналов обнаружено:  3
Размечено 0.59 данных


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


loading dataset Ati4x3_12m_BL_6h
Сигналов обнаружено:  3
Размечено 0.23 данных


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


loading dataset Ati4x6_14m_BL_6h
Сигналов обнаружено:  3
Размечено 0.34 данных


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_col] = df[target_col].replace({exclude_value: pd.NA})


(70000, 4)


(70000, 4)

In [21]:
all_datas.to_csv(r'D:\Programming\international_hack_data\preprocessed_data\selected_data.csv')