In [3]:
from IPython.utils import io
import pandas as pd
import os
from pathlib import Path
from matplotlib import pyplot as plt
from scipy.signal import find_peaks
import numpy as np
import seaborn as sns
import time
import joblib
from os.path import exists
import shutil
import sys
import time
import mne

from sklearn.model_selection import train_test_split
#From my EEG package 
import run_expts
import format_eeg_data
import constants
import eeg_stat_ts

# Raise pandas' display limits so long/wide frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

from sktime.classification.feature_based import Catch22Classifier
from sklearn.ensemble import RandomForestClassifier
from sktime.datasets import load_unit_test
from sktime.transformations.panel.catch22 import Catch22

In [4]:
# Names of the features requested from the Catch22 transformer, in the order
# they are passed to it (22 catch22 names followed by the two extra statistics).
feature_list = [
    'DN_HistogramMode_5',
    'DN_HistogramMode_10',
    'SB_BinaryStats_diff_longstretch0',
    'DN_OutlierInclude_p_001_mdrmd',
    'DN_OutlierInclude_n_001_mdrmd',
    'CO_f1ecac',
    'CO_FirstMin_ac',
    'SP_Summaries_welch_rect_area_5_1',
    'SP_Summaries_welch_rect_centroid',
    'FC_LocalSimple_mean3_stderr',
    'CO_trev_1_num',
    'CO_HistogramAMI_even_2_5',
    'IN_AutoMutualInfoStats_40_gaussian_fmmi',
    'MD_hrv_classic_pnn40',
    'SB_BinaryStats_mean_longstretch1',
    'SB_MotifThree_quantile_hh',
    'FC_LocalSimple_mean1_tauresrat',
    'CO_Embed2_Dist_tau_d_expfit_meandiff',
    'SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1',
    'SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1',
    'SB_TransitionMatrix_3ac_sumdiagcov',
    'PD_PeriodicityWang_th0_01',
    'StandardDeviation',
    'Mean',
]

# Transformer configured with the explicit feature list and catch24 enabled.
transformer = Catch22(features=feature_list, catch24=True)

In [64]:
# Load the epoched EEG recordings for each requested sleep stage into one DataFrame
# (one row per recording, one column per channel, each cell holding a time series),
# together with the participant ids (`groups`) and integer class labels (`y`).
# The cell's output is the elapsed wall-clock time in seconds.
# for data_type in ['REM', 'N1', 'N2', 'N3', 'Wake']:
t1 = time.time()

for data_type in ['N2']:

    print('Generating Catch22 features for ' + data_type + ' data')

    transformer = Catch22(features = feature_list , catch24 = True )
    channels = constants.channel_list
    paths , class_list, sleep_night_list , sleep_type_list , participant_id_list = constants.generate_paths_and_info()

    #1. + 2. Select the paths for this sleep stage, load each one, and record its
    #supplementary information only when the load succeeded. Appending after a
    #successful load keeps df_list and the metadata lists aligned by construction.
    selected_paths , s_class_list , s_night_list , s_sleep_type , s_p_id = [], [], [], [], []
    df_list = []
    error_paths = []

    #Anything printed inside this block is captured and hidden (presumably to
    #silence the reader's logging), so failures are reported after the block.
    with io.capture_output():
        for path , class_name, night , p_id in zip(paths, class_list, sleep_night_list, participant_id_list ):
            if data_type not in path:
                continue
            try:
                data_epo = mne.read_epochs(path)
                #NOTE(review): values are taken as returned by to_data_frame();
                #confirm its default scaling gives the units expected downstream.
                df = data_epo.to_data_frame()[channels].copy()
            except Exception:
                #Error with loading data. `Exception` (not a bare except) so that
                #interrupting this long-running cell from the keyboard still works.
                error_paths.append(path)
                continue

            df_list.append(df)
            selected_paths.append(path)
            s_class_list.append(class_name)
            s_night_list.append(night)
            s_sleep_type.append(data_type)
            s_p_id.append(p_id)

    if error_paths:
        print('Skipped ' + str(len(error_paths)) + ' file(s) that failed to load (see error_paths)')

    #Now we have the 57 channel EEG data in df's in df_list and corresponding supplementary information in the lists 
    #Selected_paths , s_class_list , s_night_list , s_sleep_type , s_p_id

    #3. Load all of the data into a single dataframe with each cell containing a time series 
    ts_row_list = [{col: df[col] for col in df.columns} for df in df_list]

    #All of the main pieces of data to save 
    eeg_data_df = pd.DataFrame.from_records(ts_row_list)
    groups = pd.Series(s_p_id)
    #Note: from here on class_list holds the labels of the loaded recordings only
    class_list = pd.Series(s_class_list)
    #Class names missing from this mapping become NaN in y
    y = class_list.map({'HC': 0 , 'PD' : 1 , 'PD+RBD' : 2 , 'RBD' : 3})

    
    #For now I am commenting out all of the below
    
#     #4. Transform the dataframe _______________________________________________________________________________________
#     transformed_df = transformer.fit_transform(eeg_data_df)

#     #Generate the rename mapping dictionary to rename the transformed dataframe using more appropriate names____________________
#     transformed_names = transformed_df.columns
#     channel_names = eeg_data_df.columns
#     new_names = [channel + '_' + feature for channel in channel_names for feature in feature_list] #This is hard to follow but it is correct

#     rename_mapping_dict = {}
#     for old_name, new_name in zip(transformed_names,new_names):
#         rename_mapping_dict[old_name] = new_name
        
#     final_transformed_df = transformed_df.rename(rename_mapping_dict, axis=1)


#     #5. Save everything in the appropriate place ---->  final_transformed_df, groups , y
#     folder = 'Catch_22_features/'
#     final_transformed_df.to_hdf(folder + data_type + '_c_22_feautures.h5' , key = 'df', mode = 'w')
#     groups.to_hdf(folder + data_type + '_groups.h5' , key = 'df', mode = 'w')
#     y.to_hdf(folder + data_type + '_y.h5' , key = 'df', mode = 'w')

t2 = time.time()
t2-t1

Generating Catch22 features for N2 data


1111.2260336875916

In [None]:
# Write the full N2 frame to disk to inspect the resulting file size.
# The cells hold whole time series (object dtype), which PyTables pickles — this is
# what the PerformanceWarning in the recorded output refers to.
eeg_data_df.to_hdf('all_n2_eeg_data_to_see_size.h5', key='df', mode='w')  

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['Fp1', 'Fpz', 'Fp2', 'F1', 'Fz', 'F2', 'AF7', 'F7', 'F5', 'F3', 'AF8',
       'F8', 'F6', 'F4', 'FCz', 'FC1', 'FC2', 'Cz', 'C1', 'C2', 'CP1', 'CP2',
       'FC5', 'FC3', 'C5', 'C3', 'CP5', 'CP3', 'FC6', 'FC4', 'C6', 'C4', 'CP6',
       'CP4', 'TP7', 'T7', 'FT7', 'FT9', 'TP8', 'T8', 'FT8', 'FT10', 'P1',
       'P2', 'Pz', 'P3', 'P5', 'P7', 'PO7', 'P4', 'P6', 'P8', 'PO8', 'POz',
       'O1', 'O2', 'Oz'],
      dtype='object')]

  eeg_data_df.to_hdf('all_n2_eeg_data_to_see_size.h5', key='df', mode='w')


In [65]:
# Convert the ~1111 s wall-clock time reported by the loading cell into minutes.
1111 / 60

18.516666666666666

In [None]:
# Keep only the first five channel columns, then print each recording's length
# (taken from its first column) as a fraction of 691200 samples, to see how the
# run time might scale with the number of samples.
eeg_data_subset = eeg_data_df.iloc[:, :5].copy()
for first_channel_series in eeg_data_subset.iloc[:, 0]:
    print(len(first_channel_series) / 691200)

In [25]:
# 80% of 691200, the reference series length used in the neighbouring cells.
0.8*691200

552960.0

In [31]:
# Reference length expressed in minutes: presumably samples / sampling rate / 60.
# NOTE(review): 256 is assumed to be the sampling rate in Hz — confirm.
(691200 /256) / 60

45.0

In [32]:
# Most rows contain at least 552960 samples (80% of 691200)

In [None]:
# Run the Catch22 transformer over every recording and channel in eeg_data_df.
# NOTE(review): this cell has no execution count, so no completed run is recorded here.
transformed_df = transformer.fit_transform(eeg_data_df)

In [33]:
import custom_ts_length

In [35]:
# Time the length-alignment helper on the five-column subset.
# NOTE(review): customise_df_ts_length lives in custom_ts_length (not shown here);
# 691200 is presumably the target number of samples per series — confirm.
t1 = time.time()
length_aligned_sub_df = custom_ts_length.customise_df_ts_length(eeg_data_subset,691200 ) 
t2 = time.time()

# Elapsed wall-clock seconds, displayed as the cell output.
t2-t1

9.282590389251709

In [55]:
# Benchmark grid of (channel count, sample count) pairs: the 3x3 grid first,
# followed by the 12-sample case for each channel count.
channel_counts = (1, 2, 4)
channel_sample_combos = (
    [(n_ch, n_samp) for n_ch in channel_counts for n_samp in (1, 4, 8)]
    + [(n_ch, 12) for n_ch in channel_counts]
)

In [52]:
# Build every (channel count, sample count) pair to benchmark.
# A comprehension with its own variable names keeps the loop variables out of the
# notebook namespace, so the `channels` channel-name list set by the data-loading
# cell is not silently replaced by an int.
channel_sample_combos = [
    (n_channels, n_samples)
    for n_channels in [1, 2, 4]
    for n_samples in [1, 4, 8]
]

channel_sample_combos
        

[(1, 1), (1, 4), (1, 8), (2, 1), (2, 4), (2, 8), (4, 1), (4, 4), (4, 8)]

In [47]:
# Run the transformer on the smallest slice of the length-aligned subset:
# the first n_samples rows and the first n_channels columns.
# The counts are named n_samples / n_channels so that the `channels` channel-name
# list set by the data-loading cell is not overwritten with an int.
n_samples = 1
n_channels = 1

transformed_df = transformer.fit_transform(length_aligned_sub_df.iloc[:n_samples, :n_channels])

transformed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,-1.42967,1.686842,275.0,0.029949,0.021746,12.0,20.0,21.870507,0.085729,2.265151,0.014482,0.229335,19.0,0.972831,32.0,1.567436,0.013216,0.101543,0.44,0.52,1e-05,31.0,4.793582,-0.004732


In [13]:
# Number of samples in the time series stored at row 2, first channel column.
len(eeg_data_df.iloc[2,0])

1190400

In [17]:
# Scale 45 by the ratio of 1190400 (the row-2 series length) to 698880.
# NOTE(review): the neighbouring cell maps 698880 to 45.5 (not 45), so this factor
# looks slightly inconsistent — confirm which reference value was intended.
( 1190400 / 698880 ) * 45

76.64835164835165

In [11]:
# 698880 / 256 / 60: presumably samples / sampling rate / 60 -> minutes.
# NOTE(review): 256 is assumed to be the sampling rate in Hz — confirm.
( 698880 / 256 ) / 60

45.5

In [19]:
# Scratch arithmetic: 300 / 2, then / 60.
# NOTE(review): the meaning of the operands is not recorded in the notebook — annotate.
( 300 / 2 ) / 60

2.5

In [14]:
# Scratch arithmetic: 105 / 2.
# NOTE(review): the meaning of the operands is not recorded in the notebook — annotate.
105 / 2

52.5

In [15]:
# 73 minutes also popular G 

In [59]:
# Run-time estimate: 3.33 * 93 * 72 * 57, divided by 60*60*24 (seconds per day).
# NOTE(review): presumably 3.33 s per unit of work, with 57 being the channel count
# mentioned in the data-loading cell; the 93 and 72 factors are unlabelled — confirm.
3.33 * (93*72*57)/(60*60*24)

14.710275

In [63]:
# Run-time estimate: 93 * 1 * 57, divided by 60*60 (seconds per hour).
# NOTE(review): presumably one second per unit of work across 93 x 57 units; the
# factors are unlabelled — confirm.
(93*1*57) / (60*60)

1.4725

In [None]:
# 3 hours to do 2 samples ? 