In [35]:
import pandas as pd 
import numpy as np 
import os 
import matplotlib.pyplot as plt 

from sklearn.preprocessing import StandardScaler
from preprocessing.features_engineering import BasicFeaturesCreation

Get names of files containing raw data.

In [36]:
# nb of files to open 
nb_files = 7

In [37]:
cwd = os.getcwd()
files_dir = os.path.join(cwd, "data", "training_records")
files = np.arange(nb_files)

Preprocess our data.

In [38]:
# define category 
category = {
        "First_EEG": (1, 7501, 250), 
        "Second_EEG": (7501, 15001, 250), 
        "Third_EEG": (15001, 22501, 250), 
        "Fourth_EEG": (22501, 30001, 250), 
        "Fifth_EEG": (30001, 37501, 250), 
        "X_axis": (37501, 39001, 50), 
        "Y_axis": (39001, 40501, 50), 
        "Z_axis": (40501, 42001, 50) 
    }

In [39]:
dataframes = []

for file in files:

    # load data
    file_path = os.path.join(files_dir, "dreem_" + str(file) + ".npy")
    data = np.load(file_path)
    df = pd.DataFrame(data)
    df.set_index(0, inplace=True)

    # preprocess data per category 
    df_per_category = []  # initialization 
    for cat in category.keys():

        # get columns in cateogries
        min_, max_, sampling_rate = category.get(cat)

        # define transformer
        transformer = BasicFeaturesCreation(sampling_rate=sampling_rate, 
                                            name=cat)
        df_per_category.append(transformer.transform(df[np.arange(min_, max_)]))

    df_transformed = pd.concat(df_per_category, axis=1)
    df_transformed['identifier'] = df_transformed.index + 10000*file
    dataframes.append(df_transformed)

  df_transformed['identifier'] = df_transformed.index + 10000*file
  df_transformed['identifier'] = df_transformed.index + 10000*file
  df_transformed['identifier'] = df_transformed.index + 10000*file
  df_transformed['identifier'] = df_transformed.index + 10000*file
  df_transformed['identifier'] = df_transformed.index + 10000*file
  df_transformed['identifier'] = df_transformed.index + 10000*file
  df_transformed['identifier'] = df_transformed.index + 10000*file


In [40]:
df_prepared = pd.concat(dataframes, ignore_index=True)

In [41]:
targets = pd.read_csv("./data/targets_train.csv", index_col="identifier")
targets.head()

Unnamed: 0_level_0,record,index,target
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0


In [42]:
df_final = df_prepared.set_index("identifier").join(targets, on="identifier", how="inner")
df_final.drop(columns=["record", "index"], inplace=True)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 6173 entries, 0.0 to 60911.0
Columns: 169 entries, Max_freq_First_EEG to target
dtypes: float64(168), int64(1)
memory usage: 8.0 MB


In [43]:
df_final.to_csv("./data/prepared/prepared_basic.csv")