In [4]:
import numpy as np
import scipy.io
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
from scipy import signal
import pandas as pd
import itertools
import os
from scipy.stats import skew, kurtosis
from scipy.signal import correlate, resample, periodogram
from scipy.fftpack import fft
import matplotlib.pyplot as plt
import pyeeg
%matplotlib inline

Specify the directory where all the Kaggle .mat files are present

In [5]:
# Directory that is scanned for the Kaggle .mat recordings (patient 1, training set).
base_dir = "/data_dir/Kaggle_data/train_1/"

Function to get class label from filename. If filename is I_J_K.mat, class is 'K'

In [6]:
def get_class_label_from_name(name):
    """Extract the class label encoded in a .mat file name.

    For a name of the form ``I_J_K.mat`` the label is ``K``, i.e. the
    character five positions from the end of the name.

    Parameters
    ----------
    name : str or bytes
        File name to inspect.

    Returns
    -------
    int
        The digit found at position -5, or 0 when the name is too short,
        the character is not a digit, or ``name`` is not indexable.
    """
    try:
        label_char = name[-5]
        # On Python 3, indexing bytes yields the byte's integer code
        # (b'1'[0] == 49); turn it back into the character it encodes so
        # b'1_2_1.mat' gives 1 rather than 49.
        if isinstance(label_char, int) and isinstance(name, (bytes, bytearray)):
            label_char = chr(label_char)
        return int(label_char)
    except (IndexError, TypeError, ValueError):
        # Best-effort fallback kept from the original behavior: anything
        # that does not carry a parsable label is treated as class 0.
        # Only the expected failure modes are caught, so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return 0

Get the list of all files and their corresponding class labels. Store them into a pandas dataframe.

In [7]:
def get_file_names_and_class_labels(base_dir):
    """List the entries of ``base_dir`` together with their class labels.

    Parameters
    ----------
    base_dir : str
        Directory to scan. Every entry returned by ``os.listdir`` is
        included except the names in the local ignore list.

    Returns
    -------
    numpy.ndarray
        Structured array with fields ``file_name`` (byte string) and
        ``class_label`` (int), ordered by file name.

    Raises
    ------
    OSError
        If ``base_dir`` cannot be listed.
    """
    # Names that are explicitly excluded from processing.
    ignored_files = {'1_45_1.mat'}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting table reproducible from run to run.
    names = sorted(
        entry for entry in os.listdir(base_dir) if entry not in ignored_files
    )

    # A fixed 'S16' field silently truncates longer names, which would make
    # the file impossible to open later. Keep 16 as the minimum width so the
    # dtype is unchanged for short names, and widen it only when required.
    name_width = max([16] + [len(entry) for entry in names])

    return np.array(
        [(entry, get_class_label_from_name(entry)) for entry in names],
        dtype=[('file_name', '|S%d' % name_width), ('class_label', 'int')]
    )

# Table with one row per entry of base_dir: columns 'file_name' and 'class_label'.
data_files = pd.DataFrame(
    data=get_file_names_and_class_labels(base_dir)
)
# Uncomment to restrict a trial run to the first six files:
# data_files = data_files.head(6)

<br>Function to extract features from the directory of .mat files. Need to do this individually twice for each patient - once for test and once for train.
<br>For test set features, the function needs to be modified a bit - remove 'class_label' and alter code accordingly

In [8]:
def _largest_eigenvalues(matrix, count=3):
    """Return the ``count`` largest eigenvalues of a symmetric matrix, largest first.

    ``np.linalg.eigvals`` makes no promise about the order of its result, so
    taking ``[:3]`` of it does not reliably give the largest values.
    ``np.linalg.eigvalsh`` is intended for symmetric input (such as a
    correlation matrix), returns real values and sorts them in ascending
    order, so reversing it gives a well-defined "largest first" sequence.
    """
    return np.linalg.eigvalsh(matrix)[::-1][:count]


def get_features_from_files(base_dir, data_files):
    """Compute one feature row per .mat file listed in ``data_files``.

    Parameters
    ----------
    base_dir : str
        Directory containing the .mat files.
    data_files : pandas.DataFrame
        Must provide the columns ``file_name`` (str or bytes) and
        ``class_label``. The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Indexed by file name (as given in ``data_files``). The first column
        is ``class_label``; the remaining columns are named ``1..N`` and hold
        the features rounded to 4 decimal places, in this order:

        * variance, minimum, maximum and kurtosis of every channel,
        * the largest singular value of the channel matrix,
        * the 3 largest eigenvalues of the channel correlation matrix,
        * the 3 largest eigenvalues of the correlation matrix of the
          first-differenced channels,
        * per channel: max FFT amplitude, max periodogram power, max Welch
          power, spectral entropy, Higuchi fractal dimension, Hurst
          exponent and the band powers returned by ``pyeeg.bin_power``.

        Files that ``scipy.io.loadmat`` rejects with ``ValueError`` are
        reported and skipped. When no file could be processed, an empty
        frame with ``class_label`` plus 231 feature columns is returned.
    """
    # Feature count the original code hard-coded (rows were 1 + 231 wide);
    # used only to shape the result when there is no data to infer it from.
    # Presumably 14 * 16 + 7 for 16-channel recordings -- TODO confirm.
    default_feature_count = 231

    # NOTE(review): these values are passed to pyeeg even though the signal
    # is resampled to 600 points first, so 400 may no longer be the effective
    # sampling rate -- confirm the band limits are still meaningful.
    bands = [0.5, 4, 7, 12, 30]
    sampling_rate = 400

    approximator = lambda x: round(x, 4)  # 4 decimal places

    files = []
    rows = []

    # Pair names and labels directly instead of resetting the caller's index
    # in place and looking labels up by position.
    for filename, class_label in zip(data_files['file_name'], data_files['class_label']):
        print(filename)
        # Names coming from an 'S'-typed numpy field are bytes; plain str
        # names are accepted as they are.
        name = filename.decode('UTF-8') if isinstance(filename, bytes) else filename
        try:
            mat_data = scipy.io.loadmat(os.path.join(base_dir, name))
        except ValueError as ex:
            print(u'Error loading MAT file {}: {}'.format(filename, str(ex)))
            continue

        # Transposed so that each row is one channel (the statistics below
        # are computed per column of the re-transposed frame).
        channels_data = mat_data['dataStruct'][0][0][0].transpose()
        # NOTE(review): scipy interprets a numeric `window` as the beta of a
        # Kaiser window applied in the Fourier domain; 400 looks like it was
        # meant to be a sampling rate -- confirm this is intended.
        channels_data = resample(channels_data, 600, axis=1, window=400)
        df_reduced_data = pd.DataFrame(channels_data.transpose())

        # Time-domain statistics, grouped by statistic (all variances first,
        # then all minima, ...).
        time_features = []
        for statistic in (
            df_reduced_data.var(axis=0),
            df_reduced_data.min(axis=0),
            df_reduced_data.max(axis=0),
            df_reduced_data.kurtosis(axis=0),
        ):
            time_features.extend(statistic)

        # Largest singular value of the channel matrix (the square root of
        # the largest eigenvalue of X * X_transpose). Only the singular
        # values are needed, so the U and V factors are not computed.
        singular_values = np.linalg.svd(channels_data, compute_uv=False)
        time_features.append(singular_values[0])

        # 3 largest eigenvalues of the Pearson correlation matrix ...
        time_features.extend(_largest_eigenvalues(np.corrcoef(channels_data)))
        # ... and of the correlation matrix after differentiating each channel.
        time_features.extend(_largest_eigenvalues(np.corrcoef(np.diff(channels_data))))

        # Frequency-domain features, grouped by channel.
        frequency_features = []
        for channel in df_reduced_data.columns:
            channel_signal = df_reduced_data[channel]

            # Highest amplitude after FFT
            frequency_features.append(np.max(abs(fft(channel_signal))))

            # Highest power from the periodogram and from Welch's estimate
            frequency_features.append(np.max(signal.periodogram(channel_signal, fs=1.0)[1]))
            frequency_features.append(np.max(signal.welch(channel_signal, fs=1.0)[1]))

            # Features using the PyEEG package
            frequency_features.append(
                pyeeg.spectral_entropy(channel_signal, Band=bands, Fs=sampling_rate)
            )
            frequency_features.append(pyeeg.hfd(channel_signal, Kmax=3))
            frequency_features.append(pyeeg.hurst(channel_signal))
            # bin_power returns a tuple; only its first element is used.
            frequency_features.extend(
                pyeeg.bin_power(channel_signal, Band=bands, Fs=sampling_rate)[0]
            )

        files.append(filename)
        rows.append(
            [class_label]
            + [approximator(value) for value in time_features]
            + [approximator(value) for value in frequency_features]
        )

    # Build the table once instead of re-stacking an array per file.
    if rows:
        table = np.asarray(rows, dtype=float)
    else:
        table = np.empty((0, 1 + default_feature_count))

    # File names are the index, so the header is the label followed by the
    # feature numbers 1..N.
    column_names = ['class_label'] + list(range(1, table.shape[1]))

    return pd.DataFrame(table, index=files, columns=column_names)


In [10]:
df_features = get_features_from_files(base_dir, data_files)
# Label the columns as 'class_label' followed by the feature numbers 1..N
# (file names stay in the index). list(...) keeps the concatenation valid on
# Python 3, where range() is no longer a list.
df_features.columns = ['class_label'] + list(range(1, df_features.shape[1]))
df_features.head()

1_1_0.mat
1_27_1.mat
1_28_0.mat
1_28_1.mat
1_29_0.mat
1_29_1.mat


Unnamed: 0,class_label,1,2,3,4,5,6,7,8,9,...,222,223,224,225,226,227,228,229,230,231
1_1_0.mat,0.0,119.6939,165.6753,266.6734,326.378,43.5583,32.8379,98.333,46.9773,40.389,...,839.6885,2350.2561,1154.2714,0.4472,1.2132,0.5553,572.1608,844.9935,1820.2629,8696.9406
1_27_1.mat,1.0,39.9622,457.57,82.9113,851.8883,31.9449,91.6332,14.0493,12.7454,16.9459,...,341.3915,388.4938,359.1828,0.4224,1.1184,0.4589,217.2792,255.7865,843.6824,3731.0044
1_28_0.mat,0.0,47.4328,84.6067,49.981,263.7914,18.5673,12.3646,167.934,18.0344,35.0716,...,661.5663,1458.8999,258.0272,0.4457,1.4922,0.4742,694.2276,556.6255,1177.9987,7029.0275
1_28_1.mat,1.0,45.0802,434.8464,96.017,914.1308,37.2973,90.6659,20.0891,11.0359,14.638,...,386.6431,498.3095,220.3179,0.5288,1.0533,0.4319,335.9179,441.9816,1263.259,3115.2229
1_29_0.mat,0.0,95.3417,239.5643,123.2621,371.0513,41.6245,25.603,75.6389,32.0368,38.609,...,1296.6206,5604.0832,2667.5222,0.4015,1.1792,0.4646,578.494,457.1658,2864.2802,10661.4512


In [11]:
# Write the feature table to disk; index=False means the file-name index is not written.
df_features.to_csv(
    "train_1.csv",
    index=False,
)