In [24]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier
from sklearn.decomposition import PCA
import tensorflow as tf

from reader import Reader
from utils import print_c, read_parameters, convolution2d

from plot import Plotter
import matplotlib.pyplot as plt

# Print arrays with two decimals.
np.set_printoptions(precision=2)


# Feature-selection settings consumed by _feature_selection and _init_x_y.
selection = dict(
    lead=[1],             # indexes of the leads to keep (empty list: keep all)
    parsimony=[-1],       # indexes of the parsimony levels to keep (empty list: keep all)
    time=(40, 76),        # (t_min, t_max) bounds of the time window
    frequency=(0, 25),    # (f_min, f_max) bounds of the frequency band
    merge_pars=False,     # True: flatten all parsimony levels into the feature axis
    merge_lead=False,     # True: flatten all leads into the feature axis
)

# Optional 2-D smoothing settings; only read when 'do_filter' is True.
conv2d = dict(do_filter=False, type='gaussian', l=5, sigma=2)

# Observations (the tuples are presumably (t_min, t_max, f_min, f_max) - TODO confirm):
#   20 freq
#       1dAVb  (57, 78, 0, 15) lead: 1 5 7 9 10 11
#       SB     (57, 76, 0, 12) lead: 8(12) 7(15) 10(12 + 80)
#       ST     (60, 76, 3, 15) lead: 8
#
#   30 freq
#       1dAVb  (40, 76, 0, 25) lead: 1 2 5 9 10
#       SB     (40, 76, 0, 25) lead: 10 and (50, 78, 0, 16) lead: 7 8
#       ST     () lead: / nothing


# Categories to load. Full list kept for reference:
# condition = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'AF', 'ST', 'HEALTHY']
condition = ['1dAVb', 'HEALTHY']

# Session to read. Alternative kept for reference:
# session = "2800 per category _ 20 freq _ short window"
session = "2800 per category _ 35 freq _ long window"

# Not read anywhere in the visible cells.
_use_all_data = True

# Session parameters; the functions below read its 'n_point', 'n_freq' and 'model_freq' entries.
parameter = read_parameters(session)

def _feature_selection(x, selection, conv2d, parameter):
    """
    Method to perform the feature selection based on the self._selection dictionary.
        Select leads in <selection['lead']> if empty no selection is performed
        Select parsimony in <selection['pars']> if empty no selection is performed
        Select time in <selection['time'] = (t_min, t_max)> if t_min = None, no selection will be performed on
            left side of time. Same goes for t_max
        Select Frequency in <selection['frequency'] = (f_min, f_max)> if f_max = None, no selection will be
            performed on upper frequency. Same goes for f_min. If float is given, select directly, if fload are
            provided select based on model frequency.
        Merge all parsimony levels if <selection['merge_pars'] = True>, else nothing is changed
        Merge all leads if <selection['merge_lead'] = True>, else nothing is changed
    :param x: input data array with shape (n_exams, n_leads, n_features, n_parsimony)
    :return: x: output data after selection, /!\ shape has been changed to (n_exams, n_leads, n_parsimony, n_features)
    """
    x = x.transpose((0, 1, 3, 2))

    # Lead selection
    if selection['lead']:
        x = x[:, [*selection['lead']], ...]

    # Parsimony level selection
    if selection['parsimony']:
        x = x[:, :, [*selection['parsimony']], ...]

    # Time or/and frequency selection
    if any(selection['time']) or any(selection['frequency']):
        # temps shape: (n_exams, n_leads, n_parsimony, time, frequency)
        temp = np.array(np.split(x, parameter['n_point'] - 1, axis=-1)).transpose((1, 2, 3, 0, 4))

        ## IDEA of gaussian convolution
        if conv2d['do_filter']:
            for exam_i in range(len(temp)):
                temp[exam_i, 0, 0, :, :] = convolution2d(temp[exam_i, 0, 0, :, :], kernel_type=conv2d['type'],
                                                         l=conv2d['l'], sigma=conv2d['sigma'])

        if any(selection['time']):
            t_min, t_max = get_time_indexes(selection, parameter)
            temp = temp[:, :, :, t_min:t_max, ...]

        if any(selection['frequency']):
            f_min, f_max = get_freq_indexes(selection, parameter)
            temp = temp[..., f_min:f_max]
        x = np.reshape(temp, (temp.shape[0], temp.shape[1], temp.shape[2], temp.shape[3] * temp.shape[4]))

    # Merge parsimony indexes
    if selection['merge_pars'] and x.shape[2] > 1:
        x = np.reshape(x, (x.shape[0], x.shape[1], x.shape[2] * x.shape[3]))[:, :, np.newaxis, :]

    # Merge leads
    if selection['merge_lead'] and x.shape[1] > 1:
        x = x.transpose((0, 2, 1, 3))
        x = np.reshape(x, (x.shape[0], x.shape[1], x.shape[2] * x.shape[3]))[:, :, np.newaxis, :]
        x = x.transpose((0, 2, 1, 3))

    return x

def _init_x_y(reader_hdf5, selection):
    n_exams, n_leads, n_features, n_pars = next(reader_hdf5)

    if selection['lead']:
        n_leads = len(selection['lead'])
    if selection['parsimony']:
        n_pars = len(selection['parsimony'])

    if any(selection['time']) or any(selection['frequency']):
        t_min, t_max = get_time_indexes(selection, parameter)
        f_min, f_max = get_freq_indexes(selection, parameter)
        n_features = (f_max - f_min) * (t_max - t_min)

    if selection['merge_lead'] and n_leads > 1:
        n_features *= n_leads
        n_leads = 1

    if selection['merge_pars'] and n_pars > 1:
        n_features *= n_pars
        n_pars = 1

    return np.zeros((n_exams, n_leads, n_pars, n_features)), np.empty((n_exams,), dtype=np.int8)

def get_time_indexes(selection, parameter):
    """
    Resolve the time window of <selection> into two slice bounds.
    :param selection: dictionary whose 'time' entry is (t_min, t_max); either bound may be None
    :param parameter: dictionary whose 'n_point' entry is read only when t_max is None
    :return: (t_min, t_max), with None replaced by 0 and by parameter['n_point'] - 1 respectively
    """
    lower = selection['time'][0]
    upper = selection['time'][1]

    if lower is None:
        lower = 0
    if upper is None:
        upper = parameter['n_point'] - 1

    return lower, upper

def get_freq_indexes(selection, parameter):
    """
    Resolve the frequency band of <selection> into two slice bounds.
    Integer bounds are returned unchanged (they are already indexes). Float bounds are frequencies and
    are converted to the index of the closest entry of <parameter['model_freq']>.
    :param selection: dictionary whose 'frequency' entry is (f_min, f_max); either bound may be None
    :param parameter: dictionary with 'n_freq' (read when f_max is None) and 'model_freq'
        (read when a bound is a float)
    :return: (f_min, f_max), with None replaced by 0 and by parameter['n_freq'] respectively
    """
    f_min = selection['frequency'][0] if selection['frequency'][0] is not None else 0
    f_max = selection['frequency'][1] if selection['frequency'][1] is not None else parameter['n_freq']

    def _nearest_index(freq):
        # The absolute value is required: argmin of the signed difference always lands on the
        # smallest model frequency, whatever the requested value.
        return int(np.argmin(np.abs(np.asarray(parameter['model_freq']) - freq)))

    # np.floating also covers NumPy float scalars that do not subclass the builtin float (e.g. float32)
    if isinstance(f_min, (float, np.floating)):
        f_min = _nearest_index(f_min)
    if isinstance(f_max, (float, np.floating)):
        f_max = _nearest_index(f_max)
    return f_min, f_max

def _score(y_true, y_pred):
    """
    Accuracy, in percent, of two-class predictions against one-hot labels.

    The inputs are left untouched: the class labels are computed on the side instead of overwriting
    the rows of y_pred.

    :param y_true: one-hot true labels, shape (n_samples, n_classes); the label of a row is the position
        of its first entry equal to 1.
    :param y_pred: per-class scores, shape (n_samples, 2) (e.g. the softmax output of the CNN). A row is
        assigned class 0 only when its first score is strictly greater than its second, class 1 otherwise.

    :return: accuracy_score of the decoded labels, multiplied by 100 to convert to a percentage.
    :raises ValueError: if a row of y_true contains no entry equal to 1.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Ties (and NaN comparisons) fall to class 1, as "first score strictly greater" is False for them
    pred_labels = np.where(y_pred[:, 0] > y_pred[:, 1], 0, 1)

    is_one = (y_true == 1)
    if not is_one.any(axis=1).all():
        raise ValueError('every row of y_true must contain an entry equal to 1')
    # argmax on a boolean row returns the position of its first True
    true_labels = np.argmax(is_one, axis=1)

    return accuracy_score(true_labels, pred_labels) * 100


[1m[34m
Sessions: 2800 per category _ 35 freq _ long window[0m
 Data case: [1mevoked[0m[0m
 Version: [1m1[0m[0m
 Alpha: [1m[0.00016, 0.00018, 0.00012, 0.00016, 0.00011, 0.00013, 0.00017, 0.00021, 0.00026, 0.00034, 0.00032, 0.00025][0m[0m
 Model frequencies: [ 3.    4.24  5.47  6.71  7.94  9.18 10.41 11.65 12.88 14.12 15.35 16.59
 17.82 19.06 20.29 21.53 22.76 24.   25.24 26.47 27.71 28.94 30.18 31.41
 32.65 33.88 35.12 36.35 37.59 38.82 40.06 41.29 42.53 43.76 45.  ]
 N_freq = [1m35[0m[0m
 N_point = [1m240[0m[0m
 Beta_dim = [1m8365[0m[0m
 Parsimony: [0.04 0.08 0.12 0.16 0.2  0.24 0.28 0.32 0.36 0.4  0.44 0.48 0.52 0.56
 0.6  0.64 0.68 0.72 0.76 0.8  0.84 0.88 0.92 0.96 1.  ]



In [25]:
mode = 'train'
batch_size = 100

print_c(f'{mode}', 'blue', bold=True)
if mode.lower() == 'train':
    mode_ = 'learning'
else:
    mode_ = 'evaluation'

# Reader batches hold batch_size exams per condition. The first item of the iterator is consumed by
# _init_x_y to size the full x / y arrays; the remaining items are the (x_batch, y_batch) pairs.
reader = Reader(batch_size=batch_size * len(condition))
hdf5_batch_iterator = reader.read_hdf5(session, mode, condition, random=False, verbose=True)
x, y = _init_x_y(hdf5_batch_iterator, selection)

# Copy every reduced batch into its slot of the preallocated arrays
pointer = 0
for x_batch, y_batch in hdf5_batch_iterator:
    x_batch = _feature_selection(x_batch, selection, conv2d, parameter)  # (n_exams, n_leads, n_pars, n_features)
    stop = pointer + len(x_batch)
    x[pointer:stop] = x_batch
    y[pointer:stop] = y_batch
    pointer = stop

# Release the last batch to lighten the RAM load
del x_batch, y_batch

# Keep only index 0 of the lead and parsimony axes: x becomes (n_exams, n_features)
x = x[:, 0, 0]

[1m[34mtrain[0m


Batch  1/28: 100%|██████████| 199/199 [00:02<00:00, 82.93it/s]
Batch  2/28: 100%|██████████| 199/199 [00:02<00:00, 86.64it/s]
Batch  3/28: 100%|██████████| 199/199 [00:02<00:00, 81.34it/s]
Batch  4/28: 100%|██████████| 199/199 [00:02<00:00, 90.76it/s]
Batch  5/28: 100%|██████████| 199/199 [00:02<00:00, 86.61it/s]
Batch  6/28: 100%|██████████| 199/199 [00:02<00:00, 82.03it/s]
Batch  7/28: 100%|██████████| 199/199 [00:02<00:00, 84.86it/s]
Batch  8/28: 100%|██████████| 199/199 [00:02<00:00, 82.28it/s]
Batch  9/28: 100%|██████████| 199/199 [00:02<00:00, 87.60it/s]
Batch 10/28: 100%|██████████| 199/199 [00:02<00:00, 87.55it/s]
Batch 11/28: 100%|██████████| 199/199 [00:02<00:00, 90.35it/s]
Batch 12/28: 100%|██████████| 199/199 [00:02<00:00, 90.50it/s]
Batch 13/28: 100%|██████████| 199/199 [00:02<00:00, 93.59it/s]
Batch 14/28: 100%|██████████| 199/199 [00:02<00:00, 91.15it/s]
Batch 15/28: 100%|██████████| 198/198 [00:02<00:00, 93.74it/s]
Batch 16/28: 100%|██████████| 198/198 [00:02<00:00, 92.

In [72]:
def _set_clf(selection, _selection, conv2d):
    """
    Method to perform the selection of the classifier.
    Available classifiers are:
        'LDA': Linear Discriminant Analysis
        'QDA': Quadratic Discriminant Analysis
        'RF':  Random forest
        'SGD': Stochastic Gradient Descent
        'PA':  Passive Aggressive Classifier
        'CNN': compiled Keras convolutional network with a 2-unit softmax output
        any other value: AdaBoost with 100 estimators
    :param selection: 'str' to choose the returned classifier.
    :param _selection: feature-selection dictionary. It is printed for 'RF' and for the AdaBoost fallback;
        for 'CNN' its 'time' and 'frequency' bounds (all four must be integers) give the input height and width.
    :param conv2d: convolution settings dictionary, only printed next to the classifier parameters.
    :return: clf: the unfitted classifier (for 'CNN', a compiled tf.keras Sequential model)
    """
    if selection == 'LDA':
        return LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, covariance_estimator=None,
                                          n_components=1)
    elif selection == 'QDA':
        return QuadraticDiscriminantAnalysis()
    elif selection == 'RF':
        max_depth = 20
        max_leaf_nodes = 200
        param = {'classifier': selection, 'max_depth': max_depth, 'max_leaf_nodes': max_leaf_nodes}
        print(f'selection={_selection},   conv2d={conv2d},   classifier={param}')
        return RandomForestClassifier(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes, warm_start=False,
                                      random_state=42, n_jobs=-1)
    elif selection == 'SGD':
        return SGDClassifier()
    elif selection == 'PA':
        return PassiveAggressiveClassifier(n_jobs=-1)
    elif selection == 'CNN':
        # One channel image of size (time window length, frequency band width)
        input_shape = (_selection['time'][1] - _selection['time'][0],
                       _selection['frequency'][1] - _selection['frequency'][0], 1)
        num_classes = 2

        model = tf.keras.models.Sequential([
            tf.keras.layers.Conv2D(128, kernel_size=(2, 2), activation='relu', input_shape=input_shape),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Conv2D(256, kernel_size=(3, 3), activation='relu'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Conv2D(256, kernel_size=(3, 3), activation='relu'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(num_classes, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
    else:
        param = {'classifier': 'adaBoost'}
        # Log the feature-selection dictionary, as the 'RF' branch does, rather than the classifier key
        print(f'selection={_selection},   conv2d={conv2d},   classifier={param}')
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier(n_estimators=100)

In [27]:
# temporary here to test CNN
# Cut the flat feature axis into one chunk per time index: x_ is (n_exams, n_time, n_features / n_time)
n_time = selection['time'][1] - selection['time'][0]
x_ = np.swapaxes(np.array(np.split(x, n_time, axis=1)), 0, 1)

# Relabel class 6 as class 1 (in place), so the labels fit the two one-hot columns built below
y[y == 6] = 1

# One-hot encode: row i is row y[i] of the 2 x 2 identity matrix
y = np.eye(2)[y]

In [None]:
# Build the compiled Keras model; its input size comes from the time / frequency bounds of selection
clf = _set_clf('CNN', selection, conv2d)

# Train for 10 epochs on the reshaped training exams and their one-hot labels
clf.fit(x_, y, epochs=10)

# Metrics
# NOTE: the prediction is made on x_, the same array the model was fitted on, so this is a training score
y_pred = clf.predict(x_)
score = _score(y, y_pred)
print(f'{score = :.2f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [35]:
mode = 'validation'
batch_size = 100

# Reader batches hold batch_size exams per condition. The first item of the iterator is consumed by
# _init_x_y to size the full x_val / y_val arrays; the remaining items are the (x_batch, y_batch) pairs.
reader = Reader(batch_size=batch_size * len(condition))
hdf5_batch_iterator = reader.read_hdf5(session, mode, condition, random=False, verbose=True)
x_val, y_val = _init_x_y(hdf5_batch_iterator, selection)

# Copy every reduced batch into its slot of the preallocated arrays
pointer = 0
for x_batch, y_batch in hdf5_batch_iterator:
    x_batch = _feature_selection(x_batch, selection, conv2d, parameter)  # (n_exams, n_leads, n_pars, n_features)
    stop = pointer + len(x_batch)
    x_val[pointer:stop] = x_batch
    y_val[pointer:stop] = y_batch
    pointer = stop

# Release the last batch to lighten the RAM load
del x_batch, y_batch

# Keep only index 0 of the lead and parsimony axes: x_val becomes (n_exams, n_features)
x_val = x_val[:, 0, 0]

# temporary here to test CNN
# Cut the flat feature axis into one chunk per time index: (n_exams, n_time, n_features / n_time)
n_time = selection['time'][1] - selection['time'][0]
x_val = np.swapaxes(np.array(np.split(x_val, n_time, axis=1)), 0, 1)

# Relabel class 6 as class 1 (in place), so the labels fit the two one-hot columns built below
y_val[y_val == 6] = 1

# One-hot encode: row i is row y_val[i] of the 2 x 2 identity matrix
y_val = np.eye(2)[y_val]

Batch  1/7: 100%|██████████| 199/199 [00:02<00:00, 79.34it/s]
Batch  2/7: 100%|██████████| 199/199 [00:02<00:00, 86.93it/s]
Batch  3/7: 100%|██████████| 199/199 [00:02<00:00, 94.36it/s]
Batch  4/7: 100%|██████████| 199/199 [00:02<00:00, 97.29it/s]
Batch  5/7: 100%|██████████| 198/198 [00:02<00:00, 97.37it/s]
Batch  6/7: 100%|██████████| 198/198 [00:01<00:00, 99.06it/s] 
Batch  7/7: 100%|██████████| 198/198 [00:02<00:00, 93.97it/s]


In [56]:
# Metrics
# Score the fitted classifier on the validation exams; _score returns the accuracy as a percentage
y_pred = clf.predict(x_val)
score = _score(y_val, y_pred)
print(f'{score = :.2f}')

score = 72.52
