# Task 2 - HEART RHYTHM CLASSIFICATION FROM RAW ECG SIGNALS
While the previous projects dealt with medical image features, we now turn to the classification of entire time series into one of 4 classes. This time you will work with the original ECG recordings of different lengths, sampled at 300 Hz, to predict the heart rhythm.

In [None]:
#@title Install packages
!pip install tensorflow-addons
!pip install neurokit2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#@title Imports
import pandas as pd
import numpy as np

import zipfile




from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator



In [None]:
#@title Mount Drive

# Toggle between Google Drive storage (Colab) and the current working directory.
use_drive = True
if use_drive:
  # PATH is the base directory used below for the data archive, the saved
  # models, the TensorBoard logs and the submission file.
  PATH = "drive/My Drive/AML_HS22/task2/"
  from google.colab import drive
  drive.mount('/content/drive')
else:
  PATH = "./"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Load Data
def load_data(zip_path=None):
    """Load training features, training labels and test features from one archive.

    The tables are read from ``X_train.csv``, ``y_train.csv`` and ``X_test.csv``
    inside the zip file. The first column of every table is dropped before the
    table is converted to a NumPy array. Class counts for labels 0..3 and the
    resulting shapes are printed.

    Args:
        zip_path: Path of the zip archive. Defaults to ``PATH + 'task2.zip'``.

    Returns:
        Tuple ``(X, y, X_test)``; ``y`` is flattened to one dimension.
    """
    if zip_path is None:
        zip_path = PATH + 'task2.zip'

    def read_without_first_column(archive, member):
        # Close the member handle as soon as pandas has consumed it.
        with archive.open(member) as handle:
            frame = pd.read_csv(handle, index_col=False)
        return np.asarray(frame.drop(frame.columns[0], axis=1))

    # The context manager releases the archive even if parsing fails.
    with zipfile.ZipFile(zip_path) as zf:
        X = read_without_first_column(zf, 'X_train.csv')
        y = read_without_first_column(zf, 'y_train.csv').ravel()
        X_test = read_without_first_column(zf, 'X_test.csv')

    for i in range(4):
        print (f"class_{i} has {(y == i).sum()} that's {100*(y == i).sum()/y.shape[0]:.2f}% of all elements")

    print("X shape %s" %str(X.shape))
    print("y shape %s" %str(y.shape))
    print("X_test shape %s" %str(X_test.shape))

    return X, y, X_test


X_imp, y, X_test_imp = load_data()

class_0 has 3030 that's 59.21% of all elements
class_1 has 443 that's 8.66% of all elements
class_2 has 1474 that's 28.81% of all elements
class_3 has 170 that's 3.32% of all elements
X shape (5117, 17807)
y shape (5117,)
X_test shape (3411, 17807)


In [None]:
from sklearn.utils import resample
def unbalance_dataset(X_imp, y, n_samples=500, random_state=1):
  """Resample each of the classes 0..3 to the same size and return the result.

  Every class is drawn with replacement to exactly ``n_samples`` rows, so
  large classes are down-sampled and small classes are up-sampled. The
  per-class blocks are stacked in class order (0, 1, 2, 3).

  The previous version built the balanced arrays but never returned them,
  so callers received ``None``.

  Args:
    X_imp: Feature matrix, one row per sample.
    y: Integer class label per row of ``X_imp``.
    n_samples: Number of rows drawn for each class.
    random_state: Seed passed to ``resample`` for every class.

  Returns:
    Tuple ``(X_balanced, y_balanced)``.
  """
  X_parts, y_parts = [], []
  for label in range(4):
    mask = y == label
    X_cls, y_cls = resample(X_imp[mask], y[mask], replace=True,
                            n_samples=n_samples, random_state=random_state)
    X_parts.append(X_cls)
    y_parts.append(y_cls)

  X_balanced = np.vstack(X_parts)
  y_balanced = np.hstack(y_parts)

  print("X shape %s" %str(X_balanced.shape))
  print("y shape %s" %str(y_balanced.shape))

  for i in range(4):
    print (f"class_{i} has {(y_balanced == i).sum()} that's {100*(y_balanced == i).sum()/y_balanced.shape[0]:.2f}% of all elements")

  return X_balanced, y_balanced

In [None]:
#@title Equalize Length of samples

INTERVAL_SIZE = 180 # samples kept per series; with SAMPLING_STEP = 4 the window spans 180*4 = 720 raw samples (2.4 s at 300 Hz). Also used as the input length of the models below.
SAMPLING_STEP = 4 # downsample by this factor; with a step of 1 the original 300 Hz is kept


# Data from the MIT-BIH dataset, used in the ML4H course, was sampled at 60Hz and included 180 samples = 2 cycles, padded with 0s, all data started with the first peak

def equalize_data(X, X_test, interval_size=None, sampling_step=None):
  """Cut a fixed-length, down-sampled window out of the middle of every series.

  For each row the valid length is the index of its first NaN (or the full
  row length when the row holds no NaN). Rows shorter than
  ``interval_size * sampling_step`` raw samples are dropped and counted as
  "Lost". From the remaining rows a centred window of that many raw samples
  is taken and every ``sampling_step``-th sample is kept.

  The previous valid-length helper returned ``len(row) - 1`` for a row
  without NaN and failed on an empty row; both cases are handled here.

  Args:
    X: Training series, one NaN-padded row per recording.
    X_test: Test series in the same layout.
    interval_size: Samples per output row. Defaults to ``INTERVAL_SIZE``.
    sampling_step: Down-sampling factor. Defaults to ``SAMPLING_STEP``.

  Returns:
    Tuple ``(X_eq, X_test_eq)`` of arrays with ``interval_size`` columns.
    Dropped rows are not replaced, so the row count can shrink.
  """
  if interval_size is None:
    interval_size = INTERVAL_SIZE
  if sampling_step is None:
    sampling_step = SAMPLING_STEP
  window = interval_size * sampling_step

  def valid_length(series):
    # Index of the first NaN, or the full length when there is none.
    nan_positions = np.flatnonzero(np.isnan(series))
    return int(nan_positions[0]) if nan_positions.size else len(series)

  def equalize_series(ts):
    lost = 0
    out = []
    for x in ts:
      length = valid_length(x)
      if length < window:
        lost += 1
        continue
      start = (length - window) // 2 # Get sample in middle of time series
      out.append(x[start:start + window:sampling_step])
    print(f"Lost: {lost}")
    return np.asarray(out)

  print("Equalizing X")
  X_eq = equalize_series(X)
  print("Equalizing X_test")
  X_tests_eq = equalize_series(X_test)

  return X_eq, X_tests_eq


# The fixed-window equalisation defined above is currently disabled:
# X, X_test = equalize_data(X_imp, X_test_imp)
# del X_imp
# del X_test_imp

# print(f'Before (X[0]): {X[0]}')
# Instead the raw arrays are passed on unchanged; X and X_test are aliases of
# X_imp and X_test_imp here (no copy is made).
X = X_imp
X_test = X_test_imp

In [None]:
import neurokit2 as nk
from tqdm import tqdm
def last(ts):
  """Return ``len(ts)`` minus the number of NaN entries in ``ts``.

  The caller uses the result as the valid length of a series.
  NOTE(review): that is only the true length if NaNs appear solely as
  trailing padding — confirm for the input data.
  """
  nan_count = np.isnan(ts).sum()
  return len(ts) - nan_count
def expand_datset_all_epochs(X, y, return_groups=False, sampling_rate=300,
                             target_rate=100, epoch_len=180):
  """Turn every recording into one fixed-length row per segmented heartbeat.

  Each recording is trimmed to its valid length, cleaned with
  ``nk.ecg_clean`` and segmented with ``nk.ecg_segment``. Every epoch is
  resampled from ``sampling_rate`` to ``target_rate``, truncated to
  ``epoch_len`` samples and zero-padded on the right to exactly
  ``epoch_len``. When segmentation fails, a single all-zero epoch is emitted
  for that recording so that it is still represented.

  Fixes over the previous version:
    * the fallback now skips the rest of the iteration for unlabeled data
      too; before, the ``continue`` sat under ``if y is not None`` and a
      failed test recording re-used the epochs of the previous recording;
    * ``except Exception`` replaces the bare ``except`` so that
      KeyboardInterrupt is no longer swallowed;
    * the progress bar wraps the iteration and is closed when it finishes.

  Args:
    X: Recordings, one NaN-padded row each.
    y: Label per recording, or ``None`` for unlabeled data.
    return_groups: If true, additionally return the index of the source
      recording for every epoch (useful for per-recording aggregation or
      group-aware splitting).
    sampling_rate: Sampling rate of the raw recordings in Hz.
    target_rate: Sampling rate of the emitted epochs in Hz.
    epoch_len: Number of samples per emitted epoch.

  Returns:
    ``(X_epochs, y_epochs)``, or ``(X_epochs, y_epochs, groups)`` when
    ``return_groups`` is true. ``y_epochs`` is empty when ``y`` is ``None``.
  """
  X_epochs = []
  y_epochs = []
  groups = []
  lens = []

  def emit(epoch, i):
    # Record one epoch together with the label and index of its recording.
    X_epochs.append(epoch)
    groups.append(i)
    if y is not None:
      y_epochs.append(y[i])

  for i, ecg_raw in enumerate(tqdm(X)):
    length = last(ecg_raw)
    ecg_raw = ecg_raw[:length]
    cleaned_ecg = nk.ecg_clean(ecg_raw, sampling_rate=sampling_rate)
    try:
      epochs = nk.ecg_segment(cleaned_ecg, rpeaks=None, sampling_rate=sampling_rate, show=False)
    except Exception:
      print("info is zero-valued!!!!!!")
      emit(np.zeros((epoch_len,)), i)
      continue

    for val in epochs.values():
      epoch = np.asarray(val['Signal'])
      resampled_epoch = nk.signal_resample(epoch, sampling_rate=sampling_rate, desired_sampling_rate=target_rate)
      lens.append(len(resampled_epoch))
      # Slicing is a no-op for short epochs, so the pad width is never negative.
      resampled_epoch = resampled_epoch[:epoch_len]
      padded_epoch = np.pad(resampled_epoch, (0, epoch_len - len(resampled_epoch)), 'constant', constant_values=(0.0, 0.0))
      emit(padded_epoch, i)

  print(np.sort(lens))
  if return_groups:
    return np.asarray(X_epochs), np.asarray(y_epochs), np.asarray(groups)
  return np.asarray(X_epochs), np.asarray(y_epochs)

# NOTE: this cell rebinds X, y and X_test from per-recording data to per-epoch
# data, so it must run exactly once after the cell that assigns X = X_imp.
X, y = expand_datset_all_epochs(X, y)
# Report the shape of the expanded epoch matrix (previously the shape of the
# untouched raw matrix X_imp was printed here).
print("X shape %s" %str(X.shape))
print("y shape %s" %str(y.shape))
for i in range(4):
  print (f"class_{i} has {(y == i).sum()} that's {100*(y == i).sum()/y.shape[0]:.2f}% of all elements")

X_test, _ = expand_datset_all_epochs(X_test, None)



  0%|          | 0/5117 [00:00<?, ?it/s][A[A

  0%|          | 2/5117 [00:00<08:44,  9.75it/s][A[A

  0%|          | 3/5117 [00:00<08:50,  9.65it/s][A[A

  0%|          | 5/5117 [00:00<11:14,  7.58it/s][A[A

  0%|          | 6/5117 [00:00<10:55,  7.80it/s][A[A

  0%|          | 8/5117 [00:00<09:37,  8.84it/s][A[A

  0%|          | 9/5117 [00:01<09:52,  8.62it/s][A[A

  0%|          | 10/5117 [00:01<11:29,  7.41it/s][A[A

  0%|          | 12/5117 [00:01<10:48,  7.87it/s][A[A

  0%|          | 13/5117 [00:01<11:10,  7.61it/s][A[A

  0%|          | 15/5117 [00:01<10:11,  8.34it/s][A[A

  0%|          | 16/5117 [00:01<09:59,  8.50it/s][A[A

 26%|██▌       | 1338/5117 [28:04<1:19:17,  1.26s/it]


  0%|          | 18/5117 [00:02<14:36,  5.82it/s][A[A

  0%|          | 19/5117 [00:02<13:50,  6.14it/s][A[A

  0%|          | 21/5117 [00:02<10:25,  8.14it/s][A[A

  0%|          | 23/5117 [00:02<10:21,  8.20it/s][A[A

  0%|          | 24/5117 [00:03<11:07,  7.63i

info is zero-valued!!!!!!




 15%|█▍        | 752/5117 [01:39<18:21,  3.96it/s][A[A

 15%|█▍        | 753/5117 [01:39<16:01,  4.54it/s][A[A

 15%|█▍        | 754/5117 [01:40<26:27,  2.75it/s][A[A

 15%|█▍        | 755/5117 [01:40<21:09,  3.44it/s][A[A

 15%|█▍        | 756/5117 [01:40<20:03,  3.62it/s][A[A

 15%|█▍        | 757/5117 [01:41<18:07,  4.01it/s][A[A

 15%|█▍        | 758/5117 [01:41<18:07,  4.01it/s][A[A

 15%|█▍        | 759/5117 [01:41<17:17,  4.20it/s][A[A

 15%|█▍        | 760/5117 [01:41<21:13,  3.42it/s][A[A

 15%|█▍        | 761/5117 [01:42<20:23,  3.56it/s][A[A

 15%|█▍        | 762/5117 [01:42<20:45,  3.50it/s][A[A

 15%|█▍        | 763/5117 [01:42<20:58,  3.46it/s][A[A

 15%|█▍        | 764/5117 [01:42<18:22,  3.95it/s][A[A

 15%|█▍        | 765/5117 [01:43<15:33,  4.66it/s][A[A

 15%|█▍        | 766/5117 [01:43<15:48,  4.59it/s][A[A

 15%|█▍        | 767/5117 [01:43<14:53,  4.87it/s][A[A

 15%|█▌        | 768/5117 [01:43<14:11,  5.11it/s][A[A

 15%|█▌     

info is zero-valued!!!!!!




 15%|█▌        | 785/5117 [01:48<23:23,  3.09it/s][A[A

 15%|█▌        | 786/5117 [01:49<34:48,  2.07it/s][A[A

 15%|█▌        | 787/5117 [01:50<32:17,  2.24it/s][A[A

 15%|█▌        | 788/5117 [01:50<33:01,  2.19it/s][A[A

 15%|█▌        | 789/5117 [01:51<37:07,  1.94it/s][A[A

 15%|█▌        | 790/5117 [01:51<33:35,  2.15it/s][A[A

 15%|█▌        | 791/5117 [01:51<26:38,  2.71it/s][A[A

 15%|█▌        | 792/5117 [01:52<26:02,  2.77it/s][A[A

 15%|█▌        | 793/5117 [01:52<27:52,  2.58it/s][A[A

 16%|█▌        | 794/5117 [01:53<32:22,  2.23it/s][A[A

 16%|█▌        | 795/5117 [01:53<37:37,  1.91it/s][A[A

 16%|█▌        | 796/5117 [01:54<40:59,  1.76it/s][A[A

 16%|█▌        | 797/5117 [01:54<35:58,  2.00it/s][A[A

 16%|█▌        | 798/5117 [01:54<29:11,  2.47it/s][A[A

 16%|█▌        | 799/5117 [01:55<24:44,  2.91it/s][A[A

 16%|█▌        | 800/5117 [01:55<22:58,  3.13it/s][A[A

 16%|█▌        | 801/5117 [01:55<18:31,  3.88it/s][A[A

 16%|█▌     

info is zero-valued!!!!!!




 16%|█▌        | 807/5117 [01:56<14:11,  5.06it/s][A[A

 16%|█▌        | 808/5117 [01:56<12:40,  5.67it/s][A[A

 16%|█▌        | 809/5117 [01:56<12:54,  5.56it/s][A[A

 16%|█▌        | 810/5117 [01:57<14:13,  5.05it/s][A[A

 16%|█▌        | 811/5117 [01:57<14:59,  4.79it/s][A[A

 16%|█▌        | 812/5117 [01:57<14:32,  4.93it/s][A[A

 16%|█▌        | 813/5117 [01:57<14:14,  5.04it/s][A[A

 16%|█▌        | 814/5117 [01:58<16:50,  4.26it/s][A[A

 16%|█▌        | 815/5117 [01:58<21:24,  3.35it/s][A[A

 16%|█▌        | 816/5117 [01:58<22:25,  3.20it/s][A[A

 16%|█▌        | 817/5117 [01:59<19:10,  3.74it/s][A[A

 16%|█▌        | 818/5117 [01:59<20:18,  3.53it/s][A[A

 16%|█▌        | 819/5117 [01:59<20:08,  3.56it/s][A[A

 16%|█▌        | 820/5117 [01:59<19:10,  3.74it/s][A[A

 16%|█▌        | 821/5117 [02:00<18:11,  3.93it/s][A[A

 16%|█▌        | 822/5117 [02:00<18:23,  3.89it/s][A[A

 16%|█▌        | 823/5117 [02:00<17:50,  4.01it/s][A[A

 16%|█▌     

info is zero-valued!!!!!!




 16%|█▋        | 834/5117 [02:02<09:27,  7.54it/s][A[A

 16%|█▋        | 836/5117 [02:02<07:53,  9.04it/s][A[A

info is zero-valued!!!!!!




 16%|█▋        | 838/5117 [02:03<09:02,  7.89it/s][A[A

 16%|█▋        | 839/5117 [02:03<08:49,  8.07it/s][A[A

 16%|█▋        | 841/5117 [02:03<08:21,  8.53it/s][A[A

 16%|█▋        | 842/5117 [02:03<09:57,  7.16it/s][A[A

 16%|█▋        | 843/5117 [02:03<09:33,  7.45it/s][A[A

 16%|█▋        | 844/5117 [02:03<09:07,  7.80it/s][A[A

 17%|█▋        | 845/5117 [02:04<10:16,  6.93it/s][A[A

 17%|█▋        | 847/5117 [02:04<08:43,  8.15it/s][A[A

 17%|█▋        | 849/5117 [02:04<07:02, 10.11it/s][A[A

 17%|█▋        | 851/5117 [02:04<06:37, 10.74it/s][A[A

 17%|█▋        | 853/5117 [02:04<07:50,  9.06it/s][A[A

 17%|█▋        | 854/5117 [02:04<07:56,  8.94it/s][A[A

 17%|█▋        | 856/5117 [02:05<07:17,  9.75it/s][A[A

 17%|█▋        | 858/5117 [02:05<08:31,  8.32it/s][A[A

 17%|█▋        | 859/5117 [02:05<08:16,  8.57it/s][A[A

 17%|█▋        | 860/5117 [02:05<08:17,  8.55it/s][A[A

 17%|█▋        | 861/5117 [02:05<08:06,  8.75it/s][A[A

 17%|█▋     

info is zero-valued!!!!!!




 27%|██▋       | 1387/5117 [03:04<07:16,  8.54it/s][A[A

 27%|██▋       | 1389/5117 [03:04<06:48,  9.12it/s][A[A

 27%|██▋       | 1390/5117 [03:05<07:16,  8.53it/s][A[A

 27%|██▋       | 1391/5117 [03:05<07:12,  8.61it/s][A[A

 27%|██▋       | 1393/5117 [03:05<05:56, 10.44it/s][A[A

 27%|██▋       | 1395/5117 [03:05<06:29,  9.56it/s][A[A

 27%|██▋       | 1397/5117 [03:05<06:08, 10.11it/s][A[A

 27%|██▋       | 1399/5117 [03:06<06:14,  9.92it/s][A[A

 27%|██▋       | 1401/5117 [03:06<05:58, 10.37it/s][A[A

 27%|██▋       | 1403/5117 [03:06<05:54, 10.47it/s][A[A

 27%|██▋       | 1405/5117 [03:06<06:04, 10.18it/s][A[A

 27%|██▋       | 1407/5117 [03:06<05:58, 10.36it/s][A[A

 28%|██▊       | 1409/5117 [03:07<06:39,  9.27it/s][A[A

 28%|██▊       | 1410/5117 [03:07<07:39,  8.07it/s][A[A

 28%|██▊       | 1412/5117 [03:07<06:56,  8.89it/s][A[A

 28%|██▊       | 1413/5117 [03:07<06:53,  8.95it/s][A[A

 28%|██▊       | 1415/5117 [03:07<06:13,  9.91it/s][A

info is zero-valued!!!!!!




 37%|███▋      | 1875/5117 [04:00<05:48,  9.29it/s][A[A

 37%|███▋      | 1877/5117 [04:00<05:19, 10.13it/s][A[A

 37%|███▋      | 1879/5117 [04:00<05:16, 10.22it/s][A[A

 37%|███▋      | 1881/5117 [04:00<05:37,  9.59it/s][A[A

 37%|███▋      | 1883/5117 [04:00<05:45,  9.37it/s][A[A

 37%|███▋      | 1884/5117 [04:00<06:14,  8.63it/s][A[A

 37%|███▋      | 1885/5117 [04:01<08:05,  6.65it/s][A[A

 37%|███▋      | 1886/5117 [04:01<07:47,  6.91it/s][A[A

 37%|███▋      | 1887/5117 [04:01<07:34,  7.11it/s][A[A

 37%|███▋      | 1888/5117 [04:01<07:14,  7.43it/s][A[A

 37%|███▋      | 1890/5117 [04:01<07:01,  7.65it/s][A[A

 37%|███▋      | 1891/5117 [04:01<06:41,  8.03it/s][A[A

 37%|███▋      | 1892/5117 [04:02<07:49,  6.86it/s][A[A

 37%|███▋      | 1894/5117 [04:02<06:20,  8.46it/s][A[A

 37%|███▋      | 1895/5117 [04:02<06:15,  8.57it/s][A[A

 37%|███▋      | 1897/5117 [04:02<05:35,  9.60it/s][A[A

 37%|███▋      | 1898/5117 [04:02<05:34,  9.62it/s][A

info is zero-valued!!!!!!




 48%|████▊     | 2437/5117 [05:07<05:17,  8.44it/s][A[A

 48%|████▊     | 2439/5117 [05:07<05:13,  8.54it/s][A[A

 48%|████▊     | 2440/5117 [05:07<05:54,  7.55it/s][A[A

 48%|████▊     | 2442/5117 [05:07<05:03,  8.82it/s][A[A

 48%|████▊     | 2444/5117 [05:08<04:27,  9.99it/s][A[A

 48%|████▊     | 2446/5117 [05:08<04:18, 10.32it/s][A[A

 48%|████▊     | 2448/5117 [05:08<04:41,  9.48it/s][A[A

 48%|████▊     | 2449/5117 [05:08<04:40,  9.51it/s][A[A

 48%|████▊     | 2450/5117 [05:08<04:40,  9.50it/s][A[A

 48%|████▊     | 2451/5117 [05:08<04:53,  9.07it/s][A[A

 48%|████▊     | 2453/5117 [05:09<04:42,  9.44it/s][A[A

 48%|████▊     | 2454/5117 [05:09<04:39,  9.54it/s][A[A

 48%|████▊     | 2456/5117 [05:09<04:25, 10.01it/s][A[A

 48%|████▊     | 2457/5117 [05:09<04:32,  9.76it/s][A[A

 48%|████▊     | 2459/5117 [05:09<04:06, 10.78it/s][A[A

 48%|████▊     | 2461/5117 [05:09<04:16, 10.37it/s][A[A

 48%|████▊     | 2463/5117 [05:09<04:28,  9.88it/s][A

info is zero-valued!!!!!!




 62%|██████▏   | 3150/5117 [06:27<03:39,  8.94it/s][A[A

 62%|██████▏   | 3152/5117 [06:27<03:14, 10.11it/s][A[A

 62%|██████▏   | 3154/5117 [06:28<03:11, 10.25it/s][A[A

 62%|██████▏   | 3156/5117 [06:28<03:07, 10.45it/s][A[A

 62%|██████▏   | 3158/5117 [06:28<03:04, 10.64it/s][A[A

 62%|██████▏   | 3160/5117 [06:28<03:28,  9.38it/s][A[A

 62%|██████▏   | 3162/5117 [06:28<03:05, 10.54it/s][A[A

 62%|██████▏   | 3164/5117 [06:29<03:05, 10.54it/s][A[A

 62%|██████▏   | 3166/5117 [06:29<03:33,  9.13it/s][A[A

 62%|██████▏   | 3167/5117 [06:29<03:34,  9.09it/s][A[A

 62%|██████▏   | 3168/5117 [06:29<04:03,  8.01it/s][A[A

 62%|██████▏   | 3169/5117 [06:29<04:43,  6.88it/s][A[A

 62%|██████▏   | 3170/5117 [06:30<04:25,  7.33it/s][A[A

 62%|██████▏   | 3171/5117 [06:30<04:11,  7.75it/s][A[A

 62%|██████▏   | 3172/5117 [06:30<04:25,  7.32it/s][A[A

 62%|██████▏   | 3173/5117 [06:30<04:11,  7.74it/s][A[A

 62%|██████▏   | 3174/5117 [06:30<04:13,  7.66it/s][A

info is zero-valued!!!!!!




 75%|███████▍  | 3831/5117 [07:49<02:05, 10.26it/s][A[A

 75%|███████▍  | 3833/5117 [07:49<02:08,  9.96it/s][A[A

 75%|███████▍  | 3835/5117 [07:49<02:22,  9.01it/s][A[A

 75%|███████▍  | 3836/5117 [07:49<02:29,  8.56it/s][A[A

 75%|███████▍  | 3837/5117 [07:49<02:28,  8.59it/s][A[A

 75%|███████▌  | 3839/5117 [07:49<02:10,  9.81it/s][A[A

 75%|███████▌  | 3841/5117 [07:50<02:28,  8.58it/s][A[A

 75%|███████▌  | 3842/5117 [07:50<02:52,  7.38it/s][A[A

 75%|███████▌  | 3843/5117 [07:50<02:51,  7.42it/s][A[A

 75%|███████▌  | 3845/5117 [07:50<02:13,  9.52it/s][A[A

 75%|███████▌  | 3847/5117 [07:50<02:09,  9.81it/s][A[A

 75%|███████▌  | 3849/5117 [07:51<02:06, 10.05it/s][A[A

 75%|███████▌  | 3851/5117 [07:51<01:56, 10.85it/s][A[A

 75%|███████▌  | 3853/5117 [07:51<02:16,  9.23it/s][A[A

 75%|███████▌  | 3854/5117 [07:51<02:15,  9.31it/s][A[A

 75%|███████▌  | 3855/5117 [07:51<02:27,  8.54it/s][A[A

 75%|███████▌  | 3856/5117 [07:51<02:24,  8.76it/s][A

info is zero-valued!!!!!!




 86%|████████▌ | 4393/5117 [08:54<01:13,  9.86it/s][A[A

 86%|████████▌ | 4395/5117 [08:54<01:10, 10.18it/s][A[A

 86%|████████▌ | 4397/5117 [08:54<01:10, 10.27it/s][A[A

 86%|████████▌ | 4399/5117 [08:55<01:14,  9.60it/s][A[A

 86%|████████▌ | 4401/5117 [08:55<01:08, 10.43it/s][A[A

 86%|████████▌ | 4403/5117 [08:55<01:05, 10.86it/s][A[A

 86%|████████▌ | 4405/5117 [08:55<01:01, 11.50it/s][A[A

 86%|████████▌ | 4407/5117 [08:55<01:04, 11.06it/s][A[A

 86%|████████▌ | 4409/5117 [08:55<01:14,  9.55it/s][A[A

 86%|████████▌ | 4411/5117 [08:56<01:14,  9.52it/s][A[A

 86%|████████▌ | 4412/5117 [08:56<01:26,  8.13it/s][A[A

 86%|████████▌ | 4413/5117 [08:56<01:25,  8.20it/s][A[A

 86%|████████▋ | 4415/5117 [08:56<01:15,  9.27it/s][A[A

 86%|████████▋ | 4416/5117 [08:56<01:16,  9.19it/s][A[A

 86%|████████▋ | 4418/5117 [08:56<01:07, 10.36it/s][A[A

 86%|████████▋ | 4420/5117 [08:57<01:25,  8.11it/s][A[A

 86%|████████▋ | 4422/5117 [08:57<01:16,  9.14it/s][A

[ 49  49  49 ... 681 681 681]





X shape (5117, 17807)
y shape (189487,)
class_0 has 106123 that's 56.01% of all elements
class_1 has 20215 that's 10.67% of all elements
class_2 has 58699 that's 30.98% of all elements
class_3 has 4450 that's 2.35% of all elements


 31%|███       | 1057/3411 [01:49<03:27, 11.35it/s]

info is zero-valued!!!!!!


 68%|██████▊   | 2329/3411 [04:03<01:37, 11.13it/s]

info is zero-valued!!!!!!


 84%|████████▍ | 2875/3411 [05:02<00:47, 11.35it/s]

info is zero-valued!!!!!!


100%|██████████| 3411/3411 [05:56<00:00,  9.56it/s]

[ 47  47  47 ... 275 275 275]





In [None]:
#@title Normalize Data
def normalize(X,X_test):
  """Min-max scale both sets with a single scaler.

  The scaler is fitted on the row-wise concatenation of ``X`` and ``X_test``
  and then applied to each of them. The first five values of the first
  training row are printed before and after scaling.

  Returns:
    Tuple ``(X_scl, X_test_scl)``.
  """
  scaler = MinMaxScaler().fit(np.concatenate([X, X_test]))
  X_scl, X_test_scl = (scaler.transform(part) for part in (X, X_test))

  print(f'Before (X): {X[0][:5]}')
  print(f'After (X):  {X_scl[0][:5]}')

  return X_scl, X_test_scl

X, X_test = normalize(X, X_test)

Before (X): [131.83825873 152.41672146 175.70249049 206.52922734 237.78574907]
After (X):  [0.45766033 0.45312038 0.48019994 0.50369433 0.51140292]


In [None]:
#@title Split Data
def split_data(X, y):
    """Hold out 20% of the rows for validation (fixed seed 1) and print all shapes.

    NOTE(review): rows are split independently of each other; when the rows
    are heartbeat epochs, epochs of one recording can presumably end up in
    both parts — confirm whether that is acceptable.

    Returns:
        ``(X_train, X_val, y_train, y_val, X, y)`` with every label array
        flattened to one dimension.
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val = parts[0], parts[1]
    y_train, y_val, y = parts[2].ravel(), parts[3].ravel(), y.ravel()

    report = (
        "X shape %s" %str(X.shape),
        "y shape %s" %str(y.shape),
        "X_train shape %s" %str(X_train.shape),
        "y_train shape %s" %str(y_train.shape),
        "X_val shape %s" %str(X_val.shape),
        "y_val shape %s" %str(y_val.shape),
    )
    for line in report:
        print(line)

    return X_train, X_val, y_train, y_val, X, y

X_train, X_val, y_train, y_val, X, y = split_data(X, y)

X shape (189487, 180)
y shape (189487,)
X_train shape (151589, 180)
y_train shape (151589,)
X_val shape (37898, 180)
y_val shape (37898,)


In [None]:
def f1_weighted(true, pred): #shapes (batch, 4)
    """Return ``1 - F1`` per class, computed from soft (unrounded) predictions.

    Sums are taken over axis 0, so the result has one entry per column of the
    inputs. ``K.epsilon()`` is added to every sum and to the F1 denominator to
    avoid division by zero.

    Despite the name, no class weighting is applied: the support-weighted
    reduction is not performed and the per-class vector is returned as is.
    To use this as a metric instead of a loss, ``pred`` would first have to be
    rounded to one-hot (argmax followed by one-hot encoding).
    """
    eps = K.epsilon()

    true_positives = K.sum(true * pred, axis=0) + eps           # = TP
    precision = true_positives / (K.sum(pred, axis=0) + eps)    # TP / (TP + FP)
    recall = true_positives / (K.sum(true, axis=0) + eps)       # TP / (TP + FN)

    f1 = 2 * (precision * recall) / (precision + recall + eps)

    return 1 - f1

In [None]:
def base_model_copied_from_ML4H_course():
  """Build and compile the pooling CNN; returns ``(model, "CNN_w_pooling")``.

  Architecture: three stages of two valid-padded ReLU convolutions followed
  by max-pooling and dropout, then two 256-filter convolutions with global
  max-pooling and dropout, two 64-unit ReLU dense layers and a 4-way softmax.
  The input has shape ``(INTERVAL_SIZE, 1)``.

  NOTE(review): the tfa F-score metrics are combined with a sparse
  (integer-label) loss — confirm they accept non-one-hot labels.
  """
  nclass = 4
  inp = Input(shape=(INTERVAL_SIZE, 1))

  def conv_pair(tensor, filters, kernel_size):
    # Two identical valid-padded ReLU convolutions in a row.
    for _ in range(2):
      tensor = Convolution1D(filters, kernel_size=kernel_size, activation=activations.relu, padding="valid")(tensor)
    return tensor

  features = inp
  for filters, kernel_size in ((16, 5), (32, 3), (32, 3)):
    features = conv_pair(features, filters, kernel_size)
    features = MaxPool1D(pool_size=2)(features)
    features = Dropout(rate=0.1)(features)

  features = conv_pair(features, 256, 3)
  features = GlobalMaxPool1D()(features)
  features = Dropout(rate=0.2)(features)

  head = features
  for _ in range(2):
    head = Dense(64, activation=activations.relu)(head)
  probabilities = Dense(nclass, activation=activations.softmax, )(head)

  model = models.Model(inputs=inp, outputs=probabilities)

  metric_list = [
      tf.keras.metrics.SparseCategoricalAccuracy(),
      tfa.metrics.F1Score(num_classes=4, average='micro'),
      tfa.metrics.FBetaScore(beta=2.0, num_classes=4, average='micro'),
  ]
  model.compile(optimizer=optimizers.SGD(0.00031),
                loss=losses.sparse_categorical_crossentropy,
                metrics=metric_list)
  model.summary()
  return model, "CNN_w_pooling"

def threegru_model_from_ML4H():
  """Build and compile the three-layer bidirectional GRU; returns ``(model, "threeGRU")``.

  Three bidirectional GRU layers (32, 64 and 32 units, all returning full
  sequences), each followed by dropout 0.1, then flatten, a 32-unit dense
  layer with ReLU and a softmax over ``nclass`` classes. The input has shape
  ``(INTERVAL_SIZE, 1)``.

  NOTE(review): the tfa F-score metrics are combined with a sparse
  (integer-label) loss — confirm they accept non-one-hot labels.
  """
  nclass = 4

  stack = [
      Bidirectional(GRU(32, return_sequences=True), input_shape=(INTERVAL_SIZE,1)),
      Dropout(0.1),
      Bidirectional(GRU(64,  return_sequences=True)),
      Dropout(0.1),
      Bidirectional(GRU(32, return_sequences=True)),
      Dropout(0.1),
      Flatten(),
      Dense(32),
      ReLU(),
      Dense(nclass, activation=activations.softmax),
  ]
  model = Sequential(stack)

  metric_list = [
      tf.keras.metrics.SparseCategoricalAccuracy(),
      tfa.metrics.F1Score(num_classes=nclass, average='micro'),
      tfa.metrics.FBetaScore(beta=2.0, num_classes=nclass, average='micro'),
  ]
  model.compile(optimizer=optimizers.SGD(0.001),
                loss=losses.sparse_categorical_crossentropy,
                metrics=metric_list)
  model.summary()
  return model, "threeGRU"

def neural_network():
  """Build and compile the LeakyReLU CNN; returns ``(model, "CNN_w_leakyReLU")``.

  The flat input of ``INTERVAL_SIZE`` values is reshaped to one channel and
  passed through five same-padded 128-filter convolutions (kernel size 5),
  each followed by LeakyReLU, then flattened into a 128-unit dense layer
  with LeakyReLU and a 4-way softmax.

  NOTE(review): the tfa F-score metrics are combined with a sparse
  (integer-label) loss — confirm they accept non-one-hot labels.
  """
  stack = [Reshape((INTERVAL_SIZE, 1), input_shape=(INTERVAL_SIZE,))]
  for i in range(5):
    stack += [
        Conv1D(filters=128, kernel_size=5, padding='same', name=f"Conv_{i+1}"),
        LeakyReLU(alpha=0.01, name=f"ReLU_{i+1}"),
    ]
  stack += [Flatten(), Dense(units=128), LeakyReLU(alpha=0.01), Dense(4), Softmax()]

  model = Sequential(stack, name="Model")

  metric_list = [
      tf.keras.metrics.SparseCategoricalAccuracy(),
      tfa.metrics.F1Score(num_classes=4, average='micro'),
      tfa.metrics.FBetaScore(beta=2.0, num_classes=4, average='micro'),
  ]
  model.compile(optimizer=optimizers.SGD(0.001),
                loss=losses.sparse_categorical_crossentropy,
                metrics=metric_list)
  model.summary()
  return model, "CNN_w_leakyReLU"

def dense_MLP():
  """Build and compile a fully connected network; returns ``(model, "dense_MLP")``.

  Three 256-unit dense layers with ReLU operate directly on the flat input of
  ``INTERVAL_SIZE`` values, followed by a 4-way softmax.

  The previous version first reshaped the input to ``(INTERVAL_SIZE, 1)``.
  A Dense layer acts on the last axis only, so every hidden layer was applied
  to each time step separately with a single input feature (the first layer
  had only 512 parameters) and the samples were mixed only by the final
  layer. Feeding the flat vector makes the hidden layers see the whole
  signal.

  NOTE(review): the tfa F-score metrics are combined with a sparse
  (integer-label) loss — confirm they accept non-one-hot labels.
  """
  model = Sequential(name="Model")
  # The first layer declares the flat input shape; no Reshape/Flatten is needed.
  model.add(Dense(units=256, input_shape=(INTERVAL_SIZE,)))
  model.add(ReLU())
  for _ in range(2):
    model.add(Dense(units=256))
    model.add(ReLU())
  model.add(Dense(4))
  model.add(Softmax())
  opt = optimizers.SGD(0.001)
  model.compile(optimizer=opt, loss=losses.sparse_categorical_crossentropy, metrics=[tf.keras.metrics.SparseCategoricalAccuracy(),
                           tfa.metrics.F1Score(num_classes=4, average='micro'),
                           tfa.metrics.FBetaScore(beta=2.0, num_classes=4, average='micro')])
  model.summary()
  return model, "dense_MLP"

In [None]:
#@title Neural Network

# Pick the architecture to train; every builder returns (model, name).
# model, MODEL_NAME = neural_network()
model, MODEL_NAME = dense_MLP()

MODEL_PATH = PATH + f"models/{MODEL_NAME}.h5"

# All callbacks track the validation F1 metric: keep the best weights, stop
# after 4 epochs without improvement, and lower the learning rate after 3.
checkpoint = ModelCheckpoint(MODEL_PATH, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_f1_score", mode="max", patience=4, verbose=1)
redonplat = ReduceLROnPlateau(monitor="val_f1_score", mode="max", patience=3, verbose=2)
tensorboard = TensorBoard(log_dir=PATH + f"logs/{MODEL_NAME}")
callbacks_list = [checkpoint, early, redonplat, tensorboard]

# Fit on the training split only and validate on the held-out split. Fitting
# on the full X with validation_split would train on rows of X_val, which the
# later cells use for scoring.
model.fit(X_train, y_train, epochs=1000, verbose=1, callbacks=callbacks_list, validation_data=(X_val, y_val))


Model: "Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape_1 (Reshape)         (None, 180, 1)            0         
                                                                 
 dense_15 (Dense)            (None, 180, 256)          512       
                                                                 
 re_lu_4 (ReLU)              (None, 180, 256)          0         
                                                                 
 dense_16 (Dense)            (None, 180, 256)          65792     
                                                                 
 re_lu_5 (ReLU)              (None, 180, 256)          0         
                                                                 
 dense_17 (Dense)            (None, 180, 256)          65792     
                                                                 
 re_lu_6 (ReLU)              (None, 180, 256)          0     

KeyboardInterrupt: ignored

In [None]:
def convert_categories(y):
  """Map every row of ``y`` to the index of its largest entry."""
  winners = list(map(np.argmax, y))
  return np.asarray(winners)

In [None]:
# Raw model outputs for the validation split (softmax scores for the builders
# defined in this notebook), one row per validation sample.
y_pred=model.predict(X_val)
print(y_pred)
# Collapse every row to the index of its largest score, i.e. the predicted
# class. y_pred is rebound from scores to labels here.
y_pred = np.argmax(y_pred, axis=-1)
print(y_pred)

In [None]:
#@title Score
# Micro-averaged F1 of the predicted validation labels against y_val.
print(f"Score: {f1_score(y_val, y_pred, average='micro')}")

In [None]:
#@title Output
# model.predict returns one row of class scores per input row. Reduce it to one
# integer label per row so that the table has exactly the two columns named in
# the header; stacking the raw scores produced five columns, which np.savetxt
# rejects for a two-entry fmt.
y_test = np.argmax(model.predict(X_test), axis=-1)
print(y_test[:10])
ids = np.arange(len(y_test))
output_arr = np.column_stack((ids, y_test))
print(output_arr[:10])
# NOTE(review): X_test holds one row per heartbeat epoch after
# expand_datset_all_epochs, so these ids enumerate epochs rather than the
# original test recordings — confirm the expected submission layout and
# aggregate per recording if needed.
np.savetxt(PATH + 'y_test.csv', output_arr, delimiter=',', header="id,y", comments='', fmt=["%d","%d"])

# Experiments

In [None]:
#@title Convert data to Frequency domain

def frequency_domain(ts, sample_rate=300, max_frequency=75, n_points=300):
  """Return a fixed-size magnitude spectrum of one NaN-padded series.

  The valid part of the series (everything before the first NaN) is
  truncated to a whole multiple of
  ``round(n_points * sample_rate / max_frequency)`` samples, transformed with
  a real FFT, and ``n_points`` evenly spaced bins below ``max_frequency`` are
  selected.

  Fixes over the previous version:
    * the FFT is taken from ``np.fft``; the bare ``fft`` name was never
      imported in this notebook;
    * a series without any NaN is now used in full (its length used to be
      reported one sample short, which could drop a whole interval);
    * a series too short for a single interval raises a descriptive
      ``ValueError`` instead of failing inside the FFT.

  Args:
    ts: One-dimensional series, optionally padded with NaN at the end.
    sample_rate: Sampling rate of ``ts`` in Hz.
    max_frequency: Upper frequency limit in Hz (capped at ``sample_rate / 2``).
    n_points: Number of frequency bins returned.

  Returns:
    Tuple ``(xf, yf)``: selected frequencies and their magnitudes.

  Raises:
    ValueError: If the valid part of ``ts`` is shorter than one interval.
  """
  max_frequency = min(max_frequency, sample_rate / 2)
  interval_size = round(n_points * sample_rate / max_frequency)

  ts = np.asarray(ts, dtype=float)
  nan_positions = np.flatnonzero(np.isnan(ts))
  valid_length = int(nan_positions[0]) if nan_positions.size else len(ts)

  # Use a whole number of intervals so that enough bins lie below max_frequency.
  usable = (valid_length // interval_size) * interval_size
  if usable == 0:
    raise ValueError(
        f"series has {valid_length} valid samples, at least {interval_size} are required")
  data = ts[:usable]

  yf = np.abs(np.fft.rfft(data))
  xf = np.fft.rfftfreq(len(data), 1 / sample_rate)

  # Pick n_points evenly spaced bin indices among the bins below max_frequency.
  n_below = np.count_nonzero(xf < max_frequency)
  sample_points = np.rint(np.linspace(0, n_below - 1, n_points)).astype(int)
  return xf[sample_points], yf[sample_points]




def convert_data(X, X_test):
  """Replace every series in both sets by its magnitude spectrum.

  Only the second element returned by ``frequency_domain`` (the magnitudes)
  is kept for each row.
  """
  def magnitudes(rows):
    return np.asarray([frequency_domain(row)[1] for row in rows])

  return magnitudes(X), magnitudes(X_test)


X_frq, X_test_frq = convert_data(X_imp, X_test_imp)


In [None]:
def standardise_data(X,X_test):
  """Standardise both sets with a single ``StandardScaler``.

  The scaler is fitted on the row-wise concatenation of ``X`` and ``X_test``
  and then applied to each of them. The first five values of the first
  training row are printed before and after scaling.

  Returns:
    Tuple ``(X_scl, X_test_scl)``.
  """
  # MinMaxScaler() is the drop-in alternative that was tried here.
  scaler = StandardScaler().fit(np.concatenate([X, X_test]))
  X_scl, X_test_scl = (scaler.transform(part) for part in (X, X_test))

  print(f'Before Scaling: {X[0][:5]}')
  print(f'After Scaling:  {X_scl[0][:5]}')
  return X_scl, X_test_scl

X, X_test = standardise_data(X_frq, X_test_frq)
#X, X_test = X_frq, X_test_frq #skip standardise

In [None]:
#@title Classifier based on Frequency
# NOTE(review): X_train / y_train / X_val / y_val are not re-created in this
# section. The only split visible in this notebook is made on the
# heartbeat-epoch data before "Experiments", so the frequency features
# computed above are presumably not what is fitted here — confirm, and
# re-split the frequency features with per-recording labels if intended.
# NOTE(review): KNeighborsTimeSeriesClassifier is not imported anywhere in the
# visible notebook.
clf = KNeighborsTimeSeriesClassifier(n_neighbors=5, weights='distance', distance='dtw')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(f'Score: {f1_score(y_val, y_pred, average="micro")}')