## Initializations and imports

In [None]:
import os
from datetime import datetime
import random

In [None]:
from PIL import Image
import numpy as np

In [None]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


## Extracting the datasets with bpf 5

In [None]:
def unzip_dataset(path, dataset_name):
  """Extract one EKM archive and flatten it into a single image folder.

  Steps, all performed through shell commands:
    1. Extract the gzip-compressed tar archive at ``path`` into the
       current working directory.
    2. Rename the extracted ``EKM_dataset`` folder to
       ``EKM_<dataset_name>_dataset``.
    3. Move every file from its ``train`` and ``test`` sub-folders into
       the dataset folder itself, then delete the two sub-folders.

  Args:
    path: Path of the archive. It is interpolated unquoted into the shell
      command, so any spaces must already be escaped by the caller.
    dataset_name: Short dataset name (e.g. "NSRDB") used to build the
      final folder name.
  """
  # Extracting the gzip-compressed tar archive of the dataset
  ! tar -xzf $path

  # Just changing the name of the extracted folder to EKM_<dataset_name>_dataset
  dataset_name = f"EKM_{dataset_name}_dataset"
  ! mv EKM_dataset $dataset_name

  # Moving the train and test EKMs into one folder
  # NOTE(review): these two paths are absolute under /content while the other
  # commands use relative paths, so this assumes the working directory is
  # /content - confirm.
  train_source_path = f"/content/{dataset_name}/train"
  dest = f"/content/{dataset_name}"

  # The train files are piped through xargs rather than moved with a "*" glob
  # (presumably to stay under the shell's argument-length limit - confirm).
  ! cd $train_source_path; ls | xargs realpath | xargs mv -t $dest
  ! mv $dataset_name/test/* $dataset_name

  # Removing the now-unneeded train/ and test/ sub-folders
  ! rm -r $dataset_name/train
  ! rm -r $dataset_name/test

In [None]:
! ls /content/drive/MyDrive/ECG\ project/ -ltrh

total 33M
drwx------ 2 root root 4.0K Aug 10 01:23 'N. Mokhtari'
-rw------- 1 root root 3.5M Aug 10 09:09  EKM_PTBDB_5bpf.tar.gz
-rw------- 1 root root 8.8M Aug 13 13:55  EKM_MIT_DB_5bpf.tar.gz
-rw------- 1 root root  21M Aug 13 19:40  EKM_NSRDB_5bpf.tar.gz


In [None]:
# Drive folder holding the three 5-bpf EKM archives.
# The space is backslash-escaped because unzip_dataset interpolates these
# paths unquoted into shell commands. A raw string keeps that backslash
# explicitly instead of relying on the invalid "\ " escape sequence, which
# newer Python versions warn about.
source_path = r"/content/drive/MyDrive/ECG\ project"

nsrdb_path = f"{source_path}/EKM_NSRDB_5bpf.tar.gz"
mitdb_path = f"{source_path}/EKM_MIT_DB_5bpf.tar.gz"
ptbdb_path = f"{source_path}/EKM_PTBDB_5bpf.tar.gz"

# Each call leaves a flat /content/EKM_<name>_dataset folder of images.
unzip_dataset(nsrdb_path, "NSRDB")
unzip_dataset(mitdb_path, "MITDB")
unzip_dataset(ptbdb_path, "PTBDB")

## Getting equal number of users from datasets

Find the smallest number of users across all datasets, then randomly select that same number of users from each of the larger datasets.

In [None]:
# We know that NSRDB has 18 users,
#              MITDB has 48 users,
#              PTBDB has 290 users,
# so we randomly select 18 users from MITDB and PTBDB users

In [None]:
users_amount = 18

In [None]:
# Expected number of distinct users in each dataset. users_list() compares
# the number of user ids it finds against these values as a sanity check.
dataset_users_amount_dict = {
    "PTBDB": 290,
    "MITDB": 48,
    "NSRDB": 18
}

In [None]:
def users_list(dataset_path, dataset_name):
  """Return the distinct user ids found in a dataset folder.

  The user id is the second-to-last "-"-separated field of each file name
  in ``dataset_path``. Ids are returned in first-seen order, i.e. the order
  in which ``os.listdir`` yields the files.

  The number of ids found is compared with the expected count stored in the
  module-level ``dataset_users_amount_dict`` and the outcome is printed. A
  mismatch is only reported; it does not raise.

  Args:
    dataset_path: Directory containing the dataset's image files.
    dataset_name: Key into ``dataset_users_amount_dict`` (e.g. "PTBDB").

  Returns:
    list of str: the unique user ids.

  Raises:
    KeyError: if ``dataset_name`` is not in ``dataset_users_amount_dict``.
    IndexError: if a file name contains no "-" separator.
  """
  # dict.fromkeys de-duplicates in linear time while keeping first-seen
  # order, unlike scanning a growing list for every file.
  dataset_users = list(dict.fromkeys(
      img_name.split("-")[-2] for img_name in os.listdir(dataset_path)
  ))

  # Checking the correctness.
  # Adjacent f-strings are used instead of a backslash continuation inside
  # the literal, which embedded the source indentation in the message.
  expected_users = dataset_users_amount_dict[dataset_name]
  if len(dataset_users) == expected_users:
    print(f"All good with {len(dataset_users)} users for "
          f"{dataset_name} dataset.")
  else:
    print(f"Well, amount of users is {len(dataset_users)} for "
          f"{dataset_name} dataset so something is wrong!")

  return dataset_users

In [None]:
ptbdb_users = users_list("/content/EKM_PTBDB_dataset", "PTBDB")

All good with 290 users for     PTBDB dataset.


In [None]:
mitdb_users = users_list("/content/EKM_MITDB_dataset", "MITDB")

All good with 48 users for     MITDB dataset.


In [None]:
nsrdb_users = users_list("/content/EKM_NSRDB_dataset", "NSRDB")

All good with 18 users for     NSRDB dataset.


In [None]:
# Getting required amount (users_amount) of users randomly for PTBDB.
# A dedicated seeded generator over the sorted ids makes the selection
# repeatable across runs: the global random module was never seeded, and
# the ids arrive in os.listdir order, which is not guaranteed.
ptbdb_random_users = random.Random(42).sample(sorted(ptbdb_users), users_amount)

In [None]:
# Getting required amount (users_amount) of users randomly for MITDB.
# A dedicated seeded generator over the sorted ids makes the selection
# repeatable across runs: the global random module was never seeded, and
# the ids arrive in os.listdir order, which is not guaranteed.
mitdb_random_users = random.Random(42).sample(sorted(mitdb_users), users_amount)

## Getting average amount of EKMs for chosen users

In [None]:
# Per-user bookkeeping for NSRDB (all NSRDB users are kept):
#   NSRDB_users_EKM_amount_dict -> how many EKMs the user has
#   NSRDB_users_EKMs_dict       -> file names of the user's EKMs
NSRDB_users_EKM_amount_dict = dict.fromkeys(nsrdb_users, 0)
NSRDB_users_EKMs_dict = {user_id: [] for user_id in nsrdb_users}

In [None]:
# Walk the NSRDB folder once: file every EKM under its user and bump that
# user's counter.
dataset_path = "/content/EKM_NSRDB_dataset"
image_files = os.listdir(dataset_path)

for img_name in image_files:
  # The user id is the second-to-last "-" separated field of the file name
  user_id = img_name.rsplit("-", 2)[-2]
  NSRDB_users_EKMs_dict[user_id].append(img_name)
  NSRDB_users_EKM_amount_dict[user_id] += 1

In [None]:
NSRDB_users_EKM_amount_dict

{'19140': 3000,
 '16483': 3000,
 '16795': 3000,
 '16273': 3000,
 '16265': 3000,
 '16420': 3000,
 '16786': 3000,
 '16539': 3000,
 '16773': 3000,
 '18184': 3000,
 '19830': 3000,
 '19090': 3000,
 '18177': 3000,
 '16272': 3000,
 '19093': 3000,
 '19088': 3000,
 '17453': 3000,
 '17052': 3000}

In [None]:
# Per-user bookkeeping for the randomly chosen MITDB users:
#   MITDB_random_users_EKM_amount_dict -> how many EKMs the user has
#   MITDB_random_users_EKMs_dict       -> file names of the user's EKMs
MITDB_random_users_EKM_amount_dict = dict.fromkeys(mitdb_random_users, 0)
MITDB_random_users_EKMs_dict = {user_id: [] for user_id in mitdb_random_users}

In [None]:
# Walk the MITDB folder once: for the randomly chosen users only, file every
# EKM under its user and bump that user's counter.
dataset_path = "/content/EKM_MITDB_dataset"
image_files = os.listdir(dataset_path)

for img_name in image_files:
  # The user id is the second-to-last "-" separated field of the file name
  user_id = img_name.rsplit("-", 2)[-2]
  if user_id not in mitdb_random_users:
    continue
  MITDB_random_users_EKMs_dict[user_id].append(img_name)
  MITDB_random_users_EKM_amount_dict[user_id] += 1

In [None]:
MITDB_random_users_EKM_amount_dict

{'210': 518,
 '124': 319,
 '223': 518,
 '222': 489,
 '100': 454,
 '221': 480,
 '121': 372,
 '107': 423,
 '205': 530,
 '112': 508,
 '213': 649,
 '118': 455,
 '220': 409,
 '103': 416,
 '104': 411,
 '102': 437,
 '209': 601,
 '230': 451}

In [None]:
int(sum(MITDB_random_users_EKM_amount_dict.values()) / users_amount)

468

In [None]:
# Per-user bookkeeping for the randomly chosen PTBDB users:
#   ptbdb_random_users_EKM_amount_dict -> how many EKMs the user has
#   ptbdb_random_users_EKMs_dict       -> file names of the user's EKMs
ptbdb_random_users_EKM_amount_dict = dict.fromkeys(ptbdb_random_users, 0)
ptbdb_random_users_EKMs_dict = {user_id: [] for user_id in ptbdb_random_users}

In [None]:
# Walk the PTBDB folder once: for the randomly chosen users only, file every
# EKM under its user and bump that user's counter.
dataset_path = "/content/EKM_PTBDB_dataset"
image_files = os.listdir(dataset_path)

for img_name in image_files:
  # The user id is the second-to-last "-" separated field of the file name
  user_id = img_name.rsplit("-", 2)[-2]
  if user_id not in ptbdb_random_users:
    continue
  ptbdb_random_users_EKMs_dict[user_id].append(img_name)
  ptbdb_random_users_EKM_amount_dict[user_id] += 1

In [None]:
ptbdb_random_users_EKM_amount_dict

{'093': 56,
 '216': 27,
 '191': 25,
 '181': 14,
 '232': 36,
 '154': 6,
 '138': 11,
 '257': 31,
 '009': 9,
 '168': 14,
 '064': 34,
 '008': 33,
 '060': 33,
 '072': 49,
 '234': 21,
 '279': 32,
 '001': 45,
 '022': 37}

In [None]:
# Mean number of EKMs per selected PTBDB user, truncated to a whole number
total_ptbdb_EKMs = sum(ptbdb_random_users_EKM_amount_dict.values())
average_EKMs_amount_each_user = total_ptbdb_EKMs // users_amount
average_EKMs_amount_each_user

28

## Getting random EKMs for each user in different datasets

In [None]:
# NOTE(review): this replaces the average computed from the PTBDB counts in
# the previous section with a fixed value of 20 EKMs per user; the reason for
# choosing 20 is not stated in the notebook.
average_EKMs_amount_each_user = 20

In [None]:
# Getting random EKMs from NSRDB's users' EKMs
# - A seeded generator, sorted users and sorted file names make the draw
#   repeatable (the file names were collected in os.listdir order, which is
#   not guaranteed).
# - min() takes all EKMs of a user who has fewer than the requested amount,
#   where random.sample would otherwise raise ValueError.
# - extend() appends in place instead of rebuilding the list every iteration.
rng = random.Random(42)
NSRDB_EKMs = []
for user in sorted(NSRDB_users_EKMs_dict):
  user_EKMs = sorted(NSRDB_users_EKMs_dict[user])
  sample_size = min(len(user_EKMs), average_EKMs_amount_each_user)
  NSRDB_EKMs.extend(rng.sample(user_EKMs, sample_size))

In [None]:
# Getting random EKMs from MITDB's users' EKMs
# Note: a user with fewer EKMs than the requested amount contributes all of
# his/her EKMs (min() caps the sample size at what is available).
# - A seeded generator, sorted users and sorted file names make the draw
#   repeatable (the file names were collected in os.listdir order, which is
#   not guaranteed).
# - extend() appends in place instead of rebuilding the list every iteration.
rng = random.Random(42)
MITDB_EKMs = []
for user in sorted(MITDB_random_users_EKMs_dict):
  user_EKMs = sorted(MITDB_random_users_EKMs_dict[user])
  sample_size = min(len(user_EKMs), average_EKMs_amount_each_user)
  MITDB_EKMs.extend(rng.sample(user_EKMs, sample_size))

In [None]:
# Getting random EKMs from PTBDB's users' EKMs
# Note: a user with fewer EKMs than the requested amount contributes all of
# his/her EKMs (min() caps the sample size at what is available).
# - A seeded generator, sorted users and sorted file names make the draw
#   repeatable (the file names were collected in os.listdir order, which is
#   not guaranteed).
# - extend() appends in place instead of rebuilding the list every iteration.
rng = random.Random(42)
PTBDB_EKMs = []
for user in sorted(ptbdb_random_users_EKMs_dict):
  user_EKMs = sorted(ptbdb_random_users_EKMs_dict[user])
  sample_size = min(len(user_EKMs), average_EKMs_amount_each_user)
  PTBDB_EKMs.extend(rng.sample(user_EKMs, sample_size))

## Vectorizing the images

In [None]:
def vertorizing_png_imges(address, desired_width=33, desired_height=21):
  """Load an image file and return it as a normalized RGB array.

  Args:
    address: Path of the image file to load.
    desired_width: Width in pixels the image is resized to (default 33).
    desired_height: Height in pixels the image is resized to (default 21).

  Returns:
    numpy.ndarray of dtype float32 with shape
    (desired_height, desired_width, 3) and values scaled to [0, 1].
  """
  # Open through a context manager so the file handle is released as soon as
  # the pixels have been read, instead of staying open until garbage
  # collection; this function is called once per image over whole datasets.
  with Image.open(address) as image:
    # convert() loads the pixel data and returns a new RGB image; resize()
    # brings it to the input size expected by the CNN. The result does not
    # depend on the file staying open.
    resized = image.convert('RGB').resize((desired_width, desired_height))

  # Convert to a NumPy array and normalize the 0-255 channel values
  return np.array(resized).astype('float32') / 255.0

In [None]:
from IPython.display import clear_output

def progress_bar(index, path, total_length=None):
  """Redraw a 50-character text progress bar in the current cell output.

  Args:
    index: Position of the item being processed (the loop index).
    path: Directory whose number of entries is used as the total when
      ``total_length`` is not given.
    total_length: Optional total item count. Pass it to avoid listing
      ``path`` again on every call.
  """
  bar_length = 50

  if total_length is None:
    total_length = len(os.listdir(path))

  # Scale index directly onto the bar instead of dividing by an integer
  # "step": that step was 0 for fewer than bar_length items
  # (ZeroDivisionError) and let the star count overshoot the bar when the
  # total was not a multiple of bar_length. Clamping keeps the bar exactly
  # bar_length characters wide.
  if total_length > 0:
    progress = min(bar_length, max(0, index * bar_length // total_length))
  else:
    progress = 0

  # Clear the current cell's output
  clear_output(wait=True)

  print("[" + "*" * progress + "-" * (bar_length - progress) + "]")
  print(f"{index}/{total_length}")

### NSRDB vectorization

In [None]:
X_nsrdb_dataset = []
y_nsrdb_dataset = []

In [None]:
# X data for cnn network input
# Builds X_nsrdb_dataset (vectorized images) and y_nsrdb_dataset (user ids)
# from the randomly chosen NSRDB EKMs only.
base_path = "/content/EKM_NSRDB_dataset/"
images_names = os.listdir(base_path)

# Set lookup is constant-time; scanning the NSRDB_EKMs list for every file in
# the folder was accidentally quadratic.
selected_EKMs = set(NSRDB_EKMs)

# Start from empty lists so re-running this cell does not append duplicates.
X_nsrdb_dataset = []
y_nsrdb_dataset = []

before_run_time = datetime.now()

for index, img_name in enumerate(images_names):
    # Vectorizing just the EKMs that have been randomly chosen
    if img_name not in selected_EKMs:
      continue

    X_nsrdb_dataset.append(vertorizing_png_imges(base_path + img_name))

    # The label is the user id: second-to-last "-" field of the file name
    y_nsrdb_dataset.append(img_name.split("-")[-2])

    progress_bar(index, base_path)

after_run_time = datetime.now()
diff = after_run_time - before_run_time
# divmod so the seconds shown are the remainder after the whole minutes
# rather than the total elapsed seconds.
minutes, seconds = divmod(int(diff.total_seconds()), 60)
print(f"This cell took {minutes} minutes and {seconds} seconds to run.")

[*************************************************-]
53617/54000
This cell took 0 minutes and 24 seconds to run.


### MITDB vectorization

In [None]:
X_MITDB_dataset = []
y_MITDB_dataset = []

In [None]:
# X data for cnn network input
# Builds X_MITDB_dataset (vectorized images) and y_MITDB_dataset (labels)
# from the randomly chosen MITDB EKMs only.
base_path = "/content/EKM_MITDB_dataset/"
images_names = os.listdir(base_path)

# Set lookup is constant-time; scanning the MITDB_EKMs list for every file in
# the folder was accidentally quadratic.
selected_EKMs = set(MITDB_EKMs)

# Start from empty lists so re-running this cell does not append duplicates.
X_MITDB_dataset = []
y_MITDB_dataset = []

before_run_time = datetime.now()

for index, img_name in enumerate(images_names):
    # Vectorizing just the EKMs that have been randomly chosen
    if img_name not in selected_EKMs:
      continue

    X_MITDB_dataset.append(vertorizing_png_imges(base_path + img_name))

    # The label is the user id (second-to-last "-" field of the file name)
    # shifted by 100.
    # NOTE(review): the offset presumably keeps MITDB labels apart from the
    # other datasets' labels - confirm.
    y_MITDB_dataset.append(str(int(img_name.split("-")[-2]) + 100))

    progress_bar(index, base_path)

after_run_time = datetime.now()
diff = after_run_time - before_run_time
# divmod so the seconds shown are the remainder after the whole minutes
# rather than the total elapsed seconds.
minutes, seconds = divmod(int(diff.total_seconds()), 60)
print(f"This cell took {minutes} minutes and {seconds} seconds to run.")

[*************************************************-]
21556/21613
This cell took 0 minutes and 14 seconds to run.


### PTBDB vectorization

In [None]:
X_PTBDB_dataset = []
y_PTBDB_dataset = []

In [None]:
# X data for cnn network input
# Builds X_PTBDB_dataset (vectorized images) and y_PTBDB_dataset (labels)
# from the randomly chosen PTBDB EKMs only.
base_path = "/content/EKM_PTBDB_dataset/"
images_names = os.listdir(base_path)

# Set lookup is constant-time; scanning the PTBDB_EKMs list for every file in
# the folder was accidentally quadratic.
selected_EKMs = set(PTBDB_EKMs)

# Start from empty lists so re-running this cell does not append duplicates.
X_PTBDB_dataset = []
y_PTBDB_dataset = []

before_run_time = datetime.now()

for index, img_name in enumerate(images_names):
    # Vectorizing just the EKMs that have been randomly chosen
    if img_name not in selected_EKMs:
      continue

    X_PTBDB_dataset.append(vertorizing_png_imges(base_path + img_name))

    # The label is the user id (second-to-last "-" field of the file name)
    # shifted by 1000.
    # ToDo (optional): 1000 can be replaced with the actual number of the
    # previous datasets' users
    y_PTBDB_dataset.append(str(int(img_name.split("-")[-2]) + 1000))

    progress_bar(index, base_path)

after_run_time = datetime.now()
diff = after_run_time - before_run_time
# divmod so the seconds shown are the remainder after the whole minutes
# rather than the total elapsed seconds.
minutes, seconds = divmod(int(diff.total_seconds()), 60)
print(f"This cell took {minutes} minutes and {seconds} seconds to run.")

[**************************************************]
9294/9297
This cell took 0 minutes and 9 secondsto run.


## Creating new dataset with equal users from each dataset and also with equal amount of EKMs for each user

In [None]:
X_mixed_dataset = []
y_mixed_dataset = []

In [None]:
# Stack the three per-dataset image lists (NSRDB, MITDB, PTBDB, in that
# order) into one array. It is built from the source lists directly rather
# than by extending X_mixed_dataset in place, so the cell can be re-run:
# after the first run X_mixed_dataset is an ndarray, which has no .extend().
X_mixed_dataset = np.array(X_nsrdb_dataset + X_MITDB_dataset + X_PTBDB_dataset)

In [None]:
# Concatenate the three per-dataset label lists in the same order as the
# images (NSRDB, MITDB, PTBDB). It is built from the source lists directly
# rather than by extending y_mixed_dataset in place, so the cell can be
# re-run: after the first run y_mixed_dataset is an ndarray, which has no
# .extend().
y_mixed_dataset = np.array(y_nsrdb_dataset + y_MITDB_dataset + y_PTBDB_dataset)

## CNN Architecture

In [None]:
len(X_mixed_dataset)

1034

In [None]:
X_mixed_dataset[0].shape

(21, 33, 3)

In [None]:
def numerical_labels(labels_arr, dict_of_labels=None):
  """Convert labels to integer class indices.

  Args:
    labels_arr: Iterable of labels (e.g. an array of user-id strings).
    dict_of_labels: Optional mapping label -> integer index. When omitted
      or empty, a mapping is built from the sorted unique values of
      ``labels_arr``; an empty dict passed by the caller is filled in
      place. For test data, pass the mapping returned for the train data
      so both use the same indices.

  Returns:
    tuple: ``(dict_of_labels, num_lbls)`` where ``num_lbls`` is a NumPy
    array holding the index of every label in ``labels_arr``.

  Raises:
    KeyError: if a label is missing from a supplied ``dict_of_labels``.
  """
  # A fresh dict per call: the previous mutable default ({}) was filled on the
  # first call and then silently reused by every later call made without an
  # explicit mapping.
  if dict_of_labels is None:
    dict_of_labels = {}

  if not dict_of_labels:
    for i, value in enumerate(np.unique(labels_arr)):
      dict_of_labels[value] = i

  num_lbls = np.array([dict_of_labels[lbl] for lbl in labels_arr])

  return dict_of_labels, num_lbls

In [None]:
label_dict, numerical_y_mixed_labels = numerical_labels(y_mixed_dataset)

In [None]:
len(np.unique(numerical_y_mixed_labels))

54

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the mixed dataset for testing; the fixed random_state makes
# the split repeatable for the same input arrays.
X_train, X_test, y_train, y_test = train_test_split(X_mixed_dataset, numerical_y_mixed_labels, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# Creating the CNN model
# The input and output sizes are derived from the data instead of being
# hard-coded, so the network stays consistent if the image size or the number
# of selected users changes.
input_shape = X_train.shape[1:]   # shape of one vectorized EKM
num_classes = len(label_dict)     # one softmax output per distinct label

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.7),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Setting Adam optimizer
optimizer = Adam(learning_rate=0.001)

# Compiling the model with the optimizer. The sparse loss matches the integer
# class indices produced by numerical_labels (no one-hot encoding needed).
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Train the model
# 80 epochs with mini-batches of 32 samples. No validation data is passed,
# so training reports metrics on the training set only.
model.fit(X_train, y_train, epochs=80, batch_size=32)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7e6bf83d9f00>

In [None]:
# Evaluate the model on the unfair test set
# X_test / y_test are the 20% hold-out produced by train_test_split.
# NOTE(review): the notebook does not explain what "unfair" refers to here -
# clarify.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

7/7 - 0s - loss: 0.3828 - accuracy: 0.9275 - 208ms/epoch - 30ms/step
Test Loss: 0.3828
Test Accuracy: 0.9275
