# Library

## Google Drive

In [None]:
# Mount Google Drive at /content/drive so the cells below can access files
# stored under that mount point.
from google.colab import drive
drive.mount('/content/drive')

## Data

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
from tqdm import tqdm

## Feature Extraction Libraries

In [None]:
!pip install heartpy
from heartpy import analysis

In [None]:
!pip install hrv-analysis
import hrvanalysis
from hrvanalysis import get_time_domain_features
# from hrvanalysis.preprocessing import remove_ectopic_beats
from hrvanalysis import get_frequency_domain_features
from hrvanalysis import get_poincare_plot_features
# from hrvanalysis.plot import plot_psd, plot_poincare

In [None]:
!pip install pyhrv
import pyhrv
import pyhrv.nonlinear as nl

# R-Peak Collection

In [None]:
def get_files(path):
  """Collect the channel-1 peak-list CSV files found under ``path``.

  The directory tree is walked recursively. Every match is returned as a path
  relative to ``path`` so that joining it back onto ``path`` locates the file
  even when it lives in a sub-directory. For files placed directly inside
  ``path`` the entry is simply the bare file name.

  Args:
    path: Root directory to search.

  Returns:
    list of str: Relative paths of all files whose name contains
    'channel_1.csv'. Empty when ``path`` does not exist or has no match.
  """
  files = []
  for root, _dirs, names in os.walk(path):
    for name in names:
      if 'channel_1.csv' in name:
        # Keep the sub-directory part: a bare file name could not be
        # re-opened from ``path`` when the match sits in a nested folder.
        files.append(os.path.relpath(os.path.join(root, name), path))
  return files

In [None]:
# Locate the peak-list files of both classes on the mounted Drive, then print
# how many were found (positive count first, negative count second).
positive_path = "/content/drive/My Drive/Signal/Positive-Peaklist/"
negative_path = "/content/drive/My Drive/Signal/Negative-Peaklist/"

positive_files = get_files(positive_path)
negative_files = get_files(negative_path)

for file_list in (positive_files, negative_files):
  print(len(file_list))

# Feature Extraction

## Slice Peaklist

In [None]:
def get_sliced_peaklist(peaklist_df, time_duration, frequency_sampling, total_duration=5):
  """Return the peaks that fall in the last ``time_duration`` minutes.

  Args:
    peaklist_df: DataFrame with a 'peaklist' column. The values are compared
      against bounds computed as minutes * 60 * ``frequency_sampling``, i.e.
      they are treated as sample positions.
    time_duration: Length of the wanted window, in minutes.
    frequency_sampling: Sampling frequency (samples per second) used to turn
      minutes into sample positions.
    total_duration: Length of the whole recording, in minutes. Defaults to 5,
      the value that used to be hard-coded.

  Returns:
    numpy.ndarray: The 'peaklist' values inside the window (both bounds
    inclusive). When ``time_duration`` equals ``total_duration`` the whole
    column is returned without any filtering.
  """
  peaks = peaklist_df['peaklist']
  if time_duration == total_duration: # whole recording requested
    return peaks.to_numpy()
  end_sampling = total_duration * 60 * frequency_sampling
  start_sampling = (total_duration - time_duration) * 60 * frequency_sampling
  return peaks[peaks.between(start_sampling, end_sampling)].to_numpy()

## RR Interval

In [None]:
def get_rr_interval(peaklist, frequency_sampling):
  """Compute the RR intervals of ``peaklist`` with heartpy.

  Args:
    peaklist: Peak positions handed to ``heartpy.analysis.calc_rr``.
    frequency_sampling: Sample rate forwarded as ``sample_rate``.

  Returns:
    The 'RR_list' entry of the working data returned by ``calc_rr``.
  """
  return analysis.calc_rr(peaklist, sample_rate = frequency_sampling)['RR_list']

## Analysis

### Time Domain

In [None]:
def outlier_count(rr_intervals, difference_percentage):
  """Count consecutive RR-interval pairs whose change is too large.

  A pair (current, following) is counted when the absolute difference between
  the two exceeds ``difference_percentage`` times the current interval.

  Args:
    rr_intervals: Sequence of RR intervals.
    difference_percentage: Allowed relative change, as a fraction (0.2 = 20 %).

  Returns:
    int: Number of pairs exceeding the allowed change (0 for fewer than two
    intervals).
  """
  consecutive_pairs = zip(rr_intervals, rr_intervals[1:])
  return sum(
    1
    for current, following in consecutive_pairs
    if abs(current - following) > difference_percentage * current
  )

In [None]:
# Column names produced by get_time_domain_analysis_features, in output order.
time_domain = [
  'MeanNN',
  'SDNN',
  'RMSSD',
  'pNN50',
  'sdHR',
  'Outlier',
]

def get_time_domain_analysis_features(rr_interval):
  """Build the time-domain feature dictionary for one RR-interval series.

  Five values are taken from ``hrvanalysis.get_time_domain_features`` and
  renamed; 'Outlier' is the ``outlier_count`` of the series with a 20 %
  threshold.

  Args:
    rr_interval: RR intervals passed to hrvanalysis and ``outlier_count``.

  Returns:
    dict: Keys 'MeanNN', 'SDNN', 'RMSSD', 'pNN50', 'sdHR' and 'Outlier'.
  """
  computed = get_time_domain_features(rr_interval)
  # Output label -> key used by hrvanalysis.
  source_keys = {
    'MeanNN': 'mean_nni',
    'SDNN': 'sdnn',
    'RMSSD': 'rmssd',
    'pNN50': 'pnni_50',
    'sdHR': 'std_hr',
  }
  result = {label: computed[key] for label, key in source_keys.items()}
  result['Outlier'] = outlier_count(rr_intervals=rr_interval, difference_percentage=0.2)
  return result

### Frequency Domain

In [None]:
# Column names produced by get_frequency_domain_analysis_features, in output order.
frequency_domain = ['VLF','LF','HF','LF/HF','aTotal','pVLF','pLF']

def get_frequency_domain_analysis_features(rr_interval, method='welch'):
  """Build the frequency-domain feature dictionary for one RR-interval series.

  Args:
    rr_interval: RR intervals passed to
      ``hrvanalysis.get_frequency_domain_features``.
    method: Spectral estimation method, forwarded unchanged to hrvanalysis.
      Defaults to 'welch', the value that used to be hard-coded. Accepting it
      as a parameter also makes the two-argument call
      ``get_frequency_domain_analysis_features(rr_interval, 'welch')`` valid
      instead of raising ``TypeError``.

  Returns:
    dict: 'VLF', 'LF', 'HF', 'LF/HF' and 'aTotal' as reported by hrvanalysis,
    plus 'pVLF' and 'pLF', the VLF and LF power as a percentage of 'aTotal'.
  """
  frequency_domain_features = get_frequency_domain_features(rr_interval, method=method)
  # Output label -> key used by hrvanalysis.
  features = {'VLF':'vlf',
          'LF': 'lf',
          'HF': 'hf',
          'LF/HF': 'lf_hf_ratio',
          'aTotal': 'total_power'}
  result = {description: frequency_domain_features[key]
            for description, key in features.items()}
  # Relative band powers, in percent of the total power.
  result['pVLF'] = result['VLF'] / result['aTotal'] * 100
  result['pLF'] = result['LF'] / result['aTotal'] * 100
  return result

### Non Linear Domain

In [None]:
import nolds
import biosppy

def new_dfa(nn=None, rpeaks=None, short=None, long=None, show=True, figsize=None, legend=True):
  """Plot-free replacement for ``pyhrv.nonlinear.dfa``.

  Computes the short- and long-term DFA alpha values with ``nolds.dfa`` and
  returns them without creating any matplotlib figure.

  Parameters
  ----------
  nn : array
    NN intervals, validated/derived by ``pyhrv.utils.check_input``.
  rpeaks : array
    R-peak times, used by ``pyhrv.utils.check_input`` when ``nn`` is not given.
  short : array, 2 elements
    Interval limits of the short term fluctuations (default: None: [4, 16]).
  long : array, 2 elements
    Interval limits of the long term fluctuations (default: None: [17, 64]).
  show, figsize, legend
    Accepted only so the signature stays call-compatible with the function
    this one replaces; they are ignored because nothing is plotted.

  Returns (biosppy.utils.ReturnTuple Object)
  ------------------------------------------
  dfa_alpha1 : float
    Alpha value of the short term fluctuations ('nan' if DFA failed).
  dfa_alpha2 : float
    Alpha value of the long term fluctuations ('nan' if DFA failed).
  dfa_alpha1_beats : range
    Window sizes used for the short term fit (both limits inclusive).
  dfa_alpha2_beats : range
    Window sizes used for the long term fit (both limits inclusive).
  """
  # Imported here so the function does not rely on another notebook cell
  # having imported ``warnings`` before it is called.
  import warnings

  # Check input values
  nn = pyhrv.utils.check_input(nn, rpeaks)

  # Check intervals
  short = pyhrv.utils.check_interval(short, default=(4, 16))
  long = pyhrv.utils.check_interval(long, default=(17, 64))

  # Expand the interval limits into the full list of window sizes
  short = range(short[0], short[1] + 1)
  long = range(long[0], long[1] + 1)

  try:
    # The debug data (second element) was only needed for plotting; discard it.
    alpha1, _ = nolds.dfa(nn, short, debug_data=True, overlap=False)
    alpha2, _ = nolds.dfa(nn, long, debug_data=True, overlap=False)
  except ValueError:
    # DFA could not be conducted (e.g. insufficient number of NNIs).
    warnings.warn("Not enough NNI samples for Detrended Fluctuations Analysis.")
    # NOTE(review): the string 'nan' (not float('nan')) is kept on purpose so
    # the failure value stays the same as before.
    alpha1, alpha2 = 'nan', 'nan'

  # Output
  args = (alpha1, alpha2, short, long)
  return biosppy.utils.ReturnTuple(args, ('dfa_alpha1', 'dfa_alpha2', 'dfa_alpha1_beats', 'dfa_alpha2_beats'))

In [None]:
# Column names produced by get_non_linear_domain_analysis_features, in output order.
non_linear_domain = [
  'SD1',
  'SD2',
  'SD1/SD2',
  'Alpha',
]

# Keep a handle on pyhrv's own DFA, then route ``nl.dfa`` to the plot-free
# replacement defined above.
original_dfa = nl.dfa
nl.dfa = new_dfa

def get_non_linear_domain_analysis_features(rr_interval):
  """Build the non-linear feature dictionary for one RR-interval series.

  SD1 and SD2 come from ``hrvanalysis.get_poincare_plot_features``; 'Alpha' is
  the 'dfa_alpha1' value returned by ``nl.dfa`` called with ``long=(17,20)``.

  Args:
    rr_interval: RR intervals passed to hrvanalysis and ``nl.dfa``.

  Returns:
    dict: Keys 'SD1', 'SD2', 'SD1/SD2' and 'Alpha'.
  """
  poincare = get_poincare_plot_features(rr_interval)
  sd1 = poincare['sd1']
  sd2 = poincare['sd2']
  ratio = sd1 / sd2
  dfa_result = nl.dfa(nn=rr_interval, long=(17,20), show=False)
  return {
    'SD1': sd1,
    'SD2': sd2,
    'SD1/SD2': ratio,
    'Alpha': dfa_result['dfa_alpha1'],
  }

## Execution

In [None]:
def feature_extraction(path, files, time_duration, target, frequency_sampling=128):
  """Extract the HRV features of every peak-list file of one class.

  For each file the peak list is read, restricted to the requested time
  window, converted to RR intervals and passed through the time-domain,
  frequency-domain and non-linear analyses.

  Args:
    path: Directory containing the peak-list CSV files.
    files: File names (relative to ``path``) to process.
    time_duration: Window length in minutes, see ``get_sliced_peaklist``.
    target: Class label stored in the 'Target' field of every result row.
    frequency_sampling: Sampling frequency of the recordings (default 128).

  Returns:
    tuple: ``(features_result, errors)`` where ``features_result`` is a list
    with one feature dict per successfully analysed file (including 'Target'
    and 'File') and ``errors`` lists the files whose analysis raised.
  """
  features_result = []
  errors = []

  for file in tqdm(files): # Iterate all files, tqdm draws the progress bar
    peaklist_df = pd.read_csv(os.path.join(path, file)) # in dataframe
    peaklist = get_sliced_peaklist(peaklist_df, time_duration, frequency_sampling)
    rr_interval = get_rr_interval(peaklist, frequency_sampling)

    try:
      extracted_features = {}
      extracted_features.update(get_time_domain_analysis_features(rr_interval)) # Time Domain Analysis
      # Called with the RR intervals only: the helper already applies the
      # Welch method, and an extra positional argument would raise a
      # TypeError that the handler below turns into a failure for every file.
      extracted_features.update(get_frequency_domain_analysis_features(rr_interval)) # Frequency Domain Analysis
      extracted_features.update(get_non_linear_domain_analysis_features(rr_interval)) # Non Linear Domain Analysis
    except Exception as e:
      # Deliberately broad: one recording that cannot be analysed must not
      # abort the whole batch. The file is reported and recorded instead.
      print("{}: {}".format(file, e))
      errors.append(file)
    else:
      extracted_features['Target'] = target
      extracted_features['File'] = file
      features_result.append(extracted_features)

  return features_result, errors

In [None]:
def make_dataset(positive_dataset, negative_dataset, columns_order, n_data = 150):
  """Interleave positive and negative feature rows into one DataFrame.

  Rows alternate positive, negative, positive, negative, ... so both classes
  contribute the same number of rows.

  Args:
    positive_dataset: List of feature dicts of the positive class.
    negative_dataset: List of feature dicts of the negative class.
    columns_order: Column names, in the order wanted in the result.
    n_data: Maximum number of rows taken from each class. When either list
      holds fewer rows (for example because some files failed during feature
      extraction) the number of pairs is reduced to the shorter list, with a
      warning, instead of raising ``IndexError``.

  Returns:
    pandas.DataFrame: The interleaved rows restricted to ``columns_order``.
    An empty frame with those columns when no pair is available.
  """
  import warnings

  n_pairs = max(0, min(n_data, len(positive_dataset), len(negative_dataset)))
  if n_pairs < n_data:
    warnings.warn(
      "make_dataset: only {} of the requested {} rows per class are available".format(n_pairs, n_data))

  dataset = []
  for positive_row, negative_row in zip(positive_dataset[:n_pairs], negative_dataset[:n_pairs]):
    dataset.append(positive_row)
    dataset.append(negative_row)

  if not dataset:
    # Selecting columns from a frame built from no rows would raise KeyError.
    return pd.DataFrame(columns=columns_order)
  return pd.DataFrame(dataset)[columns_order]

In [None]:
def save_feature_extraction(dataset, folder, filename):
  """Write ``dataset`` as a CSV file inside the Drive 'Signal' directory.

  Args:
    dataset: Object exposing ``to_csv`` (a DataFrame in this notebook).
    folder: Sub-folder of '/content/drive/My Drive/Signal/' to write into.
    filename: Name of the CSV file; also echoed in the confirmation message.
  """
  destination = "".join(("/content/drive/My Drive/Signal/", folder, "/", filename))
  dataset.to_csv(destination, index=False)
  print("{} saved!".format(filename))

In [None]:
from tqdm import tqdm
import time

def foo_():
    """Stand-in for one unit of work: just waits 0.3 seconds."""
    time.sleep(0.3)

range_ = range(0, 10)
total = len(range_)

# Progress-bar demo: a single bar (position=0, leave=True) advanced once per
# work item. The loop now walks ``range_`` and actually calls ``foo_``;
# iterating over the tuple ``(foo_, range_)`` only ran two steps, never called
# ``foo_`` and opened a second, nested bar.
with tqdm(total=total, position=0, leave=True) as pbar:
    for _ in range_:
        foo_()
        pbar.update()

In [None]:
import warnings

# Show each distinct warning once instead of repeating it for every file.
warnings.filterwarnings('once')

folder = "Dataset"
# Use the same number of recordings from both classes; taking the count from
# the positive files alone over-runs the negative list when it is shorter.
n_data = min(len(positive_files), len(negative_files))
columns = time_domain + frequency_domain + non_linear_domain + ['Target', 'File']

time_durations = [2,3,4,5] # in minutes
for time_duration in time_durations:
  positive_dataset, positive_errors = feature_extraction(positive_path, positive_files[:n_data], time_duration, target = 1)
  negative_dataset, negative_errors = feature_extraction(negative_path, negative_files[:n_data], time_duration, target = 0)

  # Files whose analysis failed are missing from the datasets: report them.
  failed_files = positive_errors + negative_errors
  if failed_files:
    print("{} min: {} file(s) skipped: {}".format(time_duration, len(failed_files), failed_files))

  # Only as many pairs as were actually extracted for both classes can be
  # interleaved; asking for more would index past the end of the shorter list.
  n_pairs = min(len(positive_dataset), len(negative_dataset))
  if n_pairs == 0:
    print("{} min: no features extracted, nothing saved".format(time_duration))
    continue

  dataset = make_dataset(positive_dataset, negative_dataset, columns_order = columns, n_data = n_pairs)
  save_feature_extraction(dataset, folder, filename = "Dataset_{}.csv".format(time_duration))