In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [14]:
# Human-readable class names. Later cells keep AFDB rows whose column 300 is 0
# and set every NSRDB row to 1, so presumably index 0 = 'AF' and index 1 = 'N'
# -- confirm before using this list to decode integer labels.
labels = ["AF", "N"]

In [1]:
import os

# Folder that holds the CSV files read by the cells below.
dataset_folder = 'dataset/'

# Keep only the entries whose name carries the AFDB or NSRDB marker.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the later concatenation order -- and therefore the seeded resampling and
# train/test split -- reproducible across machines and file systems.
filenames = [
    filename
    for filename in sorted(os.listdir(dataset_folder))
    if "_AFDB_" in filename or "_NSRDB_" in filename
]

In [2]:
# Display the AFDB/NSRDB file names collected from dataset_folder.
filenames

['NSRDB_NSRDB_16265_sequence_300_pt.csv',
 'NSRDB_NSRDB_16272_sequence_300_pt.csv',
 'NSRDB_NSRDB_16273_sequence_300_pt.csv',
 'NSRDB_NSRDB_16420_sequence_300_pt.csv',
 'NSRDB_NSRDB_16483_sequence_300_pt.csv',
 'NSRDB_NSRDB_16539_sequence_300_pt.csv',
 'NSRDB_NSRDB_16773_sequence_300_pt.csv',
 'NSRDB_NSRDB_16786_sequence_300_pt.csv',
 'NSRDB_NSRDB_16795_sequence_300_pt.csv',
 'NSRDB_NSRDB_17052_sequence_300_pt.csv',
 'NSRDB_NSRDB_17453_sequence_300_pt.csv',
 'NSRDB_NSRDB_18177_sequence_300_pt.csv',
 'NSRDB_NSRDB_18184_sequence_300_pt.csv',
 'NSRDB_NSRDB_19088_sequence_300_pt.csv',
 'NSRDB_NSRDB_19090_sequence_300_pt.csv',
 'NSRDB_NSRDB_19093_sequence_300_pt.csv',
 'NSRDB_NSRDB_19140_sequence_300_pt.csv',
 'NSRDB_NSRDB_19830_sequence_300_pt.csv',
 'test_AFDB_04015.csv',
 'test_AFDB_04043.csv',
 'test_AFDB_04048.csv',
 'test_AFDB_04936.csv',
 'test_AFDB_05091.csv',
 'test_AFDB_05121.csv',
 'test_AFDB_05261.csv',
 'test_AFDB_06426.csv',
 'test_AFDB_06453.csv',
 'test_AFDB_06995.csv',
 'te

In [7]:
# Read every selected CSV (no header row) and file it under each group whose
# marker appears in the file name. A name matching several markers is read once
# per match and appended to each corresponding list.
train_dfs, test_dfs, normal_dfs = [], [], []

# (substring looked for in the file name, list that collects the matching frames)
marker_buckets = (
    ('train_', train_dfs),
    ('test_', test_dfs),
    ('_NSRDB_', normal_dfs),
)

for name in filenames:
    for marker, bucket in marker_buckets:
        if marker in name:
            bucket.append(pd.read_csv(dataset_folder + name, header=None))

In [8]:
# Stack the per-file frames of each group into one frame with a fresh 0..n-1 index.
train_df_all, test_df_all, normal_df_all = (
    pd.concat(frames, ignore_index=True)
    for frames in (train_dfs, test_dfs, normal_dfs)
)

In [9]:
# Summarise the concatenated 'train_' frame: row range, column count, dtypes, memory.
train_df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33060 entries, 0 to 33059
Columns: 301 entries, 0 to 300
dtypes: float64(300), int64(1)
memory usage: 75.9 MB


In [10]:
# Summarise the concatenated 'test_' frame: row range, column count, dtypes, memory.
# NOTE(review): the recorded output lists all 301 columns as float64, whereas the
# train frame reports one int64 column -- presumably the label column 300; confirm.
test_df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15167 entries, 0 to 15166
Columns: 301 entries, 0 to 300
dtypes: float64(301)
memory usage: 34.8 MB


In [11]:
# Summarise the concatenated NSRDB frame: row range, column count, dtypes, memory.
# NOTE(review): the recorded output reports one object-dtype column -- presumably
# column 300, which a later cell overwrites with the integer 1; confirm.
normal_df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39118 entries, 0 to 39117
Columns: 301 entries, 0 to 300
dtypes: float64(300), object(1)
memory usage: 89.8+ MB


### Remove Normal Samples from AFDB (train & test)

In [18]:
# Keep only the AFDB training rows whose column 300 equals 0; all other rows are dropped.
is_af_train = train_df_all[300] == 0
train_df_AF = train_df_all.loc[is_af_train]

In [19]:
# Keep only the AFDB test rows whose column 300 equals 0; all other rows are dropped.
is_af_test = test_df_all[300] == 0
test_df_AF = test_df_all.loc[is_af_test]

### Concatenate Datasets

In [21]:
# Overwrite column 300 of every NSRDB row with the integer class code 1
# (the AFDB frames are filtered on the same column with `== 0`).
# NOTE: this replaces the column in place; whatever column 300 held before is lost.
normal_df_all[300] = 1

In [25]:
# Stack the AF rows from both AFDB frames on top of the NSRDB rows.
# ignore_index=True gives the merged frame a fresh 0..n-1 index, matching the
# per-source concatenations earlier in the notebook; without it the inputs'
# own index labels are carried over and may repeat, which breaks label-based
# access and index-aligned assignment on the merged frame.
df_AF_N = pd.concat([train_df_AF, test_df_AF, normal_df_all], ignore_index=True)

### Balancing Dataset (After Merging with NSRDB)

In [26]:
# Cast the label column (300) to int, store it back, then count rows per class.
label_codes = df_AF_N[300].astype(int)
df_AF_N[300] = label_codes
equilibre = label_codes.value_counts()

print(equilibre)

1    39118
0    22914
Name: 300, dtype: int64


In [29]:
# Balance the classes: draw exactly `n_samples` rows from each class of df_AF_N.

from sklearn.utils import resample
n_samples = 30000
random_states = [123, 124]  # one seed per class, in ascending label order

dfs = []

# Iterate over the labels actually present in the counts instead of assuming
# they are exactly 0..k-1, and fail loudly if a class has no seed.
class_labels = sorted(equilibre.index)
assert len(class_labels) <= len(random_states), "need one random_state per class"

for class_label, seed in zip(class_labels, random_states):
    class_rows = df_AF_N[df_AF_N[300] == class_label]
    # Sample with replacement only when the class has fewer than n_samples rows
    # (upsampling). A class that is being downsampled is drawn without
    # replacement, so it contributes n_samples distinct rows rather than
    # duplicating some rows while discarding others.
    needs_upsampling = len(class_rows) < n_samples
    dfs.append(
        resample(
            class_rows,
            replace=needs_upsampling,
            n_samples=n_samples,
            random_state=seed,
        )
    )

# NOTE(review): an upsampled class still contains repeated rows. If this frame
# is split into train/test afterwards, copies of one source row can land on
# both sides of the split -- consider splitting first and balancing only the
# training part.
df_AF_N_balanced = pd.concat(dfs)

In [30]:
# Re-cast the label column (300) of the balanced frame to int and recount the classes.
balanced_codes = df_AF_N_balanced[300].astype(int)
df_AF_N_balanced[300] = balanced_codes
equilibre = balanced_codes.value_counts()

print(equilibre)

1    30000
0    30000
Name: 300, dtype: int64


### Split Dataset

In [33]:
from sklearn.model_selection import train_test_split

In [31]:
# Split the balanced frame by position: the first 300 columns become X and the
# column at position 300 (the label column used above) becomes y.
label_position = 300
X = df_AF_N_balanced.iloc[:, :label_position].values
y = df_AF_N_balanced.iloc[:, label_position].values

In [34]:
# Hold out 15% of the rows for testing; the fixed seed keeps the shuffle repeatable.
# NOTE(review): the balanced frame was drawn with replacement before this split,
# so copies of one source row can end up in both partitions -- confirm this is acceptable.
split_parts = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [51]:
# Show the shapes of the four split arrays, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((51000, 300), (9000, 300), (51000,), (9000,))

### Save Dataset

In [53]:
# Re-attach the labels as one extra trailing column after the training features.
# NOTE: this rebinds `train_df_all`, which earlier held the raw AFDB training frame.
y_train_column = y_train[:, np.newaxis]
train_df_all = pd.DataFrame(np.hstack((X_train, y_train_column)))

In [54]:
# Re-attach the labels as one extra trailing column after the test features.
# NOTE: this rebinds `test_df_all`, which earlier held the raw AFDB test frame.
y_test_column = y_test[:, np.newaxis]
test_df_all = pd.DataFrame(np.hstack((X_test, y_test_column)))

In [55]:
# Write the training split as a bare CSV: no index column and no header row.
# `index` and `header` are passed as explicit booleans, the documented type,
# instead of relying on None being treated as falsy.
train_df_all.to_csv(dataset_folder + "train_all-v2.csv", index=False, header=False)

In [56]:
# Write the test split as a bare CSV: no index column and no header row.
# `index` and `header` are passed as explicit booleans, the documented type,
# instead of relying on None being treated as falsy.
test_df_all.to_csv(dataset_folder + "test_all-v2.csv", index=False, header=False)