In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
# Class names in label order: the cells below keep AFDB rows labelled 0 and
# assign label 1 to every NSRDB row, so index 0 -> 'AF', index 1 -> 'N'
# (presumably atrial fibrillation / normal -- TODO confirm).
labels = ['AF', 'N']

In [3]:
import os

# Folder holding the per-record CSV files. The trailing slash matters because
# later cells build paths with plain string concatenation.
dataset_folder = 'dataset/'

# Keep only the AFDB and NSRDB record files.
# os.listdir() returns entries in arbitrary order, so sort them: the order of
# this list decides the row order of the concatenated frames and therefore
# which rows the seeded resampling further down selects.
filenames = [
    filename
    for filename in sorted(os.listdir(dataset_folder))
    if "_AFDB_" in filename or "NSRDB_" in filename
]

In [4]:
# Sanity check: show which files passed the name filter.
filenames

['NSRDB_16265_sequence_300_pt_2_ch.csv',
 'NSRDB_16272_sequence_300_pt_2_ch.csv',
 'NSRDB_16273_sequence_300_pt_2_ch.csv',
 'NSRDB_16483_sequence_300_pt_2_ch.csv',
 'NSRDB_16539_sequence_300_pt_2_ch.csv',
 'NSRDB_16773_sequence_300_pt_2_ch.csv',
 'NSRDB_16786_sequence_300_pt_2_ch.csv',
 'NSRDB_16795_sequence_300_pt_2_ch.csv',
 'NSRDB_17052_sequence_300_pt_2_ch.csv',
 'NSRDB_17453_sequence_300_pt_2_ch.csv',
 'NSRDB_18177_sequence_300_pt_2_ch.csv',
 'NSRDB_18184_sequence_300_pt_2_ch.csv',
 'NSRDB_19088_sequence_300_pt_2_ch.csv',
 'NSRDB_19090_sequence_300_pt_2_ch.csv',
 'NSRDB_19093_sequence_300_pt_2_ch.csv',
 'NSRDB_19140_sequence_300_pt_2_ch.csv',
 'NSRDB_19830_sequence_300_pt_2_ch.csv',
 'test_AFDB_04015.csv',
 'test_AFDB_04043.csv',
 'test_AFDB_04048.csv',
 'test_AFDB_04936.csv',
 'test_AFDB_05091.csv',
 'test_AFDB_05121.csv',
 'test_AFDB_05261.csv',
 'test_AFDB_06426.csv',
 'test_AFDB_06453.csv',
 'test_AFDB_06995.csv',
 'test_AFDB_07910.csv',
 'test_AFDB_08215.csv',
 'test_AFDB_082

In [5]:
# Route every selected CSV into the bucket(s) whose marker occurs in its file
# name. The markers are tested independently, so a name containing several
# markers is read once per matching bucket.
train_dfs, test_dfs, normal_dfs = [], [], []
buckets = (
    ('train_', train_dfs),
    ('test_', test_dfs),
    ('NSRDB_', normal_dfs),
)
for name in filenames:
    for marker, bucket in buckets:
        if marker in name:
            # header=None: the first line is data and columns get integer names.
            bucket.append(pd.read_csv(dataset_folder + name, header=None))

In [6]:
# Collapse each bucket of per-file frames into one frame with a fresh
# 0..n-1 row index.
train_df_all, test_df_all, normal_df_all = (
    pd.concat(frames, ignore_index=True)
    for frames in (train_dfs, test_dfs, normal_dfs)
)

In [7]:
# Peek at the last rows of the combined training frame.
train_df_all.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,591,592,593,594,595,596,597,598,599,600
33055,-0.049068,0.013104,0.07518,0.122488,0.247975,0.495666,0.70169,0.848924,0.898229,0.842257,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
33056,-0.029308,0.076083,0.216461,0.34422,0.504467,0.732283,0.929944,0.9471,0.751166,0.454888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
33057,0.862124,0.556975,0.251699,0.033675,-0.104614,-0.205641,-0.012216,0.036171,0.059388,0.01499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
33058,-0.018008,0.033609,0.070579,0.024674,-0.00663,0.042465,0.064741,0.030956,-0.000402,0.036466,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
33059,0.143095,0.323831,0.565793,0.726778,0.836212,0.874479,0.699368,0.374617,0.044874,-0.142767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [8]:
# Summarise row count, column count and dtypes of the combined training frame.
train_df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33060 entries, 0 to 33059
Columns: 601 entries, 0 to 600
dtypes: float64(600), int64(1)
memory usage: 151.6 MB


In [9]:
# Summarise row count, column count and dtypes of the combined test frame.
# The recorded output lists every column as float64, which would include the
# label column 600; it is cast to int later in the notebook.
test_df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12906 entries, 0 to 12905
Columns: 601 entries, 0 to 600
dtypes: float64(601)
memory usage: 59.2 MB


In [10]:
# Summarise row count, column count and dtypes of the combined NSRDB frame.
# NOTE(review): the recorded output reports one object-dtype column --
# presumably the label column 600, which a later cell overwrites with 1.
normal_df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41028 entries, 0 to 41027
Columns: 601 entries, 0 to 600
dtypes: float64(600), object(1)
memory usage: 188.1+ MB


### Remove Normal Samples from AFDB (train & test)

In [11]:
# Keep only the AFDB training rows whose label (column 600) is 0.
is_label_zero_train = train_df_all[600] == 0
train_df_AF = train_df_all.loc[is_label_zero_train]

In [12]:
# Keep only the AFDB test rows whose label (column 600) is 0.
is_label_zero_test = test_df_all[600] == 0
test_df_AF = test_df_all.loc[is_label_zero_test]

### Concatenate Datasets

In [13]:
# Label every NSRDB row as class 1 by overwriting column 600 with a constant.
normal_df_all[600] = 1

In [14]:
# Stack the label-0 AFDB rows (train + test) on top of the NSRDB rows.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of the three inputs overlap, so a label-based lookup would match
# several unrelated rows. This also matches how the per-file frames were
# concatenated earlier.
df_AF_N = pd.concat([train_df_AF, test_df_AF, normal_df_all], ignore_index=True)

### Balancing Dataset (After Merging with NSRDB)

In [15]:
# Normalise the label column to int, then report the number of rows per class.
merged_labels = df_AF_N[600].astype(int)
df_AF_N[600] = merged_labels
equilibre = merged_labels.value_counts()

print(equilibre)

1    41028
0    22853
Name: 600, dtype: int64


In [16]:
# Balance the classes by resampling each one to the same number of rows.
#
# NOTE(review): balancing happens before the train/test split in a later cell,
# so rows duplicated by up-sampling can end up in both splits. Consider
# splitting first and resampling only the training part.

from sklearn.utils import resample

n_samples = 30000            # target number of rows per class
random_states = [123, 124]   # one seed per class label, indexed by label

dfs = []

for i in range(len(equilibre)):
    class_df = df_AF_N[df_AF_N[600] == i]
    # Draw with replacement only when the class has fewer rows than the target
    # (up-sampling). A larger class is down-sampled without replacement, which
    # avoids injecting duplicate rows and keeps as many distinct rows as possible.
    dfs.append(
        resample(
            class_df,
            replace=len(class_df) < n_samples,
            n_samples=n_samples,
            random_state=random_states[i],
        )
    )

# Fresh 0..n-1 index: resampling leaves repeated and shuffled row labels behind.
df_AF_N_balanced = pd.concat(dfs, ignore_index=True)

In [17]:
# Re-check the class distribution after balancing.
balanced_labels = df_AF_N_balanced[600].astype(int)
df_AF_N_balanced[600] = balanced_labels
equilibre = balanced_labels.value_counts()

print(equilibre)

1    30000
0    30000
Name: 600, dtype: int64


### Split Dataset

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Positional column 600 holds the class label; the 600 columns before it are
# the features.
label_position = 600
X = df_AF_N_balanced.iloc[:, :label_position].values
y = df_AF_N_balanced.iloc[:, label_position].values

In [20]:
# Hold out 15% of the balanced rows for testing. stratify=y keeps the class
# ratio identical in both splits, so the balance built above is preserved;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

In [21]:
# Confirm the split sizes: features are 2-D arrays, labels are 1-D arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((51000, 600), (9000, 600), (51000,), (9000,))

### Save Dataset

In [22]:
# Re-attach the labels as a trailing column next to the training features.
# column_stack turns the 1-D label vector into a column before joining.
train_df_all = pd.DataFrame(np.column_stack((X_train, y_train)))

In [23]:
# Re-attach the labels as a trailing column next to the test features.
# column_stack turns the 1-D label vector into a column before joining.
test_df_all = pd.DataFrame(np.column_stack((X_test, y_test)))

In [24]:
# Write the training split as a bare CSV: no index column and no header row.
train_csv_path = dataset_folder + "train_all-v2.csv"
train_df_all.to_csv(train_csv_path, index=False, header=False)

In [None]:
# Write the test split as a bare CSV: no index column and no header row.
test_csv_path = dataset_folder + "test_all-v2.csv"
test_df_all.to_csv(test_csv_path, index=False, header=False)