In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from glob import glob
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the raw train and test metadata tables from CSV.
train_csv = '/workspace/data/train.csv'
test_csv = '/workspace/data/test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Peek at the first two training rows to check the column layout.
df_train.head(n=2)

Unnamed: 0,Id,target,age,sex,label_type
0,81ac15cb8d3be42e3d0ccdea36176183,1,80,female,human
1,92a68a851f4686e2a4ddf99304f307d4,0,59,male,auto


In [6]:
# Partition the training rows by the value of `label_type`.
# reset_index(drop=True) gives each subset its own fresh 0..n-1 index.
label_source = df_train["label_type"]

df_train_human = df_train.loc[label_source == "human"].reset_index(drop=True)
df_train_auto = df_train.loc[label_source == "auto"].reset_index(drop=True)

In [7]:
def assign_cv_folds(frame, stratify_col, splitter, fold_col="cv"):
    """Write a validation-fold id for every row of ``frame`` into ``frame[fold_col]``.

    The frame is modified in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to annotate.
    stratify_col : str
        Column whose values are handed to ``splitter.split`` as ``y``.
    splitter : object
        Cross-validation splitter exposing ``split(X, y)`` (e.g. ``StratifiedKFold``).
    fold_col : str, default "cv"
        Name of the column that receives the fold ids.
    """
    # Sentinel: a row still at -1 afterwards was never placed in a validation fold.
    frame[fold_col] = -1
    fold_pos = frame.columns.get_loc(fold_col)

    for fold, (_, val_rows) in enumerate(splitter.split(frame, frame[stratify_col])):
        # split() yields positional row indices, so write through .iloc; this
        # stays correct even if the frame's index is not 0..n-1.
        frame.iloc[val_rows, fold_pos] = fold


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=516)

# Human- and auto-labelled rows are split independently, each stratified on
# `target`, using the same splitter configuration.
assign_cv_folds(df_train_human, "target", kfold)
assign_cv_folds(df_train_auto, "target", kfold)

In [11]:
# Stack the two fold-annotated subsets (human first, then auto) and renumber
# the rows 0..n-1.
df_concat = pd.concat([df_train_human, df_train_auto], ignore_index=True)

In [16]:
# Derive the path of each record's ECG array file from its Id.
df_concat["ecg_path"] = df_concat["Id"].map(
    lambda rec_id: f"/workspace/data/ecg/{rec_id}.npy"
)

In [17]:
# Display the combined training table, now carrying the `cv` and `ecg_path` columns.
df_concat

Unnamed: 0,Id,target,age,sex,label_type,cv,ecg_path
0,81ac15cb8d3be42e3d0ccdea36176183,1,80,female,human,2,/workspace/data/ecg/81ac15cb8d3be42e3d0ccdea36...
1,076d002b3c47e54fff4985b0875e0890,0,29,male,human,4,/workspace/data/ecg/076d002b3c47e54fff4985b087...
2,13ac4608f13ffbca4ceb2d49179930a8,0,75,male,human,4,/workspace/data/ecg/13ac4608f13ffbca4ceb2d4917...
3,b0b9b2598f0ce5f1ab4666ca86a1a679,0,50,female,human,4,/workspace/data/ecg/b0b9b2598f0ce5f1ab4666ca86...
4,7daf2118dcf7bb9d06956be2b5e1410d,1,76,male,human,2,/workspace/data/ecg/7daf2118dcf7bb9d06956be2b5...
...,...,...,...,...,...,...,...
1995,eb61272487ce6a2e9952816b1a91e03d,0,59,male,auto,4,/workspace/data/ecg/eb61272487ce6a2e9952816b1a...
1996,2793d8c225448013e0a6328f3e6f43d2,0,49,female,auto,1,/workspace/data/ecg/2793d8c225448013e0a6328f3e...
1997,c2af362333b382ebc2230adda699903f,0,58,male,auto,0,/workspace/data/ecg/c2af362333b382ebc2230adda6...
1998,3eb753ea212524680020928de084f121,0,55,female,auto,0,/workspace/data/ecg/3eb753ea212524680020928de0...


In [34]:
# Persist the combined training table; index=False keeps the row index out of the file.
df_concat.to_csv('/workspace/data/df_train.csv', index=False)

In [21]:
# Sanity check: print the first ECG path listed in df_concat that does not
# exist on disk. Nothing is printed when every file is present.
first_missing = next(
    (path for path in df_concat.ecg_path if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    print(first_missing)

In [25]:
# Load every .npy file in the ECG directory into a single array.
# glob() returns paths in arbitrary, filesystem-dependent order, so sort them to
# make the row order of `all_data` reproducible across runs and machines.
# NOTE(review): the directory appears to hold both train and test recordings
# (file count exceeds the training table), so statistics computed from
# `all_data` cover both sets -- confirm this is intended.
ecg_files = sorted(glob('/workspace/data/ecg/*.npy'))
all_data = np.array([np.load(p) for p in ecg_files])

In [26]:
# Inspect the dimensions of the stacked array.
all_data.shape

(10000, 800, 12)

In [29]:
# Mean over the first two axes, leaving one value per entry of the last axis
# (presumably one per ECG lead -- confirm).
all_data.mean(axis=(0, 1))

array([-0.00215789, -0.00043377,  0.00172448,  0.00128416, -0.0018845 ,
        0.00067227, -0.00111946, -0.00225999, -0.00316783, -0.0018304 ,
       -0.00196941, -0.00267312])

In [30]:
# Standard deviation over the first two axes, leaving one value per entry of
# the last axis (presumably one per ECG lead -- confirm).
all_data.std(axis=(0, 1))

array([0.16227234, 0.15298072, 0.15850562, 0.13635088, 0.14097595,
       0.13283227, 0.19303711, 0.2945234 , 0.28988948, 0.26989414,
       0.24618867, 0.21785621])

In [31]:
# Display the test table as loaded from CSV.
df_test

Unnamed: 0,Id,age,sex,label_type
0,eee45832964560ae45040cbc95a252e7,72,female,human
1,6a9adde92c964bd844ddeb12bf559130,76,female,human
2,ee1f947df169cbdc8569a6959913c4ef,58,female,human
3,611df0e51c4fcc5fd1a03887b031a2dc,75,male,human
4,2aeae75bd1d6c3ad42574b68d4daf07c,59,male,human
...,...,...,...,...
7995,0d833b22aa1d1457ab5429e3d4e94bcc,18,female,human
7996,e6a54ce19cda5b78d22f5dc68963c8d8,43,female,human
7997,b9039f75e14ebf64167fe11cfd1fb497,73,female,human
7998,e1f9f35bfc5193a7489f59406cda2392,40,female,human


In [32]:
# Derive the path of each test record's ECG array file from its Id.
df_test["ecg_path"] = df_test["Id"].map(
    lambda rec_id: f"/workspace/data/ecg/{rec_id}.npy"
)

In [36]:
# Assign each test row to one of 5 folds, stratified on `age`.
# NOTE(review): stratifying on raw age treats every distinct age as its own
# class; scikit-learn warns when a class has fewer members than n_splits.
# Confirm that age-stratified folds on the test table are what is wanted.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=516)

# Sentinel: a row still at -1 afterwards was never placed in a fold.
df_test["cv"] = -1
cv_pos = df_test.columns.get_loc("cv")

for i, (train_index, val_index) in enumerate(kfold.split(df_test, df_test.age)):
    # split() yields positional row indices, so write through .iloc rather than
    # label-based .loc; this stays correct even if the index is not 0..n-1.
    df_test.iloc[val_index, cv_pos] = i



In [37]:
# Display the test table, now carrying the `ecg_path` and `cv` columns.
df_test

Unnamed: 0,Id,age,sex,label_type,ecg_path,cv
0,eee45832964560ae45040cbc95a252e7,72,female,human,/workspace/data/ecg/eee45832964560ae45040cbc95...,0
1,6a9adde92c964bd844ddeb12bf559130,76,female,human,/workspace/data/ecg/6a9adde92c964bd844ddeb12bf...,1
2,ee1f947df169cbdc8569a6959913c4ef,58,female,human,/workspace/data/ecg/ee1f947df169cbdc8569a69599...,0
3,611df0e51c4fcc5fd1a03887b031a2dc,75,male,human,/workspace/data/ecg/611df0e51c4fcc5fd1a03887b0...,2
4,2aeae75bd1d6c3ad42574b68d4daf07c,59,male,human,/workspace/data/ecg/2aeae75bd1d6c3ad42574b68d4...,3
...,...,...,...,...,...,...
7995,0d833b22aa1d1457ab5429e3d4e94bcc,18,female,human,/workspace/data/ecg/0d833b22aa1d1457ab5429e3d4...,1
7996,e6a54ce19cda5b78d22f5dc68963c8d8,43,female,human,/workspace/data/ecg/e6a54ce19cda5b78d22f5dc689...,0
7997,b9039f75e14ebf64167fe11cfd1fb497,73,female,human,/workspace/data/ecg/b9039f75e14ebf64167fe11cfd...,4
7998,e1f9f35bfc5193a7489f59406cda2392,40,female,human,/workspace/data/ecg/e1f9f35bfc5193a7489f59406c...,2


In [38]:
# Persist the annotated test table; index=False keeps the row index out of the file.
df_test.to_csv('/workspace/data/df_test.csv', index=False)