In [None]:
import os
import sys
import pandas as pd

import logging
import boto3
import io
from io import StringIO # python3; python2: BytesIO 
from botocore.exceptions import ClientError

import shutil

In [None]:
# Model identifier; it names both the dataset folder and the exported CSV file.
MODEL_NAME = 'life-threatening_arrhythmias'

# S3 bucket and key prefix under which the dataset for MODEL_NAME is stored.
BUCKET = 'sndv.analysisdata'
PREFIX = 'dataset/' + MODEL_NAME + '/'

def get_common_prefixes_all(s3_bucket, **base_kwargs):
    """Yield every ``CommonPrefixes`` entry of a paginated ``list_objects_v2`` listing.

    Parameters
    ----------
    s3_bucket
        Object exposing ``meta.client.list_objects_v2`` (e.g. a boto3 ``Bucket``
        resource). Its ``name`` attribute is used as ``Bucket`` only when the
        caller does not pass ``Bucket`` explicitly.
    **base_kwargs
        Keyword arguments forwarded to ``list_objects_v2`` for every page
        (``Bucket``, ``Prefix``, ``Delimiter``, ...). ``MaxKeys`` defaults to
        1000 and may be overridden by the caller.

    Yields
    ------
    dict
        Each element of the ``CommonPrefixes`` list of every response page,
        in the order the pages are returned.
    """
    list_kwargs = dict(base_kwargs)
    # setdefault (instead of dict(MaxKeys=1000, **kwargs)) lets callers pass
    # their own MaxKeys without a "multiple values for keyword" TypeError.
    list_kwargs.setdefault('MaxKeys', 1000)
    if 'Bucket' not in list_kwargs:
        list_kwargs['Bucket'] = s3_bucket.name

    continuation_token = None
    while True:
        page_kwargs = dict(list_kwargs)
        if continuation_token:
            page_kwargs['ContinuationToken'] = continuation_token
        response = s3_bucket.meta.client.list_objects_v2(**page_kwargs)
        yield from response.get('CommonPrefixes') or []
        if not response.get('IsTruncated'):  # last page reached
            break
        continuation_token = response.get('NextContinuationToken')
        if not continuation_token:
            # Truncated but no token to continue with: stop instead of
            # requesting (and yielding) the first page again forever.
            break

In [None]:
# Low-level client; used for the direct get_object downloads.
s3_client = boto3.client('s3')

# Resource-level handles: the S3 service resource and the dataset bucket.
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET)

In [None]:
# Walk the three-level layout <PREFIX><label>/<arrhythmia>/<directory>/ and, for
# each directory, read <directory>/<directory>.csv (one headerless column) to
# record its minimum and maximum value.
#
# Results:
#   df_dataset  - one row per directory (columns listed in dataset_columns)
#   dic_samples - {label: {arrhythmia: [directory, ...]}}
dataset_columns = ['record', 'alarm_label', 'arrhythmia',
                   'lead_ii_max', 'lead_ii_min', 'lead_ii_file']
dataset_frames = []  # one frame per (label, arrhythmia); concatenated once below
dic_samples = {}

for label_entry in get_common_prefixes_all(bucket,
                                           Bucket=bucket.name,
                                           Prefix=PREFIX,
                                           Delimiter='/'):
    label_prefix = label_entry.get('Prefix')
    # Common prefixes end with the delimiter, so the folder name is the
    # second-to-last piece of the split.
    label = label_prefix.split('/')[-2]
    print('label:', label)

    dic_arrhythmia = {}
    for arrhythmia_entry in get_common_prefixes_all(bucket,
                                                    Bucket=bucket.name,
                                                    Prefix=label_prefix,
                                                    Delimiter='/'):
        arrhythmia_prefix = arrhythmia_entry.get('Prefix')
        arrhythmia = arrhythmia_prefix.split('/')[-2]
        print(' arrhythmias:', arrhythmia)

        list_dir = []
        list_record = []
        list_alarm_file = []
        list_lead_ii_min = []
        list_lead_ii_max = []
        for dir_entry in get_common_prefixes_all(bucket,
                                                 Bucket=bucket.name,
                                                 Prefix=arrhythmia_prefix,
                                                 Delimiter='/'):
            dir_prefix = dir_entry.get('Prefix')
            directory = dir_prefix.split('/')[-2]
            csv_name = directory + '.csv'

            list_dir.append(directory)
            # The record id is the part of the directory name before '-a'.
            list_record.append(directory.split('-a')[0])
            # Built with '/' (not os.path.join) so the stored path matches the
            # S3 key layout on every platform.
            list_alarm_file.append('/'.join((label, arrhythmia, directory, csv_name)))

            obj = s3_client.get_object(Bucket=bucket.name, Key=dir_prefix + csv_name)
            df_lead_ii = pd.read_csv(io.BytesIO(obj['Body'].read()),
                                     sep=',', header=None)
            # Raises if the file does not have exactly one column.
            df_lead_ii.columns = ['lead_ii']
            list_lead_ii_max.append(df_lead_ii['lead_ii'].max())
            list_lead_ii_min.append(df_lead_ii['lead_ii'].min())

        print ('    directory count : ', len(list_dir))
        if list_dir:
            dataset_frames.append(pd.DataFrame({'record': list_record,
                                                'alarm_label': label,
                                                'arrhythmia': arrhythmia,
                                                'lead_ii_max': list_lead_ii_max,
                                                'lead_ii_min': list_lead_ii_min,
                                                'lead_ii_file': list_alarm_file}))
        dic_arrhythmia[arrhythmia] = list_dir
    dic_samples[label] = dic_arrhythmia

# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame inside the loop copies all previous rows on every iteration.
if dataset_frames:
    df_dataset = pd.concat(dataset_frames, ignore_index=True)
else:
    # Keep the expected columns so later cells (e.g. sort by 'record') still work.
    df_dataset = pd.DataFrame(columns=dataset_columns)
print ("total : ", len(df_dataset))

In [None]:
# Fixed seed so that Restart & Run All reproduces the same shuffled order.
SHUFFLE_SEED = 42

# Row-shuffled copy of the dataset, re-indexed from 0.
df_scatter = (
    df_dataset
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Copy ordered by the 'record' column, re-indexed from 0.
df_sort = df_dataset.sort_values(['record']).reset_index(drop=True)

In [None]:
# Serialise the record-sorted table to CSV text (index column omitted) and
# upload it to <PREFIX><MODEL_NAME>.csv in the dataset bucket.
dataset_key = PREFIX + MODEL_NAME + '.csv'
csv_text = df_sort.to_csv(index=False)  # no path given -> CSV returned as a string

s3.Object(BUCKET, dataset_key).put(Body=csv_text)

In [4]:
# Show the dataset ordered by record as this cell's rich output.
df_sort

Unnamed: 0,record,alarm_label,arrhythmia,lead_ii_max,lead_ii_min,lead_ii_file
0,a103l,false,Asystole,1.84435,-0.79219,false/Asystole/a103l-a1/a103l-a1.csv
1,a104s,false,Asystole,2.01575,-0.89022,false/Asystole/a104s-a1/a104s-a1.csv
2,a105l,false,Asystole,1.97200,-0.77429,false/Asystole/a105l-a1/a105l-a1.csv
3,a109l,false,Asystole,1.96848,-0.94543,false/Asystole/a109l-a1/a109l-a1.csv
4,a134s,false,Asystole,5.09194,-5.09495,false/Asystole/a134s-a1/a134s-a1.csv
5,a142s,true,Asystole,0.64527,-0.17663,true/Asystole/a142s-a1/a142s-a1.csv
6,a145l,false,Asystole,0.78016,-1.28306,false/Asystole/a145l-a1/a145l-a1.csv
7,a161l,true,Asystole,0.30606,-1.44913,true/Asystole/a161l-a1/a161l-a1.csv
8,a163l,false,Asystole,4.55699,-2.44400,false/Asystole/a163l-a1/a163l-a1.csv
9,a165l,false,Asystole,3.21308,-1.90748,false/Asystole/a165l-a1/a165l-a1.csv
