## Prepare the Dataset

In [6]:
# Project folder inside the bucket and the sub-folder used for each channel.
project_name = "pn_deploy"
train_prefix = "train"
val_prefix = "validation"

# Both channel URIs follow the same bucket/project/channel layout and end
# with a trailing slash.
s3_channel_template = "s3://{}/{}/{}/"
train_data = s3_channel_template.format(bucket, project_name, train_prefix)
validation_data = s3_channel_template.format(bucket, project_name, val_prefix)

### Download or Update Data from S3

In [54]:
def get_file_list(bucket_name, prefix):
    """Return the ``s3://`` URIs of the objects listed under *prefix*.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix to filter on, e.g. ``"pn_deploy/normal_1000"``.
            A trailing slash is accepted.

    Returns:
        A list of ``"s3://<bucket>/<key>"`` strings, one per listed object,
        without the entry for the prefix folder itself
        (``"s3://<bucket>/<prefix>/"``) when such an object is listed.
    """
    listed_objects = boto3.resource('s3').Bucket(bucket_name).objects.filter(Prefix=prefix)

    # Normalise the prefix first so that "folder" and "folder/" both produce
    # the same root-folder URI; otherwise a trailing slash would build
    # ".../folder//" and the root entry would never be filtered out.
    root_location = "s3://{}/{}/".format(bucket_name, prefix.rstrip('/'))

    location_list = []
    for obj in listed_objects:
        data_location = "s3://{}/{}".format(obj.bucket_name, obj.key)
        # Remove the root folder path
        if data_location != root_location:
            location_list.append(data_location)
    return location_list

In [1]:
# List the S3 URIs of the source images, one list per class folder.
list_normal, list_pneumonia = [
    get_file_list(bucket, class_prefix)
    for class_prefix in ("pn_deploy/normal_1000", "pn_deploy/pneumonia_1000")
]

In [73]:
# Download every listed object into the local folder of its class:
# normal images first, then pneumonia images.
for s3_uris, local_dir in (
    (list_normal, "pn/data/normal/"),
    (list_pneumonia, "pn/data/pneumonia/"),
):
    for uri in s3_uris:
        data_source1 = S3Downloader.download(
            s3_uri=uri,
            local_path=local_dir,
            sagemaker_session=session,
        )

### Generate annotations

In [74]:
# generate annotations
def _write_annotations(image_dir, label, annotation_path):
    """Write one ``"<image name without extension> <label>"`` line per image.

    Args:
        image_dir: Local folder whose files are annotated.
        label: Class name written next to every image name.
        annotation_path: Text file to (over)write with the annotations.
    """
    names = {}
    for entry in os.listdir(image_dir):
        # Hidden entries (for example a ".ipynb_checkpoints" folder) and
        # sub-directories are not images; annotating them would produce an
        # empty or bogus image name.
        if entry.startswith('.') or not os.path.isfile(os.path.join(image_dir, entry)):
            continue
        # splitext only strips the last extension, so names that contain
        # additional dots no longer break the two-value unpacking.
        stem, _ = os.path.splitext(entry)
        names[stem] = label

    # Create the annotation folder on first use instead of failing in open().
    annotation_dir = os.path.dirname(annotation_path)
    if annotation_dir:
        os.makedirs(annotation_dir, exist_ok=True)

    with open(annotation_path, 'w') as f:
        for stem, class_name in names.items():
            f.write(str(stem) + " " + str(class_name) + "\n")


_write_annotations('pn/data/normal', 'normal', 'pn/annotations/normal.txt')
_write_annotations('pn/data/pneumonia', 'pneumonia', 'pn/annotations/pneumonia.txt')

In [75]:
# read annotations
def get_annotations(file_path, annotations=None):
    """Load an annotation file into an ``{image file name: class name}`` dict.

    Each non-blank line of the file must look like ``"<image name> <class>"``.
    The ``.jpeg`` extension is appended to the image name to form the key.

    Args:
        file_path: Path of the annotation text file to read.
        annotations: Optional dict to add the entries to. When omitted, a
            fresh dict is created for this call.

    Returns:
        The dict that received the entries (the one passed in, or a new one).

    Raises:
        ValueError: If a non-blank line contains no space separator.
    """
    # A ``{}`` default would be created once and shared by every call, so
    # entries from one file would leak into the result for the next file.
    if annotations is None:
        annotations = {}

    with open(file_path, 'r') as f:
        rows = f.read().splitlines()

    for row in rows:
        # Tolerate blank lines, such as a trailing empty line.
        if not row.strip():
            continue
        # The class name is the last space-separated token.
        image_name, class_name = row.rsplit(' ', 1)
        annotations[image_name + '.jpeg'] = class_name

    return annotations

In [77]:
# Load each class's annotation file into its own, freshly created dict.
annotations_normal = get_annotations('pn/annotations/normal.txt', {})
annotations_pneumonia = get_annotations('pn/annotations/pneumonia.txt', {})

# Report how many examples were loaded per class.
total_count = len(annotations_normal)
print('Total normal examples', total_count)
total_count = len(annotations_pneumonia)
print('Total pneumonia examples', total_count)

Total normal examples 1000
Total pneumonia examples 1000


In [78]:
# Show one (image file name, class name) pair from each class as a sanity check.
for class_annotations in (annotations_normal, annotations_pneumonia):
    first_item = next(iter(class_annotations.items()))
    print(first_item)

('NORMAL2-IM-0588-0001.jpeg', 'normal')
('person55_bacteria_263.jpeg', 'pneumonia')


### Split Data and Upload

In [21]:
# split and copy file
import os
classes = ['normal', 'pneumonia']
sets = ['train', 'validation']
root_dir = 'pn/custom_data'

# Create <root_dir>/<set>/<class> for every combination. makedirs also
# creates any missing parent (including the parent of root_dir) and does
# nothing when a folder already exists, so this is safe to re-run.
for set_name in sets:
    for class_name in classes:
        folder = os.path.join(root_dir, set_name, class_name)
        os.makedirs(folder, exist_ok=True)

In [86]:
# Randomly assign every image to a set (a draw below 20 out of 0-99 goes to
# 'validation', anything else to 'train') and copy it to
# <root_dir>/<set>/<class>/<image>.
for class_annotations, source_dir in (
    (annotations_normal, 'pn/data/normal'),
    (annotations_pneumonia, 'pn/data/pneumonia'),
):
    for image, class_name in class_annotations.items():
        target_set = 'validation' if random.randint(0, 99) < 20 else 'train'

        # Re-running this cell draws a new random split. Remove any copy a
        # previous run left in another set so that an image never ends up
        # in both 'train' and 'validation'.
        for other_set in sets:
            if other_set == target_set:
                continue
            stale_path = os.path.join(root_dir, other_set, class_name, image)
            if os.path.exists(stale_path):
                os.remove(stale_path)

        target_path = os.path.join(root_dir, target_set, class_name, image)
        shutil.copy(os.path.join(source_dir, image), target_path)

In [3]:
# Count the files in every <root_dir>/<set>/<class> folder and accumulate
# the totals per set.
sets_counts = dict.fromkeys(('train', 'validation'), 0)

for split in sets:
    for label in classes:
        class_dir = os.path.join(root_dir, split, label)
        n_images = len(os.listdir(class_dir))
        print(class_dir, 'has', n_images, 'images')
        sets_counts[split] = sets_counts[split] + n_images

print(sets_counts)

In [2]:
# Session used for the upload, plus the execution role and target bucket.
bucket_name = bucket
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

print('Uploading to S3..')
# Upload the local split folders (root_dir) under the 'pn_deploy' key prefix.
s3_data_path = sagemaker_session.upload_data(
    path=root_dir,
    bucket=bucket_name,
    key_prefix='pn_deploy',
)
print('Uploaded to', s3_data_path)

---