In [3]:
from sagemaker import get_execution_role
from bs4 import BeautifulSoup 
import pandas as pd
import sagemaker
import boto3
import json
import os

In [4]:
role = get_execution_role()
sess = sagemaker.Session()

In [5]:
bucketName = 'signet-ring-cell-bryan'
folderName = 'challenge1-signetringcell-dataset/sig-train-pos/sig-train-pos/'

In [6]:
s3_client = boto3.client('s3')
bucket = s3_client.list_objects(Bucket=bucketName, Prefix=folderName)

In [7]:
# First five xml files) in our dataset
for img_num in range(1,11):
    if img_num%2==1:
        print(bucket['Contents'][img_num]['Key'])

challenge1-signetringcell-dataset/sig-train-pos/sig-train-pos/2018_64982_1-3_2019-02-25 21_57_36-lv0-33516-59515-2003-2010.xml
challenge1-signetringcell-dataset/sig-train-pos/sig-train-pos/2018_64982_1-3_2019-02-25 21_57_36-lv0-34589-61706-2030-2044.xml
challenge1-signetringcell-dataset/sig-train-pos/sig-train-pos/2018_64982_1-3_2019-02-25 21_57_36-lv0-36515-58465-2013-2071.xml
challenge1-signetringcell-dataset/sig-train-pos/sig-train-pos/2018_64982_1-3_2019-02-25 21_57_36-lv0-37528-60747-2016-2000.xml
challenge1-signetringcell-dataset/sig-train-pos/sig-train-pos/2018_64982_1-3_2019-02-25 21_57_36-lv0-38368-62991-2040-2016.xml


In [8]:
len(bucket['Contents'])

154

In [9]:
import s3fs
fs = s3fs.S3FileSystem()

In [10]:
# Function to create json annotation file from xml file and move that json file to s3
def xml_to_json(data, bucketName, folderName, fileName):
    """
    data: content being read
    bucketName: bucket being read from
    folderName: folder being wrote to
    file_name: xml file being read
    """
    Bs_data = BeautifulSoup(data, "xml") 
    zz=Bs_data.find_all("size")
    #print(zz)
    yyyy=str(zz)
    size=yyyy.partition('\n')[2]
    #print(size)
    wid=size.split("</width>",1)[0] 
    wid=wid.split("<width>",1)[1]
    #print(wid)
    hei=size.split("</height>",1)[0] 
    hei=hei.split("<height>",1)[1]
    #print(hei)
    depth=size.split("</depth>",1)[0] 
    depth=depth.split("<depth>",1)[1]
    #print(depth)
    b_unique = Bs_data.find_all('bndbox') 
    data1={}
   # data1['files']=file_name
# I think we nned to change it to jpeg, i don't know if that would fix it
    data1['files']=fileName.split('/')[-1][:-4] + '.jpg'
    data1['image_size']=[]
    data1['image_size'].append({
        'width' : wid,
        'height' : hei,
        'depth' : depth
    })
    data1['annotations']=[]
    for i in range(len(b_unique)):
        z=b_unique[i]
        y=str(z)
        yy=y.partition('\n')[2]
        xmin=yy.split("</xmin>",1)[0]
        xmin=xmin.split("<xmin>",1)[1]
        ymin=yy.split("</ymin>",1)[0] 
        ymin=ymin.split("<ymin>",1)[1]
        xmax=yy.split("</xmax>",1)[0] 
        xmax=xmax.split("<xmax>",1)[1]
        ymax=yy.split("</ymax>",1)[0] 
        ymax=ymax.split("<ymax>",1)[1]
        left=int(xmin)
        top=int(ymax)
        height=str(int(ymax)-int(ymin))
        width=str(int(xmax)-int(xmin))
        data1['annotations'].append({
            'class_id': 0,
            'left': left,
            'top' : top,
            'width' : width,
            'height' :height
        })
    data1['categories']=[]
    data1['categories'].append({
        'class_id' : 0,
        'name' : "signet ring cell"
    })
    
    json_fileName = fileName.split('/')[-1][:-4] + '.json'
    with open(json_fileName, 'w') as outfile:
        print("writing json file " + json_fileName)
        json.dump(data1, outfile) 
        
    with open(json_fileName, 'rb') as f:
        print('sending file to s3')
        s3_client.upload_fileobj(f, bucketName, folderName + json_fileName)
        print('file successfully sent')
        
    os.remove(json_fileName)
    

In [11]:
SRC_folderName = 'challenge1-signetringcell-dataset/src_detection_model/train_annotation/'

In [12]:
# Create annotation file for each image
# for img_num in range(len(bucket['Contents'])):
#     if img_num%2==1:
#         file_name = bucket['Contents'][img_num]['Key']
#         with fs.open('{}/{}'.format(bucketName, file_name)) as f:
#             data = f.read()
#             xml_to_json(data, bucketName, SRC_folderName, file_name)

In [20]:
# Bucket names for training and validation data

prefix = 'challenge1-signetringcell-dataset/src_detection_model'

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'
train_annotation_channel = prefix + '/train_annotation'
validation_annotation_channel = prefix + '/validation_annotation'


s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)
s3_train_annotation = 's3://{}/{}'.format(bucket, train_annotation_channel)
s3_validation_annotation = 's3://{}/{}'.format(bucket, validation_annotation_channel)

In [21]:
# Set up output location where object detection model will be stored
s3_output_location = 's3://{}/{}/output'.format(bucketName, 'challenge1-signetringcell-dataset/src_detection_model')

In [22]:
# Get our training uri

training_image = sagemaker.image_uris.retrieve(region=sess.boto_region_name, framework='object-detection')
print (training_image)

825641698319.dkr.ecr.us-east-2.amazonaws.com/object-detection:1


In [27]:
# Training the model

# Sagemaker estimator will launch the training job

od_model = sagemaker.estimator.Estimator(training_image,
                                         role, 
                                         instance_count=1, 
                                         instance_type='ml.t3.medium',
                                         volume_size = 50,
                                         max_run = 360000,
                                         input_mode = 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [28]:
od_model.set_hyperparameters(base_network='resnet-50',
                             use_pretrained_model=1,
                             num_classes=1,
                             mini_batch_size=16,
                             epochs=30,
                             learning_rate=0.001,
                             lr_scheduler_step='10',
                             lr_scheduler_factor=0.1,
                             optimizer='sgd',
                             momentum=0.9,
                             weight_decay=0.0005,
                             overlap_threshold=0.5,
                             nms_threshold=0.45,
                             num_training_samples=62)

In [29]:
# Configure our data channels from the s3 buckets

train_data = sagemaker.inputs.TrainingInput(s3_train_data, distribution='FullyReplicated', content_type='image/jpeg', s3_data_type='S3Prefix')
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, distribution='FullyReplicated', content_type='image/jpeg', s3_data_type='S3Prefix')
train_annotation = sagemaker.inputs.TrainingInput(s3_train_annotation, distribution='FullyReplicated', content_type='image/jpeg', s3_data_type='S3Prefix')
validation_annotation = sagemaker.inputs.TrainingInput(s3_validation_annotation, distribution='FullyReplicated', content_type='image/jpeg', s3_data_type='S3Prefix')

data_channels = {'train': train_data, 'validation': validation_data, 
                 'train_annotation': train_annotation, 'validation_annotation':validation_annotation}

In [30]:
od_model.fit(inputs=data_channels, logs=True)

ClientError: An error occurred (413) when calling the CreateTrainingJob operation: 