In [None]:
#Bryan Bennett
# 3/2/2021

In [1]:
from sagemaker import get_execution_role
import pandas as pd
import sagemaker
import boto3
import os

In [2]:
role = get_execution_role()
sess = sagemaker.Session()

In [3]:
bucketName = 'signet-ring-cell-bryan'
folderName = 'challenge2-colorectal-dataset/tissue-train-pos-v1/'

In [4]:
s3_client = boto3.client('s3')
bucket = s3_client.list_objects(Bucket=bucketName, Prefix=folderName)

In [5]:
len(bucket['Contents'])

500

In [6]:
DST_folderName = 'challenge2-colorectal-dataset/tissue-train-pos-pngs/'

In [13]:
#Create png copies of masks, someone check if this works with the file structure

from PIL import Image


#Create annotation file for each image
for img_num in range(len(bucket['Contents'])):
    if img_num%2==0:
            file_name = bucket['Contents'][img_num]['Key']
            png_fileName = file_name.split('/')[-1][:-4] + '.png'
            with Image.open('{}/{}'.format(bucketName, file_name)) as im:
                s3_client.upload_fileobj(im, bucketName, DST_folderName + png_fileName)
                #im.save(bucketName + DST_folderName + file_name + '.png')

FileNotFoundError: [Errno 2] No such file or directory: 'signet-ring-cell-bryan/challenge2-colorectal-dataset/tissue-train-pos-v1/18-00530B_2019-05-07 23_56_22-lv1-11712-16122-7372-7686.jpg'

In [15]:
# Bucket names for training and validation data



prefix = 'challenge2-colorectal-dataset/tumor_segmentation_model'

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'
train_annotation_channel = prefix + '/train_annotation'
validation_annotation_channel = prefix + '/validation_annotation'
label_map_channel = prefix + '/label_map'



s3_train_data = 's3://{}/{}'.format(bucketName, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucketName, validation_channel)
s3_train_annotation = 's3://{}/{}'.format(bucketName, train_annotation_channel)
s3_validation_annotation = 's3://{}/{}'.format(bucketName, validation_annotation_channel)
s3_label_map = 's3://{}/{}'.format(bucketName, label_map_channel)

In [16]:
import json

label_map = { 'scale': 1 }

with open('train_label_map.json', 'w') as lmfile:
    json.dump(label_map, lmfile)
    s3_client.upload_fileobj(lmfile, bucketName, label_map_channel + '/train_label_map.json')
    
with open('validation_label_map.json', 'w') as lmfile:
    json.dump(label_map, lmfile)
    s3_client.upload_fileobj(lmfile, bucketName, label_map_channel + '/validation_label_map.json')

RuntimeError: Input <_io.TextIOWrapper name='train_label_map.json' mode='w' encoding='UTF-8'> of type: <class '_io.TextIOWrapper'> is not supported.

In [None]:
# Split training and validation sets
from PIL import Image

ratio = 0.2 #Fraction of images to go in validation set (0.2 = 20%)

for img_num in range(len(bucket['Contents'])):
    
    file_name = bucket['Contents'][img_num]['Key']

    # it's a mask
    if img_num%2==0:    
        png_fileName = file_name.split('/')[-1][:-4] + '.png'
        with Image.open('{}/{}'.format(bucketName, file_name)) as im:     
            if(img_num < (img_num*ratio)):
                s3_client.upload_fileobj(im, bucketName, validation_annotation_channel + png_fileName)
            else:
                s3_client.upload_fileobj(im, bucketName, train_annotation_channel + png_fileName)
                
    # it's an image
    if img_num%2==1:
        with Image.open('{}/{}'.format(bucketName, file_name)) as im:     
            if(img_num < (img_num*ratio)):
                s3_client.upload_fileobj(im, bucketName, validation_annotation_channel + fileName)
            else:
                s3_client.upload_fileobj(im, bucketName, train_annotation_channel + fileName)
            

In [None]:
import glob

num_training_samples = len(glob.glob1(s3_train_data,"*.jpg"))
num_validation_samples = len(glob.glob1(s3_validation_data,"*.jpg"))

print('Num Train Images = ' + str(num_training_samples))
assert num_training_samples == len(glob.glob1(s3_train_annotation,"*.png"))

print('Num Validation Images = ' + str(num_validation_samples))
assert num_validation_samples == len(glob.glob1(s3_validation_annotation,"*.png"))

In [None]:
# Set output location where artifact will be stored
s3_output_location = 's3://{}/{}/output'.format(bucketName, 'challenge2-colorectal-dataset/tumor_segmentation_model')

In [None]:
# Get training uri

#Arguments in-order
training_image = sagemaker.image_uris.retrieve(region=sess.boto_region_name, framework='semantic-segmentation')
print (training_image)

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

training_image = get_image_uri(sess.boto_region_name, 'semantic-segmentation', repo_version="latest")
print (training_image)

In [None]:
# Training the model

# Sagemaker estimator will initiate the training job

ss_model = sagemaker.estimator.Estimator(training_image,
                                         role, 
                                         instance_count=1, 
                                         instance_type='ml.p3.2xlarge',
                                         volume_size = 50,
                                         max_run = 360000,
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [None]:
# Set our hyperparameters

ss_model.set_hyperparameters(backbone='resnet-50',  # This is the encoder. Other option is resnet-101
                             algorithm='fcn',  # This is the decoder. Other options are 'psp' and 'deeplab'                             
                             use_pretrained_model=1,  # Use the pre-trained model.
                             #crop_size=240,  # Size of image random crop.                             
                             num_classes=1,  # FIXME: Should this be 2 or 1? Normal tissue and cancerous?
                             epochs=30,  # Number of epochs to run.
                             learning_rate=0.0001,
                             optimizer='rmsprop',  # Other options include 'adam', 'rmsprop', 'nag', 'adagrad'.
                             lr_scheduler='poly',  # Other options include 'cosine' and 'step'.                           
                             mini_batch_size=8,  # Setup some mini batch size.
                             validation_mini_batch_size=5,
                             num_training_samples=num_training_samples,  # This is a mandatory parameter, 1464 in this case.
)

In [None]:
# Configure data channels from s3
distribution = 'FullyReplicated'


train_data = sagemaker.inputs.TrainingInput(s3_train_data, distribution=distribution, content_type='image/jpg', s3_data_type='S3Prefix')
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, distribution=distribution, content_type='image/jpg', s3_data_type='S3Prefix')
train_annotation = sagemaker.inputs.TrainingInput(s3_train_annotation, distribution=distribution, content_type='image/png', s3_data_type='S3Prefix')
validation_annotation = sagemaker.inputs.TrainingInput(s3_validation_annotation, distribution=distribution, content_type='image/png', s3_data_type='S3Prefix')
#label_map = sagemaker.inputs.TrainingInput(s3_label_map, distribution=distribution, content_type='application/json', s3_data_type='S3Prefix')

data_channels = {'train': train_data,
                 'validation': validation_data, 
                 'train_annotation': train_annotation,
                 'validation_annotation':validation_annotation,
                 'label_map': label_map_channel #FIXME: This could be a problem, maybe just the channel?
                }

In [None]:
# Train the model

ss_model.fit(data_channels, logs=True)