In [None]:
import boto3           # For interacting with S3
import pandas as pd
import sys             # Python system library needed to load custom functions
import json
# Imports to run Sagemaker training jobs
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
from sagemaker.session import Session

import matplotlib.pyplot as plt  # Used for plotting
import mmcv  # Object detection framework
import os  # Interaction with the file system
import pandas as pd  # Home of the DataFrame construct, _the_ most important object for Data Science
import sys  # Python system library needed to load custom functions

from matplotlib.patches import Rectangle  # Allows drawing the bounding boxes of the worm sections
from mmcv import Config  # Loading and accessing MMDetection configuration files
from mmdet.apis import inference_detector, init_detector, train_detector, set_random_seed  # Part of the MMDetection framework
from mmdet.datasets import build_dataset  # Part of the MMDetection framework
from mmdet.models import build_detector  # Part of the MMDetection framework

from PIL import Image  # For loading image files
from tqdm import tqdm  # for timing a for loop
from sklearn.model_selection import train_test_split
from mmdet.utils import AvoidCUDAOOM

In [None]:
# Make the project's source directory importable so the local modules below resolve.
# Guarded so that re-running this cell does not keep appending the same entry.
if '../src' not in sys.path:
    sys.path.append('../src')

from config import DEFAULT_BUCKET, DEFAULT_REGION  # The name of the S3 bucket that contains the training data
from detection_util import create_predictions
from gdsc_util import (
    PROJECT_DIR,
    create_encrypted_bucket,
    download_and_extract_model,
    download_directory,
    download_file,
    extract_hyperparams,
    load_sections_df,
    set_up_logging,
    upload_to_s3,
)
from PredictionEvaluator import PredictionEvaluator

set_up_logging()  # Sets up logging to console and .log

# Imported only after set_up_logging() so that any logging gdsc_score may do at
# import time already goes through the configured logging.
from gdsc_score import get_leaderboard_score

In [None]:
# Experiment module and the job name derived from its script file.
import training_config as exp

entry_point = 'training_config.py'

# AWS does not allow . and _ as experiment names: keep the part before the
# first dot and swap underscores for dashes.
script_stem = entry_point.split('.', 1)[0]
exp_name = script_stem.replace('_', '-')
exp_name

In [None]:
# Account id of the identity the notebook is currently running as.
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')

# Execution role that is later handed to the estimator.
role = get_execution_role()

# SageMaker client pinned to the project's default region, wrapped in a Session.
sm_client = boto3.client(service_name="sagemaker", region_name=DEFAULT_REGION)
sess = Session(sagemaker_client=sm_client)

In [None]:
# Name of the S3 bucket used for both the training data and the training outputs.
S3_BUCKET = "gdsc5"

# The data in each input channel will be copied to the training job container.
input_channels = {
    "train": f"s3://{S3_BUCKET}/data",
}

# Location where the training runs are written.
# NOTE: the bucket must already exist; create it first if it does not.
s3_output_location = f"s3://{S3_BUCKET}/train_runs"

In [None]:
#import boto3 # uploading the data to my own bucket
#client = boto3.client('s3')
# upload_to_s3('../data/actual_train.csv', 'data/actual_train.csv', 'gdsc5')
# upload_to_s3('../data/actual_test.csv', 'data/actual_test.csv', 'gdsc5')

### Setting hyperparameters for the training job

In [None]:
# Reload the experiment module so that small edits to it are picked up
# without restarting the kernel.
import importlib

exp = importlib.reload(exp)

In [None]:
# Load the config up front so that any errors in it surface here.
data_dir = PROJECT_DIR / 'data'
data_folder = str(data_dir)
cfg, base_file = exp.load_config(data_folder)

# extract_hyperparams is a custom function that parses the training script and
# extracts its config; the base file reported by load_config is added on top.
hyperparameters = extract_hyperparams(entry_point)
hyperparameters['base_file'] = base_file

In [None]:
#cfg['data']['test']['pipeline'][1]

In [None]:
# Print the loaded config in its pretty-printed text form for a visual sanity check.
print(cfg.pretty_text)

In [None]:
# writing the config file 
# text_file = open("../src/custom/exp_4a1_conf.py", "w")
# text_file.write(cfg.pretty_text)
# text_file.close()

In [None]:
# Build the training dataset from the config and wrap it in a list;
# the last line displays the result as the cell output.
train_dataset = build_dataset(cfg.data.train)
datasets = [train_dataset]
datasets


# model = build_detector(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))

# model.CLASSES = datasets[0].CLASSES  # Add an attribute for visualization convenience

# train_detector(model, datasets, cfg, validate=True)


Finally, we need to specify which metrics we want SageMaker to track automatically. To do this, we set up [regular expressions](https://en.wikipedia.org/wiki/Regular_expression) that are applied to the training logs.
The values they capture are then stored and made visible in the training job.

In [None]:
## for coco dataset format evaluations
# # Output format is:
# # INFO:mmdet:Epoch [4][50/497]#011lr: 5.000e-03, eta: 0:10:23, time: 1.638, data_time: 1.433, memory: 1863, loss_rpn_cls: 0.0897, loss_rpn_bbox: 0.0781, loss_cls: 0.2336, acc: 90.8691, loss_bbox: 0.3404, loss: 0.7418
# metrics = [
# {"Name": "train:loss_rpn_cls", "Regex": "loss_rpn_cls: ([0-9\.]+)"},
# {"Name": "train:loss_rpn_bbox", "Regex": "loss_rpn_bbox: ([0-9\.]+)"},
# {"Name": "train:loss_cls", "Regex": "loss_cls: ([0-9\.]+)"},
# {"Name": "train:loss_bbox", "Regex": "loss_bbox: ([0-9\.]+)"},
# {"Name": "train:loss", "Regex": "loss: ([0-9\.]+)"},
# {"Name": "train:accuracy", "Regex": "acc: ([0-9\.]+)"},
# {"Name": "train:epoch", "Regex": "Epoch (\[[0-9\.]+\])"},
# {"Name": "val:epoch", "Regex": "Epoch\(val\) (\[[0-9]+\])"},
# {"Name": "val:AP 0.75", "Regex": "Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=400 ] = ([0-9\.]+)"},
# {"Name": "val:AR 0.75", "Regex": "Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=400 ] = ([0-9\.]+)"},
# {"Name": "val:bbox", "Regex": "bbox_mAP_50: ([0-9\.]+)"},
# {"Name": "val:bbox", "Regex": "bbox_mAP_75: ([0-9\.]+)"},

# ]

In [None]:
# Metric definitions handed to the estimator: each "Regex" is matched against the
# training logs and its single capture group supplies the value stored under "Name".
# Raw strings keep the regex backslashes literal instead of relying on Python
# passing unknown escapes such as "\." and "\[" through (which emits warnings).
metrics = [
    {"Name": "train:loss_rpn_cls", "Regex": r"loss_rpn_cls: ([0-9\.]+)"},
    {"Name": "train:loss_rpn_bbox", "Regex": r"loss_rpn_bbox: ([0-9\.]+)"},
    {"Name": "train:loss_cls", "Regex": r"loss_cls: ([0-9\.]+)"},
    {"Name": "train:loss_bbox", "Regex": r"loss_bbox: ([0-9\.]+)"},
    {"Name": "train:loss", "Regex": r"loss: ([0-9\.]+)"},
    {"Name": "train:accuracy", "Regex": r"acc: ([0-9\.]+)"},
    # The brackets stay outside the capture group so that only the bare epoch
    # number is captured; "[4]" would not be a parseable numeric value.
    {"Name": "train:epoch", "Regex": r"Epoch \[([0-9\.]+)\]"},
    {"Name": "val:epoch", "Regex": r"Epoch\(val\) \[([0-9]+)\]"},
    {"Name": "val:mAP", "Regex": r"mAP: ([0-9\.]+)"},
]

In [None]:
# Describe the training job: what code to run, where to run it, and what to record.
estimator = PyTorch(
    # -- code shipped to the job --
    source_dir="../src",              # All code in this folder will be copied over
    entry_point=entry_point,          # This script will be called by the training job
    hyperparameters=hyperparameters,
    # -- runtime environment --
    image_uri=f"954362353459.dkr.ecr.{DEFAULT_REGION}.amazonaws.com/sm-training-custom:torch-1.8.1-cu111-noGPL",
    role=role,
    instance_type="ml.g4dn.xlarge",   # a GPU instance
    instance_count=1,
    volume_size=45,
    # -- naming, outputs and tracking --
    base_job_name=exp_name,
    output_path=s3_output_location,
    metric_definitions=metrics,
    container_log_level=20,           # 10=debug, 20=info
)

After creating the estimator, we need to call its `.fit` method to start the training job. As this might take a while, we set `wait=False` so that the notebook does not wait for the training job to finish and we can continue working.

In [None]:
# Start the training job. wait=False returns control to the notebook right away,
# so we can continue working while the job runs on another machine.
estimator.fit(inputs=input_channels, wait=False)