In [None]:
!conda install --file requirements.txt 


done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: | 

In [2]:
# Sagemaker Dependencies
import sagemaker
from sagemaker.experiments.run import Run, load_run
from sagemaker.remote_function import remote, RemoteExecutor

from sagemaker.remote_function import RemoteExecutor

# Training dependencies
from tqdm import tqdm
import network
import utils
import os
import random
import argparse
import numpy as np

from torch.utils import data
from datasets import LeafDataset
from utils import ext_transforms as et
from metrics import StreamSegMetrics, BinarySegMetrics

import torch
import torch.nn as nn
from utils.visualizer import Visualizer

from PIL import Image
import matplotlib
import matplotlib.pyplot as plt
from torchvision import transforms
import matplotlib.pyplot as plt

import uuid


sm_session = sagemaker.Session()
s3_root_folder = f"s3://{sm_session.default_bucket()}/segmentation/"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


ModuleNotFoundError: No module named 'tqdm'

In [None]:
def get_dataset(config):
    """ Dataset And Augmentation
    """
    if config["dataset"] == 'voc':
        train_transform = et.ExtCompose([
            # et.ExtResize(size=config["crop_size"]),
            et.ExtRandomScale((0.5, 2.0)),
            et.ExtRandomCrop(size=(config["crop_size"], config["crop_size"]), pad_if_needed=True),
            et.ExtRandomHorizontalFlip(),
            et.ExtToTensor(),
            et.ExtNormalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225]),
        ])
        if config["crop_val"]:
            val_transform = et.ExtCompose([
                et.ExtResize(config["crop_size"]),
                et.ExtCenterCrop(config["crop_size"]),
                et.ExtToTensor(),
                et.ExtNormalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225]),
            ])
        else:
            val_transform = et.ExtCompose([
                et.ExtToTensor(),
                et.ExtNormalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225]),
            ])
        train_dst = VOCSegmentation(root=config["data_root"], year=config["year"],
                                    image_set='train', download=config["download"], transform=train_transform)
        val_dst = VOCSegmentation(root=config["data_root"], year=config["year"],
                                  image_set='val', download=False, transform=val_transform)

    if config["dataset"] == 'custom':
        """
        Augmentation to the custom dataset
        """
        train_img_transform = transforms.Compose([
          #RandomCropAndPad(512),
          transforms.Resize((256, 256)),
          #transforms.RandomResizedCrop(size=(256, 256)),
          transforms.RandomHorizontalFlip(),
          #transforms.RandomRotation(degrees=(0, 360)),
          #transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
          transforms.ToTensor(),
          transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        train_mask_transform = transforms.Compose([
            #RandomCropAndPadMask(512),
            transforms.Resize((256, 256), interpolation=transforms.InterpolationMode.NEAREST),
            #transforms.RandomResizedCrop(size=(256, 256), interpolation=transforms.InterpolationMode.NEAREST),
            transforms.RandomHorizontalFlip(),
            #transforms.RandomRotation(degrees=(0, 360)),
            transforms.ToTensor(),
        ])

        val_img_transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        val_mask_transform = transforms.Compose([
            transforms.Resize((256, 256), interpolation=transforms.InterpolationMode.NEAREST),
            transforms.ToTensor(),
        ])

        train_dst = LeafDataset(root=config["custom_data_path"], image_set='train', img_transform=train_img_transform, mask_transform=train_mask_transform)
        val_dst = LeafDataset(root=config["custom_data_path"], image_set='val', img_transform=val_img_transform, mask_transform=val_mask_transform)


    return train_dst, val_dst


# ================================================= Validation ==============================================================
def validate(config, model, loader, device, metrics, criterion, ret_samples_ids=None):
    """Do validation and return specified samples"""
    metrics.reset()
    ret_samples = []

    total_loss = 0.0  # Initialize total loss
    num_batches = 0

    if config["save_val_results"]:
        if not os.path.exists('results'):
            os.mkdir('results')
        denorm = utils.Denormalize(mean=[0.485, 0.456, 0.406],
                                   std=[0.229, 0.224, 0.225])
        img_id = 0


    with torch.no_grad():
        for i, (images, labels) in tqdm(enumerate(loader)):

            images = images.to(device, dtype=torch.float32)
            labels = labels.to(device, dtype=torch.long)

            outputs = model(images)
            #preds = outputs.detach().max(dim=1)[1].cpu().numpy()
            
            outputs = torch.squeeze(outputs, dim=1)
            labels = labels.float()

            loss = criterion(outputs, labels)  # Compute loss
            total_loss += loss.item()  # Accumulate the total loss
            num_batches += 1

            probs = torch.sigmoid(outputs).detach()
            preds = (probs > 0.5).long().cpu().numpy()
            
            targets = labels.cpu().numpy()
            
            
            metrics.update(targets, preds)
            if ret_samples_ids is not None and i in ret_samples_ids:  # get vis samples
                ret_samples.append(
                    (images[0].detach().cpu().numpy(), targets[0], preds[0]))

            if config["save_val_results"]:
                for i in range(len(images)):
                    image = images[i].detach().cpu().numpy()
                    target = targets[i]
                    pred = preds[i]

                    # Decode the binary target and prediction masks to RGB images
                    target_rgb = loader.dataset.decode_target(target).astype(np.uint8)
                    pred_rgb = loader.dataset.decode_target(pred).astype(np.uint8)
                    
                    image = (denorm(image) * 255).transpose(1, 2, 0).astype(np.uint8)

                    Image.fromarray(image).save('results/%d_image.png' % img_id)
                    Image.fromarray(target).save('results/%d_target.png' % img_id)
                    Image.fromarray(pred).save('results/%d_pred.png' % img_id)

                    fig = plt.figure()
                    plt.imshow(image)
                    plt.axis('off')
                    plt.imshow(pred, alpha=0.7)
                    ax = plt.gca()
                    ax.xaxis.set_major_locator(matplotlib.ticker.NullLocator())
                    ax.yaxis.set_major_locator(matplotlib.ticker.NullLocator())
                    plt.savefig('results/%d_overlay.png' % img_id, bbox_inches='tight', pad_inches=0)
                    plt.close()
                    img_id += 1
        
        average_loss = total_loss / num_batches
        score = metrics.get_results()
    return score, ret_samples, average_loss


def smooth_labels(labels,smoothing=0.1):
    return labels * (1 - smoothing) + 0.5 * smoothing

@remote
def train_main(config):
    if config["dataset"].lower() == 'custom':
        config["num_classes"] = 2   # Multi-class with cross-entropy
        config["num_classes"] = 1   #Binary segmentation

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device: %s" % device)

    # Setup random seed
    torch.manual_seed(config["random_seed"])
    np.random.seed(config["random_seed"])
    random.seed(config["random_seed"])

    # Setup dataloader
    train_dst, val_dst = get_dataset(config)
    train_loader = data.DataLoader(
        train_dst, batch_size=config["batch_size"], shuffle=True, num_workers=2,
        drop_last=True)  # drop_last=True to ignore single-image batches.
    val_loader = data.DataLoader(
        val_dst, batch_size=config["val_batch_size"], shuffle=True, num_workers=2)
    print("Dataset: %s, Train set: %d, Val set: %d" %
          (config["dataset"], len(train_dst), len(val_dst)))

    # Set up model (all models are 'constructed at network.modeling)
    model = network.modeling.__dict__[config["model"]](num_classes=config["num_classes"], output_stride=config["output_stride"])
    if config["separable_conv"] and 'plus' in config["model"]:
        network.convert_to_separable_conv(model.classifier)
    utils.set_bn_momentum(model.backbone, momentum=0.01)

    # Set up metrics
    metrics = BinarySegMetrics()

    # Set up optimizer
    optimizer = torch.optim.SGD(params=[
        {'params': model.backbone.parameters(), 'lr': 0.1 * config["lr"]},
        {'params': model.classifier.parameters(), 'lr': config["lr"]},
    ], lr=config["lr"], momentum=0.9, weight_decay=config["weight_decay"])
    # optimizer = torch.optim.SGD(params=model.parameters(), lr=config["lr"], momentum=0.9, weight_decay=config["weight_decay"])
    # torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["lr"]_decay_step, gamma=config["lr"]_decay_factor)
    if config["lr_policy"] == 'poly':
        scheduler = utils.PolyLR(optimizer, config["total_itrs"], power=0.9)
    
    elif config["lr_policy"] == 'step':
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["step_size"], gamma=0.1)
    
    elif config["lr_policy"] == 'warmup':
        scheduler = utils.GradualWarmupLR(optimizer, multiplier=1, total_epoch=5, after_scheduler=None)
    
    elif config["lr_policy"] == 'chained': # Combine warm up and step
        warmup_scheduler = utils.GradualWarmupLR(optimizer, multiplier=1, total_epoch=5, after_scheduler=None)
        step_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["step_size"], gamma=0.1)
        scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup_scheduler, step_scheduler], milestones=[5])
    else:
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=0.1, gamma=0.1)

    # Set up criterion
    # criterion = utils.get_loss(config["loss_type"])
    if config["loss_type"] == 'focal_loss':
        criterion = utils.FocalLoss(ignore_index=255, size_average=True)
    elif config["loss_type"] == 'binary_focal':
        criterion = utils.BinaryFocalLoss() 
    elif config["loss_type"] == 'cross_entropy':
        criterion = nn.CrossEntropyLoss(ignore_index=255, reduction='mean')
    elif config["loss_type"] == 'BCE':
        criterion = nn.BCEWithLogitsLoss()  ##nn.BCELoss()

    def save_ckpt(path):
        """ save current model
        """
        torch.save({
            "cur_itrs": cur_itrs,
            "model_state": model.module.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "best_score": best_score,
        }, path)
        print("Model saved as %s" % path)

    utils.mkdir('checkpoints')
    # Restore
    best_score = 0.0
    cur_itrs = 0
    cur_epochs = 0
    if config["ckpt"] is not None and os.path.isfile(config["ckpt"]):
        # https://github.com/VainF/DeepLabV3Plus-Pytorch/issues/8#issuecomment-605601402, @PytaichukBohdan
        checkpoint = torch.load(config["ckpt"], map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint["model_state"])
        model = nn.DataParallel(model)
        model.to(device)
        if config["continue_training"]:
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            cur_itrs = checkpoint["cur_itrs"]
            best_score = checkpoint['best_score']
            print("Training state restored from %s" % config["ckpt"])
        print("Model restored from %s" % config["ckpt"])
        del checkpoint  # free memory
    else:
        print("[!] Retrain")
        model = nn.DataParallel(model)
        model.to(device)

    # =========================================   Train Loop   =====================================================
    vis_sample_id = np.random.randint(0, len(val_loader), config["vis_num_samples"],
                                      np.int32) if config["enable_vis"] else None  # sample idxs for visualization
    denorm = utils.Denormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # denormalization for ori images

    if config["test_only"]:
        model.eval()
        val_score, ret_samples, _ = validate(config, model=model, loader=val_loader, device=device, metrics=metrics, criterion=criterion, ret_samples_ids=vis_sample_id)
        print(metrics.to_str(val_score))
        return

    best_val_loss = float('inf')
    no_improve_epochs = 0
    patience = 5

    interval_loss = 0
    while True:  # cur_itrs < config["total_itrs"]:
        # =====  Train  =====
        model.train()
        cur_epochs += 1
        for (images, labels) in train_loader:
            cur_itrs += 1

            images = images.to(device, dtype=torch.float32)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()
            #print("Max label value:", labels.max().item())
            outputs = model(images)
            

            # Dimension to BCE
            outputs = torch.squeeze(outputs, dim=1)
            
            labels = labels.float()

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            np_loss = loss.detach().cpu().numpy()
            interval_loss += np_loss
            
            if (cur_itrs) % 10 == 0:
                interval_loss = interval_loss / 10
                print("Epoch %d, Itrs %d/%d, Loss=%f" %(cur_epochs, cur_itrs, config["total_itrs"], interval_loss))
                interval_loss = 0.0

            if (cur_itrs) % config["val_interval"] == 0:
                save_ckpt('checkpoints/latest_%s_%s_os%d.pth' % (config["model"], config["dataset"], config["output_stride"]))
                print("validation...")
                model.eval()
                val_score, ret_samples, current_val_loss = validate(config=config, model=model, loader=val_loader, device=device, metrics=metrics, criterion=criterion, ret_samples_ids=vis_sample_id)
                print("Validation Loss: %f" % current_val_loss)
                print(metrics.to_str(val_score))
                # if val_score['IoU Foreground'] > best_score:  # save best model
                #     best_score = val_score['IoU Foreground']
                #     save_ckpt('checkpoints/best_%s_%s_os%d.pth' %
                #               (config["model"], config["dataset"], config["output_stride"]))
                print("==========================================================")
                
                #================== Early stop =========================================
                if current_val_loss < best_val_loss:
                    best_val_loss = current_val_loss
                    no_improve_epochs = 0
                    # Save best model
                    save_ckpt('checkpoints/best_%s_%s_os%d.pth' % (config["model"], config["dataset"], config["output_stride"]))
                else:
                    no_improve_epochs += 1

                if no_improve_epochs >= patience:
                    print("Early stopping triggered after %d validations" % no_improve_epochs)
                    break
                #=======================================================================
                model.train()
            scheduler.step()

            if cur_itrs >= config["total_itrs"]:
                return
            
def lower_all_files(path):
    for file in os.listdir(path):
        os.rename(os.path.join(path, file), os.path.join(path,file.lower()))


In [None]:
!python3 split_dataset.py

In [None]:

# replacement for argparse-d params 
config = {
    "dataset":"custom",
    "data_root": None,
    "custom_data_path": "local_data/",
    "model": "deeplabv3plus_mobilenet",
    "separable_conv": True,
    "lr_policy": "chained",
    "batch_size": 16,
    "continue_training": False,
    "loss_type": "BCE",
    "lr": 1e-2,
    "total_iters": 800,
    "val_interval": 10,
    "random_seed": 42,
    "val_batch_size": 1,
    "output_stride": 16,
    "weight_decay": 1e-4,
    "ckpt": None,
    "enable_vis": False,
    "vis_num_samples": 2,
    "test_only": False,
    "vis_port": 13570,
    "vis_env": "main",
    "step_size": 10000,
    "total_itrs": 800,
    "save_val_results": False,
    "print_interval": 10,
    "download": True,
    
}


with Run(
    experiment_name="local-tests",
    run_name=f"local-tests-{uuid.uuid4()}",
    sagemaker_session=sm_session,
) as run:
    trained_model = train_main(config)


In [8]:
# Set path to config file
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

In [10]:
default_config = {
    "dataset":"custom",
    "data_root": None,
    "custom_data_path": "local_data/",
    "model": "deeplabv3plus_mobilenet",
    "separable_conv": True,
    "lr_policy": "chained",
    "batch_size": 16,
    "continue_training": False,
    "loss_type": "BCE",
    "lr": 1e-2,
    "total_iters": 800,
    "val_interval": 100,
    "random_seed": 42,
    "val_batch_size": 1,
    "output_stride": 16,
    "weight_decay": 1e-4,
    "ckpt": None,
    "enable_vis": False,
    "vis_num_samples": 2,
    "test_only": False,
    "vis_port": 13570,
    "vis_env": "main",
    "step_size": 10000,
    "total_itrs": 800,
    "save_val_results": False,
    "print_interval": 10,
    "download": True,
    "val_interval": 100
    
}
with RemoteExecutor(
    max_parallel_jobs=2, keep_alive_period_in_seconds=60, s3_root_uri=s3_root_folder
) as executor:
    futures = {}
    for run_name, lr in [
        (f"run-{uuid.uuid4()}", 0.3),
        (f"run-{uuid.uuid4()}", 1),
        (f"run-{uuid.uuid4()}", 3.0),
    ]:
        with Run(
            experiment_name="seg-lr-test", run_name=run_name, sagemaker_session=sm_session
        ) as run:
            run.log_artifact(
                name="raw_data",
                value=f"https://sagemaker-example-files-prod-{sm_session.boto_region_name}.s3.amazonaws.com/datasets/image/MNIST/",
                is_output=False,
            )
            test_config = default_config
            test_config["lr"] = lr
            futures[run_name] = executor.submit(train_main, test_config)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


ValueError: Unable to load the config file from the location: /home/ec2-user/SageMaker/treetracker-machine-learning/segmentationProvide a valid file path