In [1]:
import sys
colab = False

if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    !pip install certifi==2022.6.15 charset-normalizer==2.1.1 imageio==2.21.1 jmespath==1.0.1 joblib==1.1.1 networkx==2.8.6 numpy==1.23.5 opencv-python-headless==4.7.0.72 \
    packaging==21.3 Pillow==9.4.0 "protobuf>3.8,<=3.20.12" pyparsing==3.0.9 pyrr==0.10.3 python-dateutil==2.8.2 PyWavelets==1.5.0 PyYAML==6.0 qudida==0.0.4 requests==2.28.1 \
    scikit-image==0.20.0 scikit-learn==1.2.2 scipy==1.10.1 simplejson==3.18.4 six==1.16.0 tensorboardX==2.5.1 threadpoolctl==3.1.0 tifffile==2022.8.12 typing_extensions==4.3.0 \
    urllib3==1.26.12 boto3==1.24.58 botocore==1.27.58 s3transfer==0.6.0 torch==2.0.1 torchvision==0.15.2 albumentations[imgaug]==1.2.1
    colab = True

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import argparse
import configparser
import datetime
import os
import random
import warnings
from collections import OrderedDict
from math import pi
from os.path import exists
from pathlib import Path

import albumentations as A
import cv2
import torch
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from torch.cuda import amp
from tqdm.notebook import tqdm


# Conditional that checks if it's running on Colab and sets the directory accordingly
if colab:
    os.chdir('/content/drive/MyDrive/dope-training_new')
    print("Running on Google Colab")
else:
    # Add all parent directories to the system path
    for parent in Path.cwd().parents:
        sys.path.append(str(parent))

from auxiliar_dope.model import DopeNetwork
from auxiliar_dope.utils import MultipleVertexJson, save_image
import src.args_parser as ar

# Import the necessary modules
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["NO_ALBUMENTATIONS_UPDATE"] = "1"

full_path = os.getcwd()
sys.path.append(full_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Running on Google Colab
Using device: cuda


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Parse arguments
opt = ar.parse_args(full_path, colab)

In [5]:
##################################################
# TRAINING CODE MAIN STARTING HERE
##################################################

print ("start:" , datetime.datetime.now().time())

if opt.pretrained in ['false', 'False']:
	opt.pretrained = False

try:
    os.makedirs(opt.outf)
except OSError:
    pass

if opt.manualseed is None:
    opt.manualseed = random.randint(1, 10000)

# save the hyper parameters passed
with open (opt.outf+'/header.txt','w') as file:
    file.write(str(opt)+"\n")

with open (opt.outf+'/header.txt','w') as file:
    file.write(str(opt))
    file.write("seed: "+ str(opt.manualseed)+'\n')
    with open (opt.outf+'/test_metric.csv','w') as file:
        file.write("epoch, passed,total \n")

# set the manual seed.
random.seed(opt.manualseed)
torch.manual_seed(opt.manualseed)
torch.cuda.manual_seed_all(opt.manualseed)

additional_targets = {
    'centroids': 'keypoints'
}

def to_tensor(x, **kwargs):
    return x.transpose(2, 0, 1).astype('float32')

def scale_down(x, **kwargs):
    return cv2.resize(x, (x.shape[0] // 8, x.shape[1] // 8))


img_size = (480,640)

mean = [0.45, 0.45, 0.45]
std = [0.25, 0.25, 0.25]

start: 22:29:54.566151


In [None]:
transform = A.Compose([
    A.ShiftScaleRotate(scale_limit=0.1, rotate_limit=0.1, shift_limit=0.1, p=0.1, border_mode=0),

    A.RandomCrop(height=img_size[0], width=img_size[1]),

    A.IAAAdditiveGaussianNoise(p=0.2),
    A.IAAPerspective(p=0.5),

    A.OneOf(
        [
            A.CLAHE(p=1),
            A.RandomBrightness(p=1),
            A.RandomGamma(p=1),
        ],
        p=0.9,
    ),

    A.OneOf(
        [
            A.IAASharpen(p=1),
            A.Blur(blur_limit=3, p=1),
            A.MotionBlur(blur_limit=3, p=1),
        ],
        p=0.9,
    ),

    A.OneOf(
        [
            A.RandomContrast(p=1),
            A.HueSaturationValue(p=1),
        ],
        p=0.9,
    )],
additional_targets=additional_targets,
keypoint_params=A.KeypointParams("xy", remove_invisible=False))

preprocessing_transform = A.Compose([
        A.Normalize(mean=mean, std=std),
        A.Lambda(mask=scale_down),
        A.Lambda(image=to_tensor, mask=to_tensor)],
    additional_targets=additional_targets,
    keypoint_params=A.KeypointParams("xy", remove_invisible=False))

#load the dataset using the loader in utils_pose
trainingdata = None
if not opt.data == "":
    train_dataset = MultipleVertexJson(
        root = opt.data,
        preprocessing_transform=preprocessing_transform,
        objectsofinterest=opt.object,
        sigma = opt.sigma,
        data_size = opt.datasize,
        save = opt.save,
        transform = transform,
    )

    trainingdata = torch.utils.data.DataLoader(train_dataset,
        batch_size = opt.subbatchsize,
        shuffle = True,
        num_workers = opt.workers,
        pin_memory = True,
        drop_last=True
    )

    """train_dataset.test = True
    for i in range(len(trainingdata)):
        images = next(iter(trainingdata))

        save_image(images['image'],'{}/train_{}.png'.format( opt.outf,str(i).zfill(5)),mean=mean[0],std=std[0])
        print ("Saving batch %d" % i)
    train_dataset.test = False"""

    if opt.save:
        print ('things are saved in {}'.format(opt.outf))
        quit()


testingdata = None
if not opt.datatest == "":
    test_dataset = MultipleVertexJson(
            root = opt.datatest,
            preprocessing_transform=preprocessing_transform,
            objectsofinterest=opt.object,
            sigma = opt.sigma,
            data_size = opt.datasize,
            save = opt.save,
            test = True
            )

    testingdata = torch.utils.data.DataLoader(
        test_dataset,
        batch_size = opt.subbatchsize // 2,
        shuffle = True,
        num_workers = opt.workers,
        pin_memory = True,
        drop_last=True)


if not trainingdata is None:
    print('training data: {} batches'.format(len(trainingdata)))
if not testingdata is None:
    print ("testing data: {} batches".format(len(testingdata)))

net = DopeNetwork(pretrained=opt.pretrained)
net = net.to(device)


if opt.net != '':
    # Load state dict from file
    state_dict = torch.load(opt.net, map_location='cuda')

    # If the state dict keys start with "module.", remove that prefix
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_key = k[7:] if k.startswith("module.") else k
        new_state_dict[new_key] = v

    # Use the new_state_dict directly
    net.load_state_dict(new_state_dict)


parameters = filter(lambda p: p.requires_grad, net.parameters())
optimizer = optim.Adam(parameters,lr=opt.lr)

with open (opt.outf+'/loss_train.csv','w') as file:
    file.write('epoch,batchid,loss\n')

with open (opt.outf+'/loss_test.csv','w') as file:
    file.write('epoch,batchid,loss\n')

nb_update_network = 0

def _runnetwork(epoch, loader, train=True, scaler=None, pbar=None):
    global nb_update_network
    # net
    if train:
        net.train()
    else:
        net.eval()

    if train:
        optimizer.zero_grad()
    for batch_idx, targets in enumerate(loader):

        data = Variable(targets['image'].to(device).float())

        with amp.autocast():
            output_belief, output_affinities = net(data)

            target_belief = Variable(targets['beliefs'].to(device).float())
            target_affinity = Variable(targets['affinities'].to(device).float())

            loss = None

            for l in output_belief:
                if loss is None:
                    loss = ((l - target_belief) ** 2).mean()
                else:
                    loss += ((l - target_belief) ** 2).mean()


            # Affinities loss
            for l in output_affinities: #output, each belief map layers.
                loss_tmp = ((l - target_affinity) * (l-target_affinity)).mean()
                loss += loss_tmp

        if train:
            scaler.scale(loss).backward()
            if batch_idx % (opt.batchsize // opt.subbatchsize) == 0:
                if train:
                    scaler.step(optimizer)
                    scaler.update()
                    nb_update_network+=1
                    optimizer.zero_grad()

        if train:
            namefile = '/loss_train.csv'
        else:
            namefile = '/loss_test.csv'

        with open (opt.outf+namefile,'a') as file:
            s = '{}, {},{:.15f}\n'.format(
                epoch,batch_idx,loss.data.item())
            # print (s)
            file.write(s)

        # break
        if not opt.nbupdates is None and nb_update_network > int(opt.nbupdates):
            torch.save(net.state_dict(), '{}/net_{}.pth'.format(opt.outf, opt.namefile))
            break

        if train:
            if pbar is not None:
                pbar.set_description("Training loss: %0.4f (%d/%d)" % (loss.data.item(), batch_idx, len(loader)))
        else:
            if pbar is not None:
                pbar.set_description("Testing loss: %0.4f (%d/%d)" % (loss.data.item(), batch_idx, len(loader)))
        if batch_idx % 10 == 0:
          try:
            torch.save(net.state_dict(), f'{opt.outf}/net.pth')
            print("Guardando model")
          except Exception as e:
            print(f"Error saving model at epoch: {e}")

    if train:
        optimizer.zero_grad()


scaler = amp.GradScaler()
torch.backends.cudnn.benchmark = True
pbar = tqdm(range(1, opt.epochs + 1))

for epoch in pbar:
    # Run training and testing as before
    if trainingdata is not None:
        _runnetwork(epoch, trainingdata, scaler=scaler, pbar=pbar)

    if opt.datatest != "":
        _runnetwork(epoch, testingdata, train=False, pbar=pbar)
        if opt.data == "":
            break  # Exit if only testing
    try:
        torch.save(net.state_dict(), f'{opt.outf}/net_{opt.namefile}_{epoch}.pth')
    except Exception as e:
        print(f"Error saving model at epoch {epoch}: {e}")

    # Stop training if nb_update_network exceeds the limit
    if opt.nbupdates is not None and nb_update_network > int(opt.nbupdates):
        break

print("end:", datetime.datetime.now().time())

training data: 473 batches
testing data: 16 batches
Training network pretrained on imagenet.


Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:05<00:00, 113MB/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardando model
Guardand