In [1]:
def is_dist_avail_and_initialized():
    """Report whether ``torch.distributed`` is usable in this process.

    Returns:
        bool: True only when ``dist.is_available()`` and
        ``dist.is_initialized()`` are both true; False otherwise.
    """
    # Short-circuit: is_initialized() is only queried when the package is available.
    return dist.is_available() and dist.is_initialized()


def get_world_size():
    """Return the number of processes taking part in the job.

    Returns:
        int: ``dist.get_world_size()`` when distributed is available and
        initialized, otherwise 1 (single-process fallback).
    """
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1


def get_rank():
    """Return the rank of the current process.

    Returns:
        int: ``dist.get_rank()`` when distributed is available and
        initialized, otherwise 0 (single-process fallback).
    """
    return dist.get_rank() if is_dist_avail_and_initialized() else 0


def is_main_process():
    """Tell whether the current process is the main one (rank 0).

    Returns:
        bool: True when ``get_rank()`` is 0.
    """
    current_rank = get_rank()
    return current_rank == 0


def save_on_master(*args, **kwargs):
    """Call ``torch.save`` on the main process only.

    All positional and keyword arguments are forwarded unchanged to
    ``torch.save``. On any other process the call does nothing.
    """
    if not is_main_process():
        return
    torch.save(*args, **kwargs)


def setup_for_distributed(is_master):
    """
    Replace the built-in ``print`` so that non-master processes stay silent.

    After this call, ``print(...)`` produces output only when ``is_master`` is
    true or when the caller passes ``force=True``. The ``force`` keyword is
    consumed by the wrapper and never reaches the real ``print``.

    Args:
        is_master: truthy when the current process is allowed to print.
    """
    import builtins as __builtin__
    original_print = __builtin__.print

    def print(*args, **kwargs):
        # Strip 'force' first so the underlying print never sees an unknown keyword.
        forced = kwargs.pop('force', False)
        if not (is_master or forced):
            return
        original_print(*args, **kwargs)

    # Process-wide side effect: every later print() call goes through the wrapper.
    __builtin__.print = print
    
def init_distributed():
    """Initialize the NCCL process group from launcher-provided environment variables.

    Reads ``RANK``, ``WORLD_SIZE`` and ``LOCAL_RANK`` from ``os.environ`` (a
    missing variable raises ``KeyError``), initializes the process group,
    selects the CUDA device for this process, waits on a barrier and finally
    silences ``print`` on every process except rank 0.

    Returns:
        int: the world size read from ``WORLD_SIZE``.
    """
    # Only works when started via torch.distributed.launch / torchrun,
    # which export the variables read below.
    env = os.environ
    rank, world_size, local_rank = (
        int(env[key]) for key in ("RANK", "WORLD_SIZE", "LOCAL_RANK")
    )

    # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs.
    dist.init_process_group(
        backend="nccl",
        init_method="env://",  # default: rendezvous details come from the environment
        rank=rank,
        world_size=world_size,
    )

    # This makes all .cuda() calls target this process's own GPU.
    torch.cuda.set_device(local_rank)
    # Wait until every process has reached this point before moving on.
    dist.barrier()
    setup_for_distributed(rank == 0)

    return world_size

def sync_across_gpus(tensor, local_rank, world_size):
    """Gather ``tensor`` from every process and return the combined result.

    * A 0-dim tensor is gathered from each rank and stacked into a 1-D tensor
      of length ``world_size``.
    * Any other tensor is concatenated along dim 0. Ranks may hold different
      numbers of rows: sizes are exchanged first, each rank zero-pads to the
      largest size for the collective, and the padding is stripped afterwards.

    Args:
        tensor: the local tensor to share; its trailing dimensions
            (``shape[1:]``) are used to size the receive buffers.
        local_rank: unused; kept so existing callers keep working.
        world_size: number of processes taking part in the gather.

    Returns:
        torch.Tensor: the stacked (0-dim input) or concatenated result, in
        rank order.
    """
    dist.barrier()

    if tensor.dim() == 0:
        scalar_slots = [torch.zeros_like(tensor) for _ in range(world_size)]
        dist.all_gather(scalar_slots, tensor)
        return torch.stack(scalar_slots)

    # Exchange the per-rank batch sizes. Every receive slot must be its own
    # tensor: building the list with ``[t] * world_size`` would alias a single
    # tensor, so all entries would end up holding the same value.
    local_size = tensor.size(0)
    size_msg = torch.tensor([local_size], dtype=torch.int64, device=tensor.device)
    size_slots = [torch.zeros_like(size_msg) for _ in range(world_size)]
    dist.all_gather(size_slots, size_msg)
    batch_sizes = [int(slot.item()) for slot in size_slots]
    max_batch_size = max(batch_sizes)

    trailing_shape = tuple(tensor.shape[1:])
    if local_size < max_batch_size:
        # all_gather needs identically shaped tensors on every rank, so pad with zeros.
        padding = tensor.new_zeros((max_batch_size - local_size, *trailing_shape))
        tensor = torch.cat([tensor, padding], dim=0)

    dist.barrier()

    gathered = [
        tensor.new_zeros((max_batch_size, *trailing_shape)) for _ in range(world_size)
    ]
    dist.all_gather(gathered, tensor)

    # Drop the zero padding each rank added before concatenating in rank order.
    trimmed = [chunk[:size] for chunk, size in zip(gathered, batch_sizes)]
    return torch.cat(trimmed)
    
def seed_everything(seed=1234):
    """Seed every random number generator used here and request deterministic cuDNN.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: integer seed applied to every generator (default 1234).
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at startup and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different algorithms from run to run, which
    # defeats the deterministic flag above; turn it off.
    torch.backends.cudnn.benchmark = False

    
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from glob import glob
import copy
import time
import math
import command
import random

#os.environ['CUDA_VISIBLE_DEVICES'] = '1,2,3'

import cv2
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.figsize'] = 12, 8

from skimage import img_as_ubyte
import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.model_selection import *
from sklearn.metrics import *

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import segmentation_models_pytorch as smp
import timm

from transformers import get_cosine_schedule_with_warmup

import torch.distributed as dist

In [None]:
# Open five sample images from the Crackseg9k image directory.
# Each image gets its own numbered name (img1..img5); an assignment without a
# target name is a syntax error and would keep this cell from running.
img1 = Image.open('/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1281_361.png')
img2 = Image.open('/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1281_721.png')
img3 = Image.open('/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1921_1.png')
img4 = Image.open('/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1921_361.png')
img5 = Image.open('/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_641_361.png')

In [4]:
# Display every path matching the Crackseg9k image directory pattern, sorted
# lexicographically. NOTE(review): this renders the full list as cell output;
# consider slicing (e.g. [:10]) if the directory is large.
sorted(glob('/mnt/md0/dacl10k/external/Crackseg9k/Images/*'))

['/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1281_361.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1281_721.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1921_1.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_1921_361.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080850_641_361.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080933_361_641.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080933_721_1.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_080933_721_641.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_081011_1281_721.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_081011_1921_721.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_081011_1_721.png',
 '/mnt/md0/dacl10k/external/Crackseg9k/Images/CRACK500_20160222_0