In [None]:
import os
import numpy as np 
import pandas as pd 
from datetime import datetime
import time
import random
from tqdm.auto import tqdm


#Torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
import torchvision.transforms as T

#sklearn
from sklearn.model_selection import StratifiedKFold
from skimage import io

################# DETR FUNCTIONS FOR LOSS ########################
import sys
sys.path.append('./detr_custom/')

from models.matcher import HungarianMatcher
from models.detr import SetCriterion
#################################################################

import matplotlib.pyplot as plt

#Glob
from glob import glob

from typing import Iterable, Sequence, List, Tuple, Dict, Optional, Any
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image
from generators import BlenderStandardDataset, Blender3DDataset
import itertools
import seaborn as sns
import multiprocessing as mp
from utils import debugs, debug, debugt

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms and may pick a different
    # one on each run, which defeats the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

In [None]:
# Notebook-wide configuration.
# NOTE(review): TORCH_CACHE_DIR is not read by any cell visible in this notebook — confirm it is still needed.
TORCH_CACHE_DIR = 'torch_cache'
# First argument passed to Blender3DDataset.
DATASET_DIR = '/mnt/blendervol/3d_data'
# Second argument passed to Blender3DDataset; also the table queried in class_stats.
SQL_TABLE = 'bboxes_std'
# Third argument passed to Blender3DDataset; also the width of the image-number
# ranges handed to the multiprocessing workers.
BATCH_SIZE = 32

In [None]:
# Reload the local `generators` module so that edits made to it since the
# first import are picked up without restarting the kernel.
from importlib import reload
import generators
reload(generators)
# Names bound with `from generators import ...` before the reload still point
# at the old class objects, so re-import every class this notebook uses,
# including Blender3DDataset, which is the one instantiated below.
from generators import BlenderStandardDataset, Blender3DDataset

datagen = Blender3DDataset(DATASET_DIR, SQL_TABLE, BATCH_SIZE, shuffle=False)
# Fetch the first batch as a smoke test of the dataset.
X, y = datagen[0]

In [8]:
# Length of the dataset's `indices` attribute; the same value is later written
# out as `n_datapoints` by save_stats.
len(datagen.indices)

60618

# Image statistics

In [5]:
def calc_mean(data: Iterable[Tuple[np.ndarray, Any]], disable_tqdm=True):
    """Accumulate a running mean over all batches yielded by ``data``.

    For every ``(batch, labels)`` pair, the first entry (``x[0]``) of each
    batch element is stacked into one array and averaged over axes (0, 1, 2).
    Batch means are merged into the running mean weighted by batch length.

    Args:
        data: Sized iterable of ``(batch, labels)`` pairs; labels are ignored.
            Presumably ``x[0]`` is an H x W x 3 image, since the accumulator
            has three entries — TODO confirm against the dataset class.
        disable_tqdm: When true, the progress bar is suppressed.

    Returns:
        Array of length 3 holding the accumulated mean.
    """
    running_mean = np.zeros(3)
    seen = 0
    progress = tqdm(data, total=len(data), disable=disable_tqdm)
    for batch, _labels in progress:
        batch_len = len(batch)
        stacked = np.array([sample[0] for sample in batch])
        batch_mean = stacked.mean(axis=(0, 1, 2))
        # Weighted merge: previous mean counts `seen` items, this batch `batch_len`.
        running_mean = np.average(
            (running_mean, batch_mean),
            axis=0,
            weights=(seen, batch_len),
        )
        seen += batch_len

    return running_mean

def calc_var(data: Iterable[Tuple[np.ndarray, Any]], mean: np.ndarray, disable_tqdm=True):
    """Accumulate a running variance around a previously computed ``mean``.

    For every ``(batch, labels)`` pair, the first entry (``x[0]``) of each
    batch element is stacked, the squared deviation from ``mean`` is averaged
    over axes (0, 1, 2), and the result is merged into the running variance
    weighted by batch length.

    Args:
        data: Sized iterable of ``(batch, labels)`` pairs; labels are ignored.
        mean: Mean to measure deviations from (e.g. the output of calc_mean).
        disable_tqdm: When true, the progress bar is suppressed.

    Returns:
        Array shaped like ``mean`` holding the accumulated variance.
    """
    running_var = np.zeros_like(mean)
    seen = 0
    progress = tqdm(data, total=len(data), disable=disable_tqdm)
    for batch, _labels in progress:
        batch_len = len(batch)
        stacked = np.array([sample[0] for sample in batch])
        squared_dev = (stacked - mean)**2
        batch_var = squared_dev.mean(axis=(0, 1, 2))
        # Weighted merge: previous estimate counts `seen` items, this batch `batch_len`.
        running_var = np.average(
            (running_var, batch_var),
            axis=0,
            weights=(seen, batch_len),
        )
        seen += batch_len
    return running_var
    
def calc_stats(data: Iterable):
    """Compute mean and variance of ``data`` in two single-process passes.

    The first pass computes the mean with calc_mean, the second computes the
    variance around that mean with calc_var. Both passes show a progress bar,
    and mean, variance and standard deviation are printed afterwards.

    Returns:
        Tuple ``(mean, var)``.
    """
    print('Calculating mean:')
    mean = calc_mean(data, disable_tqdm=False)

    print('Calculating variance')
    var = calc_var(data, mean, disable_tqdm=False)

    # One print call, one line per statistic.
    print(
        f'Mean: {mean}',
        f'Variance: {var}',
        f'Std: {np.sqrt(var)}',
        sep='\n',
    )

    return mean, var

# m, v = calc_stats(datagen)

In [6]:
def _mp_calc_mean(rng: Tuple[int, int]):
    """Pool worker: mean over the images selected by one image-number range.

    Builds its own Blender3DDataset restricted to ``range(*rng)`` and runs
    calc_mean on it with the progress bar disabled.

    The dataset location, table and batch size come from the notebook-level
    configuration (DATASET_DIR, SQL_TABLE, BATCH_SIZE) rather than local
    copies, so the worker stays consistent with the ranges built by
    calc_mean_mp, which also uses the notebook-level BATCH_SIZE.

    Args:
        rng: ``(start, stop)`` pair expanded into ``range(start, stop)`` and
            passed as ``imgnrs``.

    Returns:
        The array returned by calc_mean for that sub-dataset.
    """
    datagen = Blender3DDataset(
        DATASET_DIR, SQL_TABLE, BATCH_SIZE, shuffle=False, imgnrs=range(*rng)
    )
    return calc_mean(datagen)


def _mp_calc_var(arg: Tuple[np.ndarray, Tuple[int, int]]):
    """Pool worker: variance over the images selected by one image-number range.

    Builds its own Blender3DDataset restricted to ``range(*rng)`` and runs
    calc_var on it with the progress bar disabled.

    The dataset location, table and batch size come from the notebook-level
    configuration (DATASET_DIR, SQL_TABLE, BATCH_SIZE) rather than local
    copies, so the worker stays consistent with the ranges built by
    calc_var_mp, which also uses the notebook-level BATCH_SIZE.

    Args:
        arg: ``(mean, (start, stop))``. Both values travel in one tuple
            because the pool's imap passes a single argument per task.

    Returns:
        The array returned by calc_var for that sub-dataset.
    """
    mean, rng = arg
    datagen = Blender3DDataset(
        DATASET_DIR, SQL_TABLE, BATCH_SIZE, shuffle=False, imgnrs=range(*rng)
    )
    return calc_var(datagen, mean)


def calc_mean_mp(data: Iterable, processes: int = 20, chunksize: int = 20):
    """Compute the dataset mean with a pool of worker processes.

    One task is created per batch index of ``data``; task ``i`` covers the
    image numbers ``[BATCH_SIZE * i, BATCH_SIZE * (i + 1))``. Only
    ``len(data)`` is read from ``data`` — each worker builds its own dataset.

    Args:
        data: Sized object whose length gives the number of ranges.
        processes: Number of worker processes (previously fixed at 20).
        chunksize: Tasks handed to a worker at a time (previously fixed at 20).

    Returns:
        Unweighted average of the per-range means.
        NOTE(review): this equals the true mean only if every range yields the
        same number of images — confirm image numbers are contiguous.
    """
    n_ranges = len(data)
    ranges = ((BATCH_SIZE*i, BATCH_SIZE*(i+1)) for i in range(n_ranges))
    with mp.Pool(processes) as pool:
        pgen = pool.imap_unordered(_mp_calc_mean, ranges, chunksize=chunksize)
        # Consume inside the `with` block: leaving it terminates the pool.
        means = tuple(tqdm(pgen, total=n_ranges))
    return np.mean(means, axis=0)
    
    
def calc_var_mp(data: Iterable, mean, processes: int = 20, chunksize: int = 20):
    """Compute the dataset variance around ``mean`` with a pool of workers.

    One task is created per batch index of ``data``; task ``i`` carries
    ``mean`` together with the image-number range
    ``[BATCH_SIZE * i, BATCH_SIZE * (i + 1))``. Only ``len(data)`` is read
    from ``data`` — each worker builds its own dataset.

    Args:
        data: Sized object whose length gives the number of ranges.
        mean: Mean to measure deviations from (e.g. from calc_mean_mp).
        processes: Number of worker processes (previously fixed at 20).
        chunksize: Tasks handed to a worker at a time (previously fixed at 20).

    Returns:
        Unweighted average of the per-range variances.
        NOTE(review): this equals the true variance only if every range yields
        the same number of images — confirm image numbers are contiguous.
    """
    n_ranges = len(data)
    args = ((mean, (BATCH_SIZE*i, BATCH_SIZE*(i+1))) for i in range(n_ranges))
    with mp.Pool(processes) as pool:
        pgen = pool.imap_unordered(_mp_calc_var, args, chunksize=chunksize)
        # Consume inside the `with` block: leaving it terminates the pool.
        vars_ = tuple(tqdm(pgen, total=n_ranges))
    return np.mean(vars_, axis=0)
    
    
def calc_stats_mp(data: Iterable):
    """Multiprocessing counterpart of calc_stats.

    Computes the mean with calc_mean_mp, then the variance around that mean
    with calc_var_mp, and prints mean, variance and standard deviation.

    Returns:
        Tuple ``(mean, var)``.
    """
    print('Calculating mean:')
    mean = calc_mean_mp(data)

    print('Calculating variance')
    var = calc_var_mp(data, mean)

    # One print call, one line per statistic.
    print(
        f'Mean: {mean}',
        f'Variance: {var}',
        f'Std: {np.sqrt(var)}',
        sep='\n',
    )

    return mean, var

# Run the multiprocess statistics. Only len(datagen) is used by the *_mp
# helpers; each worker process constructs its own dataset.
m_mp, v_mp = calc_stats_mp(datagen)

Calculating mean:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1894.0), HTML(value='')))


Calculating variance


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1894.0), HTML(value='')))


Mean: [0.65629897 0.76457309 0.43896555]
Variance: [0.00418913 0.00505205 0.00331689]
Std: [0.06472352 0.07107777 0.05759248]


In [11]:
def save_stats(n, mean, var, path="dataset_stats.txt"):
    """Write the dataset statistics to a small text report.

    The file is overwritten if it already exists. It contains four lines:
    ``mean``, ``var``, ``std`` (square root of ``var``) and ``n_datapoints``.

    Args:
        n: Number of datapoints the statistics were computed from.
        mean: Mean value(s) to record.
        var: Variance value(s) to record; the std line is derived from it.
        path: Destination file. Defaults to ``dataset_stats.txt`` in the
            current working directory, as before.
    """
    report = (
        f"mean: {mean}\n"
        f"var: {var}\n"
        f"std: {np.sqrt(var)}\n"
        f"n_datapoints: {n}\n"
    )
    # Plain write mode: the file is never read back, so "w+" was unnecessary.
    with open(path, "w", encoding="utf-8") as f:
        f.write(report)
        
# Persist the multiprocess results; n is the length of the dataset's `indices`.
save_stats(len(datagen.indices), m_mp, v_mp)

# Label statistics

In [None]:
# Evaluates to 2976, the imgnr cutoff hard-coded in the class_stats query.
# NOTE(review): presumably 93 batches of BATCH_SIZE (32) images — confirm.
93*32

In [None]:
def class_stats(data: Iterable, max_imgnr: int = 2976, table: str = "bboxes_std"):
    """Count rows per class in the bounding-box table.

    Args:
        data: Object exposing a database connection as ``data.con``.
        max_imgnr: Only rows with ``imgnr <= max_imgnr`` are counted. The
            default, 2976 (= 93 * 32), is the cutoff that was previously
            hard-coded in the query.
        table: Table to query; defaults to the previously hard-coded
            ``bboxes_std``.

    Returns:
        DataFrame with one row per ``class_`` value and its row count.

    Raises:
        ValueError: If ``table`` is not a plain (optionally dotted) identifier.
    """
    # Table names cannot be bound as query parameters, so validate the name
    # before interpolating it into the SQL text.
    if not all(part.isidentifier() for part in table.split(".")):
        raise ValueError(f"invalid table name: {table!r}")
    # int() guarantees that only a number is ever interpolated.
    query = f"""
        SELECT class_, COUNT(*) FROM {table}
        WHERE imgnr <= {int(max_imgnr)}
        GROUP BY class_
    """
    return pd.read_sql(query, data.con)

# Despite its name, `histdict` holds the object returned by pd.read_sql
# (a DataFrame), not a dict. The bare name on the last line displays it.
histdict = class_stats(datagen)
histdict

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not
# continue into the scratch/verification cells that follow — confirm.
raise ValueError

In [None]:
# Synthetic batches with known statistics, used to check the running
# mean/variance code by hand.
# NOTE: binding `A` here shadows the `albumentations as A` alias from the
# notebook's import cell.
def _constant_batch(k):
    """Return a (10, 16, 16, 3) batch whose channels are constant k, 2k and 3k."""
    channels = np.array([np.full((16, 16), k * c) for c in (1, 2, 3)])
    return channels[None, ...].repeat(10, 0).transpose((0, 2, 3, 1))


A, B, C, D, E = (_constant_batch(k) for k in range(1, 6))

# Per-channel mean over all five batches.
np.mean([A, B, C, D, E], (0, 1, 2, 3))

In [None]:
# Reference std of five equally sized constant vectors (values 2, 4, 6, 8, 10).
np.std([np.full(16*16*10, value) for value in (2, 4, 6, 8, 10)])

In [None]:
# Per-channel std over all five synthetic batches (everything but the last axis is reduced).
np.std(np.stack([A, B, C, D, E]), axis=(0, 1, 2, 3))

In [None]:
# Super memory efficient implementation :^)
mean = np.zeros(3)
denominator = 0
for thing in [A, B, C]:
    mean = np.average((mean, thing.mean((0,1,2))), axis=0, weights=(denominator, len(thing)))
    denominator += len(thing)

print('Mean:')
print(mean)

In [None]:
# Running per-channel variance around `mean` from the preceding cell,
# merged batch by batch with the same weighting scheme as the running mean.
var, denominator_var = np.zeros_like(mean), 0

for thing in (A, B, C):
    deviation = thing - mean
    m_ = (deviation**2).mean(axis=(0, 1, 2))

    # Trace: accumulator before the merge, this batch's value, blank separator.
    print(var, m_, '', sep='\n')

    var = np.average((var, m_), axis=0, weights=(denominator_var, len(thing)))
    denominator_var += len(thing)

print('Var: ', var, sep='\n')

In [None]:
# Direct cross-check: average the per-batch mean squared deviations of A, B, C.
np.mean([((batch - mean)**2).mean((0, 1, 2)) for batch in (A, B, C)], axis=0)

In [None]:
class Dummygen:
    """Minimal stand-in dataset for exercising calc_mean / calc_var.

    Indexing returns ``(batch, None)``. Each sample in a batch is wrapped in a
    1-tuple because calc_mean and calc_var read ``x[0]`` from every batch
    element. Returning the bare image array made ``x[0]`` its first row, so
    the statistics were reduced over the wrong axes.

    Args:
        batches: Optional sequence of image batches (arrays indexed by sample
            first). Defaults to the synthetic batches ``[A, B, C, D, E]``
            defined earlier in the notebook.
    """

    def __init__(self, batches=None):
        # Resolve the default lazily so the class can be defined, and used with
        # explicit batches, before A..E exist.
        self.batches = list(batches) if batches is not None else [A, B, C, D, E]

    def __len__(self):
        return len(self.batches)

    def __getitem__(self, index):
        # An out-of-range index raises IndexError, which is what ends plain
        # `for` iteration over an object that only defines __getitem__.
        batch = self.batches[index]
        return ([(image,) for image in batch], None)
        
# Run the single-process statistics on the synthetic dataset and print the
# resulting standard deviation for comparison with the np.std cells above.
m, v = calc_stats(Dummygen())
print(np.sqrt(v))