In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from collections import defaultdict
import pickle

from color_regions import *
from network import *
from visualizations import *
from utils import *
from hooks import *
from config_objects import *
from training import *

# set up autoreloading of shared code
%load_ext autoreload
# mode 1: only modules registered through %aimport are reloaded before a cell runs
%autoreload 1
%aimport color_regions,network,visualizations,utils,hooks,config_objects,training
# with no arguments, %aimport only lists which modules are currently marked for reloading
%aimport

# let cuDNN auto-tune its convolution algorithms
torch.backends.cudnn.benchmark = True
# first CUDA device when one is available, CPU otherwise; passed to helpers via device=device below
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
import wandb
import pickle
import dataclasses


# Replay every logged training run of the random search into Weights & Biases.
# SECURITY: pickle.load can execute arbitrary code; only open logs written by our own runs.
with open("./full_random_noisy/log.pkl", "rb") as p:
    data = pickle.load(p)

# Named architectures used in the search. Built outside the `with` block because
# nothing here needs the file handle to stay open.
layer_sizes = dict(medium_size=[[16, 3, 1], [32, 3, 1]],
                   tiny_size=[[2, 3, 4], [6, 3, 4]],
                   large_size=[[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2]],
                   huge_size=[[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2], [64, 3, 1], [128, 3, 1]])

# Reverse lookup: str(layer spec) -> architecture name (lists are unhashable, hence str()).
inv_layer_sizes = {str(v): k for k, v in layer_sizes.items()}

# Keys of data["train_results"] are config dataclass instances; the first three
# entries of each value are zipped into per-step (va_loss, va_acc, tr_loss) triples.
for k, v in data["train_results"].items():
    conf = dataclasses.asdict(k)
    # Log the readable architecture name instead of the raw nested list.
    conf["layer_sizes"] = inv_layer_sizes[str(conf["layer_sizes"])]
    run = wandb.init(project='project-apfij50gijdpoaij', config=conf)
    try:
        for va_loss, va_acc, tr_loss in zip(*v[:3]):
            run.log({"va_loss": va_loss, "va_acc": va_acc, "tr_loss": tr_loss})
    finally:
        # Close the run explicitly so each config becomes its own finished run,
        # even if logging fails part-way through.
        run.finish()

In [None]:
import wandb
import pickle
import dataclasses


# Reload the search log and pick, for every architecture size, the config with
# the highest value at index 1 of its test_results entry.
with open("./full_random_noisy/log.pkl", "rb") as log_file:
    data = pickle.load(log_file)

layer_sizes = {
    "medium_size": [[16, 3, 1], [32, 3, 1]],
    "tiny_size": [[2, 3, 4], [6, 3, 4]],
    "large_size": [[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2]],
    "huge_size": [[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2], [64, 3, 1], [128, 3, 1]],
}

# str(layer spec) -> architecture name
inv_layer_sizes = {str(spec): name for name, spec in layer_sizes.items()}

best_run = dict.fromkeys(layer_sizes)
best_acc = dict.fromkeys(layer_sizes, -float("inf"))
for config in data["train_results"]:
    size_type = inv_layer_sizes[str(dataclasses.asdict(config)["layer_sizes"])]
    test_score = data["test_results"][config][1]
    # Strict comparison: on ties the first config encountered is kept.
    if test_score > best_acc[size_type]:
        best_acc[size_type] = test_score
        best_run[size_type] = config

In [None]:
# Alias for the log dict loaded above (same object, not a copy).
# NOTE(review): never read again in the visible cells — presumably a marker that these runs are already on W&B.
already_uploaded = data

In [None]:
# Best test_results[...][1] value per architecture size, as selected in the cell above.
# The commented dict below is the output recorded from an earlier execution.
best_acc
# {'medium_size': 0.97576,
#  'tiny_size': 0.84508,
#  'large_size': 0.96756,
#  'huge_size': 0.97484}

In [None]:
# Winning config per architecture size. The first four comments are the author's
# triage notes (names in parentheses look like W&B run names); the dict is recorded output.
best_run
# huge is worth training more (radiant-flower 23)
# large maybe worth (quiet-grass 58)
# medium not worth (rare-bee 3)
# tiny worth training (helpful-cloud 43)
# {'medium_size': ExperimentConfig(layer_sizes=[[16, 3, 1], [32, 3, 1]], fc_layers=[], groups=1, global_avg_pooling=True, learn_rate=0.01, weight_decay=1e-07, gain=0.05, epochs=50),
#  'tiny_size': ExperimentConfig(layer_sizes=[[2, 3, 4], [6, 3, 4]], fc_layers=[], groups=1, global_avg_pooling=False, learn_rate=0.001, weight_decay=3.7926901907322535e-06, gain=0.2, epochs=30),
#  'large_size': ExperimentConfig(layer_sizes=[[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2]], fc_layers=[], groups=1, global_avg_pooling=False, learn_rate=0.001, weight_decay=0.0026366508987303553, gain=0.05, epochs=30),
#  'huge_size': ExperimentConfig(layer_sizes=[[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2], [64, 3, 1], [128, 3, 1]], fc_layers=[], groups=1, global_avg_pooling=True, learn_rate=0.01, weight_decay=2.06913808111479e-07, gain=0.1, epochs=30)}

In [None]:
# Three non-overlapping index ranges handed to the generators below, one per split.
# NOTE(review): presumably (start, stop) sample indices/seeds — confirm in ColorDatasetGenerator.
train_indices = (0, 250_000) # size of training set
valid_indices = (1_250_000, 1_275_000)
test_indices = (3_260_000, 3_560_000)

# 0, 30, ..., 240; the "hard" task plot further down uses the same values as x-ticks
critical_color_values = list(range(0,241,30))

# One shared config for all three splits.
dset_config = ColorDatasetConfig(task_difficulty="hard",
                                 noise_size=(1,9),
                                 num_classes=3,
                                 num_objects=0,  # => permuted
                                 radius=(1/8., 1/7.),
                                 device=device,
                                 batch_size=128)

# copies the config each time
train_set = ColorDatasetGenerator(train_indices, dset_config)
valid_set = ColorDatasetGenerator(valid_indices, dset_config)
test_set = ColorDatasetGenerator(test_indices, dset_config)
# train_set.cfg.infinite = True

In [None]:
# The "hard" labelling rule (top panel) and the "medium" one (bottom panel),
# both plotted as class index against image intensity.
critical_color_values = list(range(0, 241, 30))
color_probe = np.linspace(0, 255, 255)

color_class = [hard_color_classifier(x) for x in color_probe]
med_color_class = [medium_color_classifier(x) for x in color_probe]

fig, (hard_ax, medium_ax) = plt.subplots(2, 1, figsize=(6, 6))

hard_ax.plot(color_probe, color_class)
hard_ax.set_xticks(critical_color_values)
hard_ax.set_yticks([0, 1, 2])
hard_ax.set_ylabel("Class")

medium_ax.plot(color_probe, med_color_class)
medium_ax.set_xlabel("Image Intensity")
medium_ax.set_xticks([100, 150, 200])
medium_ax.set_yticks([0, 1, 2])
medium_ax.set_ylabel("Class")

In [None]:
# Draw a num_y x num_x grid of validation images whose target colour is mid-range.
num_x = 4
num_y = 4
plt.figure(figsize=(3*num_x, 3*num_y))
for i in range(num_x*num_y):
    # Rejection-sample: keep generating until element 2 of the sample (the
    # target colour) lies strictly between 80 and 150.
    while True:
        img_gen = valid_set.generate_one()
        if 80 < img_gen[2] < 150:
            break
    plt.subplot(num_y, num_x, i+1)
    imshow_centered_colorbar(img_gen[0], cmap="gray", colorbar=False)

In [None]:
# Search space for the random hyperparameter search. Sampling probabilities are not
# embedded in the entries; they are passed separately through `prob_dists` below.
hyperparameters=dict(learn_rate=[1e-4, 1e-3, 1e-2],
                     weight_decay=10**np.linspace(-7, -1, 20),
                     global_avg_pooling=[True, False],
                     layer_sizes=dict(medium_size=[[16, 3, 1], [32, 3, 1]],
                                      tiny_size=[[2, 3, 4], [6, 3, 4]],
                                      large_size=[[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2]],
                                      huge_size=[[16, 3, 1], [32, 3, 2], [32, 3, 2], [64, 3, 2], [64, 3, 1], [128, 3, 1]],
                                     ),
                    gain=[0, 0.05, 0.1, 0.2, 0.3])
# NOTE(review): presumably one weight per layer_sizes entry in insertion order
# (medium, tiny, large, huge) — confirm against run_experiments.
prob_dists = dict(layer_sizes=[0.4, 0.4, 0.1, 0.1])
# Long-running: 500 randomly drawn configs; "./full_random_noisy" is the directory
# the log-analysis cells at the top read log.pkl from.
run_experiments(train_set, valid_set, "./full_random_noisy", hyperparameters, 
                search_type="random", prob_dists=prob_dists, num_rand=500)

In [None]:
# Arrange grid-search results into a (learn_rate x weight_decay) array.
# The array shape is derived from the grids themselves: the previous hard-coded
# (2, 12) shape raised IndexError on the third learning rate.
# NOTE(review): `results` is not defined in any cell above; it must map
# (learn_rate, weight_decay) tuples to a scalar before this cell can run.
learn_rates = [1e-4, 1e-3, 1e-2]
weight_decays = 10**np.linspace(-7, -3, 12)

acc_arr = np.zeros((len(learn_rates), len(weight_decays)))
loss_arr = np.zeros_like(acc_arr)  # allocated for symmetry; never filled in this cell

for i, learn_rate in enumerate(learn_rates):
    for j, weight_decay in enumerate(weight_decays):
        acc_arr[i, j] = results[(learn_rate, weight_decay, )]
            

In [None]:
# best model from the sample (note that none actually used gain or weight decay, due to a mistake)
# NOTE(review): the layer spec below is the one the search cells call `medium_size`,
# while the variable name and checkpoint path say "large" — confirm which is intended.
# NOTE(review): [128, 128, 1] is presumably the input image shape — confirm in ResNet.
large_noise_net = ResNet([[16, 3, 1], [32, 3, 1]], valid_set.num_classes, [128, 128, 1], 
                   "decay_noise/large_size_0.2_0.0000351.dict", global_avg_pooling=True, 
                   fc_layers=[]).to(device)
print(large_noise_net.num_params())
# restore weights; the path is presumably the checkpoint string passed to the constructor
large_noise_net.load_model_state_dict()

In [None]:
# Tiny network (same layer spec as `tiny_size` in the search cells) with global average
# pooling and no hidden FC layers; this is the model the interpretability cells wrap.
noise_net = ResNet([[2, 3, 4], [6, 3, 4]], valid_set.num_classes, [128, 128, 1], 
                   "permuted_hard_tiny.dict", global_avg_pooling=True,
                   fc_layers=[]).to(device)
print(noise_net.num_params())
# restore weights; the path is presumably the checkpoint string passed to the constructor
noise_net.load_model_state_dict()

In [None]:
# Test-set evaluation of the tiny network.
# NOTE(review): `test_loader` is not defined in any cell above (only `test_set` is).
evaluate(noise_net, nn.CrossEntropyLoss(), test_loader, device=device)
# tiny, 84.4% accuracy

In [None]:
# Test-set evaluation of the larger network.
# NOTE(review): `test_loader` is not defined in any cell above (only `test_set` is).
evaluate(large_noise_net, nn.CrossEntropyLoss(), test_loader, device=device)
# large, 97.6% accuracy

In [None]:
# Logits of the tiny network for uniform images of every intensity 0..254.
# The model queried is `noise_net`, the one put in eval mode on the first line;
# the previous version called `permuted_net`, which no cell above defines.
noise_net.eval()
avg_img = np.ones((valid_set.size, valid_set.size))
tensor_avg_img = tensorize(avg_img, device=device)
responses = []
# Inference only: no autograd graph is needed for 255 forward passes.
with torch.no_grad():
    for color in np.arange(255):
        tensor_avg_img[...] = color  # overwrite the buffer in place with a flat image
        responses.append(noise_net(tensor_avg_img).cpu().numpy())
# One row per probed intensity after dropping singleton dimensions.
responses = np.asarray(responses).squeeze()

In [None]:
# One curve per logit (three classes) against the probed intensity; the x-axis
# matches the np.arange(255) sweep that filled `responses`.
for i in range(3):
    plt.plot(np.arange(255), responses[:,i], label=f"logit {i}")
plt.legend()
# overlay from the project helper, given the y-extent of the curves
plot_color_classes(valid_set, (responses.min(), responses.max()))

In [None]:
def averaging_test(dataset, sample, edge_width=10, net=None):
    """Check how well the network's decisions are explained by simple image statistics.

    For each generated sample the network classifies five synthetic inputs plus
    the real image, and each decision is scored against the true label:

    * color set      - uniform image filled with the true target colour
    * naive          - uniform image filled with the mean of the whole image
    * calibrated     - uniform image filled with a bias-corrected foreground mean
    * base           - the unmodified image
    * background set - the image with every zero pixel set to (color + 30) % 255
    * edge set       - black image with an `edge_width`-pixel border of the target colour

    Args:
        dataset: generator exposing `radius`, `size` and `generate_one()`.
        sample: number of images to draw; images without foreground are skipped.
        edge_width: border thickness in pixels for the edge-set probe.
        net: model to probe. Defaults to the notebook-level `permuted_net`, which
            this function was originally hard-wired to.

    Returns:
        dict mapping probe name to accuracy in [0, 1] (nan if no sample was scored).
        The same numbers are printed.
    """
    if net is None:
        net = permuted_net  # resolved at call time from the notebook namespace

    # Mean of pi*r^2 for r uniform on [radius[0], radius[1]].
    avg_area = np.pi/3*(dataset.radius[1]**2+dataset.radius[0]**2+dataset.radius[0]*dataset.radius[1])
    pct_area = avg_area / (dataset.size**2)
    print(f"Targets are on average {pct_area:.1%} of the image")

    def classify(tensor_img):
        # Plain int so the counters stay Python numbers instead of device tensors.
        return int(net(tensor_img).argmax())

    total_answered = 0
    right_calibrated = 0
    right_naive = 0
    right_color_set = 0
    right_base = 0
    right_edge_set = 0
    right_background_set = 0

    avg_img = np.ones((dataset.size, dataset.size))
    tensor_avg_img = tensorize(avg_img, device=device)  # reused buffer for all uniform probes
    # Inference only: skip autograd bookkeeping for the six forward passes per sample.
    with torch.no_grad():
        for _ in tqdm(range(sample)):
            img_gen, lbl, color, *_ = dataset.generate_one()
            color = color[0]
            foreground = img_gen[img_gen > 2]  # pixels > 2 are treated as non-background
            if foreground.size == 0:
                continue  # nothing to average; previously detected via a NaN mean

            # Inverts avg = color*(1-pct) + 128*pct, with 128*pct taken as 36.9
            # (empirical constant; presumably measured with error_by_color).
            prediction = (foreground.mean() - 36.9)/(1-36.9/128)
            if np.isnan(prediction):
                continue
            target = int(lbl.argmax())

            tensor_avg_img[...] = color  # color setting
            color_set_classif = classify(tensor_avg_img)

            tensor_avg_img[...] = img_gen.mean()  # naive averaging
            naive_classif = classify(tensor_avg_img)

            tensor_avg_img[...] = prediction  # calibrated averaging
            calibrated_classif = classify(tensor_avg_img)

            tensor_img_gen = tensorize(img_gen, device=device)
            base_classif = classify(tensor_img_gen)  # regular classification

            tensor_img_gen[tensor_img_gen == 0] = (color + 30) % 255  # set background to a different class
            background_set_classif = classify(tensor_img_gen)

            # edge set test (set to color since thats the best results)
            tensor_avg_img[...] = 0
            tensor_avg_img[0,0, 0:edge_width] = color      # top rows
            tensor_avg_img[0,0, -edge_width:] = color      # bottom rows
            tensor_avg_img[0,0,:, 0:edge_width] = color    # left columns
            tensor_avg_img[0,0,:, -edge_width:] = color    # right columns
            edge_set_classif = classify(tensor_avg_img)

            total_answered += 1
            right_base += target == base_classif
            right_background_set += target == background_set_classif
            right_edge_set += target == edge_set_classif
            right_calibrated += target == calibrated_classif
            right_naive += target == naive_classif
            right_color_set += target == color_set_classif

    def accuracy(right):
        # Avoid ZeroDivisionError when every sample was skipped.
        return right/total_answered if total_answered else float("nan")

    accuracies = {
        "calibrated": accuracy(right_calibrated),
        "naive": accuracy(right_naive),
        "color_set": accuracy(right_color_set),
        "edge_set": accuracy(right_edge_set),
        "background_set": accuracy(right_background_set),
        "base": accuracy(right_base),
    }
    print(f"Calibrated got {accuracies['calibrated']:.2%} correct")
    print(f"Naive got {accuracies['naive']:.2%} correct")
    print(f"Color setting got {accuracies['color_set']:.2%} correct")
    print(f"Edge setting got {accuracies['edge_set']:.2%} correct")
    print(f"Background setting got {accuracies['background_set']:.2%} correct")
    print(f"Base got {accuracies['base']:.2%} correct")
    return accuracies
    
# Long-running: 100k generated validation images, several forward passes each.
# NOTE(review): `result` is rebound by the error_by_color cell below, and the scatter
# cells index into that later value, not this one.
result = averaging_test(valid_set, 100_000)
# Ideas for follow-up experiments:
# PCA map to see edge behaviour (average a bunch of them?)
# color set edge test
# background only set test? (do it maliciously) (see how badly it hurts performance)

In [None]:
def error_by_color(dataset, sample=100_000):
    """Score three averaging heuristics for recovering the target colour, without any network.

    Each generated image yields three colour estimates, which are passed through
    `color_classifier` and compared with the true label:

    * calibrated   - foreground mean corrected for the non-target contribution
    * naive        - plain foreground mean
    * really naive - whole-image mean divided by the expected target area fraction

    Args:
        dataset: generator exposing `radius`, `size` and `generate_one()`.
        sample: number of images to draw; images without foreground are skipped.

    Returns:
        (points, other_points) as arrays with one row per scored image:
        points holds (color, calibrated prediction); other_points holds
        (color, sum of non-target foreground pixels / foreground pixel count).
    """
    # Mean of pi*r^2 for r uniform on [radius[0], radius[1]].
    avg_area = np.pi/3*(dataset.radius[1]**2+dataset.radius[0]**2+dataset.radius[0]*dataset.radius[1])
    pct_area = avg_area / (dataset.size**2)
    print(f"Targets are on average {pct_area:.1%} of the image")

    points = []
    other_points = []
    total_answered = 0
    right_calibrated = 0
    right_naive = 0
    right_really_naive = 0
    for _ in tqdm(range(sample)):
        img_gen, lbl, color, *_ = dataset.generate_one()
        foreground = img_gen > 2  # pixels > 2 are treated as non-background
        n_foreground = np.count_nonzero(foreground)
        if n_foreground == 0:
            continue  # previously detected via NaN after a 0/0 division

        foreground_mean = img_gen[foreground].mean()  # computed once, used by two estimators
        # model: avg = color*(1-pct) + 128*pct
        # calculate pct by figuring out the average sum of non-target non-background pixels
        # divided by the size of the non-background area => gives you 128*pct
        other_space = img_gen[foreground & (img_gen != color)].sum() / n_foreground

        # 36.9 is the empirical value used for 128*pct in the model above.
        prediction = (foreground_mean - 36.9)/(1-36.9/128)
        if np.isnan(prediction) or np.isnan(other_space):
            continue

        target = lbl.argmax()
        total_answered += 1
        right_calibrated += target == color_classifier(prediction)
        right_naive += target == color_classifier(foreground_mean)
        right_really_naive += target == color_classifier(img_gen.mean()/pct_area)
        points.append((color, prediction))
        other_points.append((color, other_space))

    def accuracy(right):
        # Avoid ZeroDivisionError when every sample was skipped.
        return right/total_answered if total_answered else float("nan")

    print(f"Calibrated got {accuracy(right_calibrated):.2%} correct")
    print(f"Naive got {accuracy(right_naive):.2%} correct")
    print(f"Really naive got {accuracy(right_really_naive):.2%} correct")

    return np.asarray(points), np.asarray(other_points)
# Long-running (100k generated images). `result` becomes the (points, other_points)
# pair that the two scatter cells below index as result[0] and result[1].
result = error_by_color(valid_set, sample=100_000)

In [None]:
# Non-target foreground contribution (column 1) against target colour (column 0).
other_points = result[1]
plt.scatter(other_points[:, 0], other_points[:, 1], s=0.05)
plt.plot(np.arange(255), c="r")  # y = x reference line
# Cell output: the average contribution (presumably the source of the 36.9 constant).
other_points[:, 1].mean()

In [None]:
# Calibrated colour estimate (column 1) against the true target colour (column 0).
calibrated_points = result[0]
plt.scatter(calibrated_points[:, 0], calibrated_points[:, 1], s=0.05)
plt.plot(np.arange(255), c="r")  # y = x reference line: a perfect estimator would sit on it

In [None]:
# Fix numpy's global RNG before drawing the probe image.
# NOTE(review): this only makes the image reproducible if generate_one draws from numpy's global RNG.
np.random.seed(5_123_456)
test_img, lbl, color, size, *_  = valid_set.generate_one()
print(color)
plt.imshow(test_img, cmap="gray")
tensor_test_img = tensorize(test_img, device=device)

# Wrap the tiny network and run the probe image through the wrapper.
# NOTE(review): `interp_net` is rebound in the "Small Network Weight Analysis" section below.
interp_net = AllActivations(noise_net)
interp_net.eval()
interp_net(tensor_test_img)

In [None]:
# Approximate output channel `c` of conv2 + batch_norm2 in block `block` as an affine
# function of the block's first-activation colour profiles.
# NOTE(review): `tiny_net` is not defined in any cell above, and `profile_plots` is only
# assigned in a later cell, so this cell fails on a fresh top-to-bottom run.
c = 0
block = 1

#uniform_inpt = torch.full((1,16,32,32), 100.0).to(device)
#plt.imshow(tiny_net.conv_blocks[0].conv2.weight[c, in_c].detach().cpu().numpy(), cmap="bwr")
# all input-channel kernels feeding output channel c
conv_maps = tiny_net.conv_blocks[block].conv2.weight[c, :]
#imshow_centered_colorbar(conv_maps[7].detach().cpu().numpy(), cmap="bwr")
# largest kernel weight per input channel (max over the last two dimensions)
conv_scale = conv_maps.max(axis=-1).values.max(axis=-1).values
conv_shift = tiny_net.conv_blocks[block].conv2.bias[c]
bn_scale = tiny_net.conv_blocks[block].batch_norm2.weight[c]
bn_shift = tiny_net.conv_blocks[block].batch_norm2.bias[c]
bn_var = tiny_net.conv_blocks[block].batch_norm2.running_var[c]
bn_mean = tiny_net.conv_blocks[block].batch_norm2.running_mean[c]
print(conv_shift, bn_scale, bn_shift, bn_var, bn_mean)
#(c*conv_scale + conv_shift - bn_mean) / torch.sqrt(bn_var) * bn_scale + bn_shift
# Fold the batch-norm statistics into one slope per input channel and a single bias.
# No epsilon is added under the square root here.
slope = (conv_scale/torch.sqrt(bn_var)*bn_scale).detach().cpu().numpy()
bias = ((conv_shift - bn_mean)/torch.sqrt(bn_var)*bn_scale + bn_shift).detach().cpu().numpy()

# First element of each profile entry for the block's first activation;
# range(6) hard-codes six input channels.
lines = np.asarray([profile_plots[f"conv_blocks.{block}.act_func1_{x}"][0] for x in range(6)])

# weighted sum of the six profiles plus the folded bias
uniform_scaling = slope.dot(lines) + bias


In [None]:
# Clamp the affine estimate at zero (ReLU-style rectification) before plotting.
plt.plot(np.maximum(uniform_scaling, 0))

In [None]:
# Switches matplotlib to the interactive notebook backend; this stays in effect for all later cells.
%matplotlib notebook
# `visualizations` is available as a module name because %aimport in the setup cell imports it.
feature_gram, projected_weights = visualizations.fc_conv_feature_angles(noise_net, 
                            "fully_connected.0.act_func", num_embed=3, normalize=True)

# Small Network Weight Analysis

In [None]:
noise_net.eval()
# Per-activation colour profiles for the tiny network; `profile_plots` feeds the
# show_conv_weights / show_fc cells below.
# NOTE(review): `valid_loader` is not defined in any cell above (only `valid_set` is).
profile_plots,_ = activation_color_profile(AllActivations(noise_net), valid_loader, valid_set, device=device)

In [None]:
# Draw one validation image, build a "denoised" copy that keeps only the pixels
# equal to the target colour, and run both through separate activation wrappers.
np.random.seed(5_13_46)
test_img, lbl, color, size, pos, noise, orig_img = valid_set.generate_one()
print(color)

denoised_img = np.where(test_img == color, color, 0)

# Left: original image. Right: target pixels only.
plt.figure(figsize=(12, 16))
for panel, image in enumerate((test_img, denoised_img), start=1):
    plt.subplot(1, 2, panel)
    plt.imshow(image, cmap="gray")

tensor_test_img = tensorize(test_img, device=device)
denoised_tensor_img = tensorize(denoised_img, device=device)

# interp_net is run on the noisy image ...
interp_net = AllActivations(noise_net)
interp_net.eval()
print(interp_net(tensor_test_img))

# ... and de_interp_net on the denoised one, so the two can be compared layer by layer.
de_interp_net = AllActivations(noise_net)
de_interp_net.eval()
print(de_interp_net(denoised_tensor_img))

In [None]:
# Weights behind block 0's first activation of the tiny network, annotated with
# the colour profiles in `profile_plots` (rendering is defined by the project helper).
show_conv_weights(interp_net, "conv_blocks.0.act_func1", color_profile=profile_plots)

In [None]:
# Same layer through interp_net, the wrapper that was run on the noisy test image.
show_conv_layer(interp_net, "conv_blocks.0.act_func1")

In [None]:
# Weights behind block 0's second activation, with the colour profiles.
show_conv_weights(interp_net, "conv_blocks.0.act_func2", color_profile=profile_plots)

In [None]:
# Block 0, second activation, through the noisy-image wrapper.
show_conv_layer(interp_net, "conv_blocks.0.act_func2")

In [None]:
# Counterpart through de_interp_net, the wrapper that was run on the denoised (target-only) image.
show_conv_layer(de_interp_net, "conv_blocks.0.act_func2")


In [None]:
# Weights behind block 1's first activation, with the colour profiles.
#print(interp_net.model.conv_blocks[1].conv1.bias)
show_conv_weights(interp_net, "conv_blocks.1.act_func1", color_profile=profile_plots)

In [None]:
# Block 1, first activation, through the noisy-image wrapper.
show_conv_layer(interp_net, "conv_blocks.1.act_func1")

In [None]:
# Block 1, first activation, through the denoised-image wrapper.
show_conv_layer(de_interp_net, "conv_blocks.1.act_func1")

In [None]:
# Weights behind block 1's second activation, with the colour profiles.
show_conv_weights(interp_net, "conv_blocks.1.act_func2", color_profile=profile_plots)

In [None]:
# Block 1, second activation, through the noisy-image wrapper.
show_conv_layer(interp_net, "conv_blocks.1.act_func2")

In [None]:
# Block 1, second activation, through the denoised-image wrapper.
show_conv_layer(de_interp_net, "conv_blocks.1.act_func2")

In [None]:
# show_fc_conv(interp_net, color_profile=profile_plots, fixed_height=True, full_gridspec=True)
# no longer have this since its GAP
# (noise_net is built with global_avg_pooling=True, so the FC layer is inspected directly instead)
show_fc(interp_net, "fully_connected.0.act_func", color_profile=profile_plots)

In [None]:
# NOTE(review): identical to a call made a few cells earlier; candidate for removal.
show_conv_layer(interp_net, "conv_blocks.1.act_func1")

In [None]:
# NOTE(review): identical to a call made a few cells earlier; candidate for removal.
show_conv_layer(interp_net, "conv_blocks.1.act_func2")

In [None]:
# NOTE(review): identical to a call made a few cells earlier; candidate for removal.
show_conv_layer(de_interp_net, "conv_blocks.1.act_func2")

In [None]:
# Fetch the weight registered under "fully_connected.0.fully_connected" from the wrapped network.
# NOTE(review): `fc_mapper` is not read by any later visible cell.
fc_mapper = get_weight(interp_net, "fully_connected.0.fully_connected")

In [None]:
# NOTE(review): `permuted_large_net` is not defined in any cell above; this looks like
# a leftover from an earlier version of the notebook.
permuted_large_net.final_img_shape

In [None]:
# NOTE(review): `permuted_plots` is not defined in any cell above; the equivalent
# earlier cell passes `profile_plots` instead.
show_conv_weights(interp_net, "conv_blocks.0.act_func2", color_profile=permuted_plots)

In [None]:
# NOTE(review): identical to an earlier call in the "Small Network Weight Analysis" section.
show_conv_layer(interp_net, "conv_blocks.0.act_func2")

In [None]:
# NOTE(review): depends on the undefined `permuted_plots`; see the earlier cell that uses `profile_plots`.
show_conv_weights(interp_net, "conv_blocks.1.act_func1", color_profile=permuted_plots)

In [None]:
# NOTE(review): identical to an earlier call in the "Small Network Weight Analysis" section.
show_conv_layer(interp_net, "conv_blocks.1.act_func1")

In [None]:
# NOTE(review): depends on the undefined `permuted_plots`; see the earlier cell that uses `profile_plots`.
show_conv_weights(interp_net, "conv_blocks.1.act_func2", color_profile=permuted_plots)

In [None]:
# NOTE(review): identical to an earlier call in the "Small Network Weight Analysis" section.
show_conv_layer(interp_net, "conv_blocks.1.act_func2")
# Follow-up experiment ideas:
# uniform image of average and pass into network
# pasting images onto each other
# send in images that have only target pixels

In [None]:
# NOTE(review): an earlier cell comments out show_fc_conv as no longer applicable with
# global average pooling, and `permuted_plots` is undefined — likely a stale cell.
show_fc_conv(interp_net, color_profile=permuted_plots, fixed_height=True, full_gridspec=True)

# PCA Direction Analysis

In [None]:
# Patch scales handed to the PCA helpers below.
default_scales = [3,5,7,9,13,15]
# Manual cache switch: change `0` to `1` to restore the directions saved by %store
# in a previous session instead of recomputing them.
if 0: 
    %store -r noise_back_pca_directions_1_stride noise_back_pca_directions_s_stride
else:
    # Last argument differs between the two calls: a fixed 1 versus one value per scale.
    noise_back_pca_directions_1_stride = find_pca_directions(valid_set, 4096, default_scales, 1)
    noise_back_pca_directions_s_stride = find_pca_directions(valid_set, 4096, default_scales, default_scales)
    # persist both results in IPython's store for later sessions
    %store noise_back_pca_directions_1_stride noise_back_pca_directions_s_stride

In [None]:
# Shows the directions computed with the per-scale last argument; "Strides=scales" is the label passed to the helper.
visualize_pca_directions(noise_back_pca_directions_s_stride, "Strides=scales", default_scales, lines=True)

In [None]:
# 20 fixed seeds for the PCA-map examples (same values, conventional digit grouping).
seeds = [
    12_123, 140_124, 1_508_559, 15_019_258, 12_429_852,
    9032, 5832, 12, 5014, 92,
    42, 52, 52_934, 935_152, 1_000_000,
    1_000_001, 27, 24, 512, 999_105,
]

In [None]:
# PCA maps, gradient maps and explanation images for every seed.
# NOTE(review): `permuted_net` is not defined in any cell above (the tiny model is `noise_net`).
# NOTE(review): the result is named `..._s_strides` but is computed from the 1-stride
# directions with strides=3 — confirm the intended pairing.
pca_map_s_strides, _, grad_maps, explain_imgs = generate_many_pca(permuted_net, seeds, 
                noise_back_pca_directions_1_stride, default_scales, valid_set, component=0, 
                batch_size=512, strides=3, skip_1_stride=True, device=device)

In [None]:
# Three image lists with one title each: input image, PCA map (strides=3), gradient map.
plt_grid_figure([explain_imgs, pca_map_s_strides, grad_maps], transpose=True, titles=["Image", "Strides=3", "Gradient"])