In [1]:

import numpy as np
import os
import zipfile
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader,Dataset
from torchvision.transforms import ToTensor
from torch.nn import functional as F


In [2]:
# Unpack the RAVEN release archive once. An existing 'RAVEN-10000' directory
# is treated as "already extracted", so delete that directory to force a redo.
path = "RAVEN-10000-release.zip"
extract_dir = 'RAVEN-10000'
needs_extraction = not os.path.isdir(extract_dir)
if needs_extraction:
    with zipfile.ZipFile(path, mode='r') as archive:
        # Everything in the archive lands under extract_dir.
        archive.extractall(extract_dir)


In [3]:
# Build the training dataset/loader with the `dataset` class from Zhang's
# utils.py and pull one batch (`images`, `targets`) for the exploration cells.
from utils import dataset

# NOTE(review): the unzip cell extracts into 'RAVEN-10000' while this cell
# reads 'RAVEN-10000_dset/' -- confirm that directory is produced elsewhere.
path = 'RAVEN-10000_dset/'  # glob only worked here with relative file path
dsetype = 'train'
img_size = 160

batch_size = 32
# Kept for reference only: it is deliberately NOT passed to DataLoader, so
# loading stays in the main process exactly as before.
num_workers = 2

# The config variables above are now actually used (previously 'train' and 32
# were hard-coded a second time, so editing the config had no effect).
train_set = dataset(dataset_path=path, dataset_type=dsetype,
                    img_size=img_size, test=False)
# Wrap an iterable around the training dataset.
train_loader = DataLoader(train_set, batch_size=batch_size,
                          shuffle=True)
# One shuffled batch is enough for the exploration below.
train_loader_iter = iter(train_loader)
images, targets = next(train_loader_iter)

# Flip to True to draw the 16 panels of the first instance in a 4x4 grid.
show_first_instance = False
if show_first_instance:
    train_instance = images[0]
    fig, axes = plt.subplots(4, 4, figsize=(10, 10))
    for i, ax in enumerate(axes.flat):
        ax.set_xticks([])
        ax.set_yticks([])
        ax.grid(False)
        ax.imshow(train_instance[i], cmap=plt.cm.binary)
    plt.show()
    

In [29]:
# Exploring the model from its input: instantiate CoPINet and shape one batch
# the way the step-by-step forward pass below expects it.
import networks
# Presumably named `self` so that statements copied from the network's forward
# method run unchanged in this cell.
self = networks.CoPINet()

# Everything below hard-codes 80x80 panels. `Tensor.view` only reinterprets
# memory: viewing 160x160 panels as (-1, 16, 80, 80) does not shrink them, it
# produces four times as many "puzzles" made of scrambled pixel strips (which
# is why the output had 128 rows for 32 targets). Resample the panels instead.
# NOTE(review): bilinear is an assumption -- match whatever resizing the
# training pipeline uses.
panel_hw = tuple(images.shape[-2:])
x = images.reshape(-1, 16, *panel_hw)
if panel_hw != (80, 80):
    x = F.interpolate(x.float(), size=(80, 80),
                      mode='bilinear', align_corners=False)

N,_,H,W = x.shape
# The perception-stem pass that used to follow here was removed: its result
# was overwritten by the inference branch before ever being read, so it only
# cost compute (and an extra update of bn1's running statistics).

# Inference branch: encode the eight context panels, aggregate them by row and
# by column, predict rule logits, and derive the two contrast biases that the
# contrast stages further down concatenate onto their inputs.

prior = x[:,:8,:,:] #first eight imgs (no choices)
# Each panel goes through the inference stem as its own 1-channel 80x80 image.
input_features = self.maxpool(
    self.relu(
        self.inf_bn1(
            self.inf_conv1(prior.contiguous().view(-1,80,80).unsqueeze(1)))))
        
# Regroup the per-panel feature maps per puzzle: (-1, 8 panels, 64, 20, 20).
input_features = input_features.view(-1,8,64,20,20)

# Sum the panels of the two complete rows (0-2 and 3-5) over the panel axis.
row1_features = torch.sum(input_features[:, 0:3, :, :, :], dim=1)
row2_features = torch.sum(input_features[:, 3:6, :, :, :], dim=1)
# Both rows are stacked along the batch axis so a single conv/BN call handles
# them; the result is then split at index N and the two parts are added.
row_features = self.relu(
    self.inf_bn_row(
        self.inf_conv_row(
            torch.cat((row1_features, row2_features), dim=0))))
final_row_features = row_features[:N, :, :, :] + row_features[
    N:, :, :, :]

# Same treatment for columns: 0:9:3 selects panels 0, 3, 6 and 1:9:3 selects
# panels 1, 4, 7 (only eight panels exist, so the slice end of 9 is clipped).
col1_features = torch.sum(input_features[:, 0:9:3, :, :, :], dim=1)
col2_features = torch.sum(input_features[:, 1:9:3, :, :, :], dim=1)
col_features = self.relu(
    self.inf_bn_col(
        self.inf_conv_col(
            torch.cat((col1_features, col2_features), dim=0))))
final_col_features = col_features[:N, :, :, :] + col_features[
    N:, :, :, :]

# Merge row and column evidence, then pool each map down to a 64-vector.
input_features = final_row_features + final_col_features
input_features = self.avgpool(input_features).view(-1, 64)

predict_rules = self.predict_rule(
    input_features)  # N, self.num_attr * self.num_rule
# One row of num_rule logits per (puzzle, attribute) pair.
predict_rules = predict_rules.view(-1, self.num_rule)
# NOTE(review): presumably the Gumbel-softmax step mentioned in the notes
# cell -- confirm against networks.py.
predict_rules = self.inference(predict_rules)

basis_bias = self.basis_bias(predict_rules)  # N * self.num_attr, 64
# Collapse the attribute axis so each puzzle gets a single 64-d bias vector.
basis_bias = torch.sum(basis_bias.view(-1, self.num_attr, 64),
                       dim=1)  # N, 64

# Transform the bias once per contrast stage and broadcast it over that
# stage's spatial grid (20x20 for the first, 10x10 for the second).
contrast1_bias = self.contrast1_bias_trans(basis_bias)
contrast1_bias = contrast1_bias.view(-1, 64, 1,
                                     1).expand(-1, -1, 20, 20)
contrast2_bias = self.contrast2_bias_trans(basis_bias)
contrast2_bias = contrast2_bias.view(-1, 64, 1,
                                     1).expand(-1, -1, 10, 10)

# Perception Branch
# All 16 panels (8 context + 8 candidate answers) go through the perception
# stem as 1-channel 80x80 images, then are regrouped per puzzle.
input_features = self.maxpool(
    self.relu(self.bn1(self.conv1(x.view(-1, 80, 80).unsqueeze(1)))))
input_features = input_features.view(-1, 16, 64, 20, 20)

# Panels 8..15 are the candidate answers; the extra axis lets each candidate
# be concatenated with the two known panels of the third row/column below.
choices_features = input_features[:, 8:, :, :, :].unsqueeze(
    2)  # N, 8, 64, 20, 20 -> N, 8, 1, 64, 20, 20

row1_features = torch.sum(input_features[:, 0:3, :, :, :],
                          dim=1)  # N, 64, 20, 20
row2_features = torch.sum(input_features[:, 3:6, :, :, :],
                          dim=1)  # N, 64, 20, 20
# The third row only has panels 6 and 7; they are repeated once per candidate
# so every candidate can complete the row.
row3_pre = input_features[:, 6:8, :, :, :].unsqueeze(1).expand(
    N, 8, 2, 64, 20, 20
)  # N, 2, 64, 20, 20 -> N, 1, 2, 64, 20, 20 -> N, 8, 2, 64, 20, 20
row3_features = torch.sum(
    torch.cat((row3_pre, choices_features), dim=2), dim=2).view(
        -1, 64, 20, 20
    )  # N, 8, 3, 64, 20, 20 -> N, 8, 64, 20, 20 -> N * 8, 64, 20, 20
# One conv/BN call over all rows stacked on the batch axis: the first N
# entries are row 1, the next N are row 2, the remaining N * 8 are the
# candidate-completed third rows.
row_features = self.relu(
    self.bn_row(
        self.conv_row(
            torch.cat((row1_features, row2_features, row3_features),
                      dim=0))))

# Split the stacked result back up; rows 1 and 2 are broadcast across the
# eight candidates so that all three rows can be summed per candidate.
row1 = row_features[:N, :, :, :].unsqueeze(1).unsqueeze(1).expand(
    N, 8, 1, 64, 20, 20)
row2 = row_features[N:2 * N, :, :, :].unsqueeze(1).unsqueeze(1).expand(
    N, 8, 1, 64, 20, 20)
row3 = row_features[2 * N:, :, :, :].view(-1, 8, 64, 20,
                                          20).unsqueeze(2)
final_row_features = torch.sum(torch.cat((row1, row2, row3), dim=2),
                               dim=2)

# Columns mirror the row logic: 0:9:3 -> panels 0, 3, 6; 1:9:3 -> panels
# 1, 4, 7; 2:8:3 -> panels 2, 5 (the incomplete third column).
col1_features = torch.sum(input_features[:, 0:9:3, :, :, :],
                          dim=1)  # N, 64, 20, 20
col2_features = torch.sum(input_features[:, 1:9:3, :, :, :],
                          dim=1)  # N, 64, 20, 20
col3_pre = input_features[:, 2:8:3, :, :, :].unsqueeze(1).expand(
    N, 8, 2, 64, 20, 20
)  # N, 2, 64, 20, 20 -> N, 1, 2, 64, 20, 20 -> N, 8, 2, 64, 20, 20
col3_features = torch.sum(
    torch.cat((col3_pre, choices_features), dim=2), dim=2).view(
        -1, 64, 20, 20
    )  # N, 8, 3, 64, 20, 20 -> N, 8, 64, 20, 20 -> N * 8, 64, 20, 20
col_features = self.relu(
    self.bn_col(
        self.conv_col(
            torch.cat((col1_features, col2_features, col3_features),
                      dim=0))))

col1 = col_features[:N, :, :, :].unsqueeze(1).unsqueeze(1).expand(
    N, 8, 1, 64, 20, 20)
col2 = col_features[N:2 * N, :, :, :].unsqueeze(1).unsqueeze(1).expand(
    N, 8, 1, 64, 20, 20)
col3 = col_features[2 * N:, :, :, :].view(-1, 8, 64, 20,
                                          20).unsqueeze(2)
final_col_features = torch.sum(torch.cat((col1, col2, col3), dim=2),
                               dim=2)

# Combine row and column features per candidate, then flatten the candidate
# axis into the batch axis for the contrast/residual stages.
input_features = final_row_features + final_col_features
input_features = input_features.view(-1, 64, 20, 20)

# Contrast stage 1: sum the eight candidates' features, append the first
# contrast bias on the channel axis, and subtract the transformed summary
# from every candidate before the first residual block.
res1_in = input_features.view(-1, 8, 64, 20, 20)
res1_summary = torch.cat((res1_in.sum(dim=1), contrast1_bias), dim=1)
res1_contrast = self.res1_contrast_bn(self.res1_contrast(res1_summary))
res1_in = res1_in - res1_contrast.unsqueeze(1)

# Contrast stage 2: identical pattern on the (128, 10, 10) maps coming out of
# the first residual block, using the second contrast bias.
res2_in = self.res1(res1_in.view(-1, 64, 20, 20)).view(-1, 8, 128, 10, 10)
res2_summary = torch.cat((res2_in.sum(dim=1), contrast2_bias), dim=1)
res2_contrast = self.res2_contrast_bn(self.res2_contrast(res2_summary))
res2_in = res2_in - res2_contrast.unsqueeze(1)
out = self.res2(res2_in.view(-1, 128, 10, 10))

# Head: pool each candidate to a 256-vector, run the MLP, and regroup the
# results as eight values per puzzle.
avgpool = self.avgpool(out).view(-1, 256)
final = self.mlp(avgpool).view(-1, 8)

model_output = final

In [27]:
# Evaluate the project's contrast loss on the model output for this batch.
# Per the notes cell: G holds the potentials and zeros is the answer tensor
# built from `targets` -- confirm against criteria.py.
import criteria
contrast_terms = criteria.contrast_loss(model_output, targets)
loss, G, zeros = contrast_terms

In [38]:
# Free-form working notes on networks.py and the loss, kept as a string and
# echoed to the cell output. Line numbers inside refer to network.py, not to
# this notebook.
notebook_notes = """
Notes on network.py structure:

- Gumbel-softmax on line 127. So all that stuff before is the encoding? And the stuff
after is preparing contrast_bias 1 and 2, for input to perception branch

- Creating 'input features' from x: This is the job of the conv layer (and relu and batchnorm).
Remember, conv layers take an image and produce a feature map (through application of conv operator 
then some type of pooling), where the feature map is a score against learned kernel.

- Lines 103 onwards collect features across row and column.

- It seems like lines 140 through to 201 is just the feature encoding for 
the perception branch

- It looks like 200 - 214 is the contrast block. Takes input features (avgd across rows and columns). 
Then through convnet, batchnorm, resblock. 

- Note that there are 2 contrast blocks working in series. Don't know why this is the case.

Notes on loss: 

- So CoPINet returns an (m,8) tensor. 
    Hypothesis 1 - the eight vectors here are positions of all OUa's in some latent space
    Hypothesis 2 - Each of the eight 128 vectors are negative potentials for the 80x80 quarter-images (32 x 4 = 128)
    
- Use BCE with logits because we're inputting negative potentials (logits) into 
cost function, NOT probabilities.

- G contains potentials (only first 32 rows are relevant). zeros are actual answers (this tensor is created
from 'targets')


Questions for Dr.:

- How are the conv layers designed? The kernel sizes, stride lengths seem pretty arbitrary to me and no mention in paper
- First step - reshape (160,160) imgs to (80,80).

"""
print(notebook_notes)


Notes on network.py structure:

- Gumbel-softmax on line 127. So all that stuff before is the encoding? And the stuff
after is preparing contrast_bias 1 and 2, for input to perception branch

- Creating 'input features' from x: This is the job of the conv layer (and relu and batchnorm).
Remember, conv layers take an image and produce a feature map (through application of conv operator 
then some type of pooling), where the feature map is a score against learned kernel.

- Lines 103 onwards collect features across row and column.

- It seems like lines 140 through to 201 is just the feature encoding for 
the perception branch

- It looks like 200 - 214 is the contrast block. Takes input features (avgd across rows and columns). 
Then through convnet, batchnorm, resblock. 

- Note that there are 2 contrast blocks working in series. Don't know why this is the case.

Notes on loss: 

- So CoPINet returns an (m,8) tensor. 
    Hypothesis 1 - the eight vectors here are positions of all OUa's