# Environment Setup

In [1]:
!pip install torch

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/69/43/380514bd9663f1bf708abeb359b8b48d3fabb1c8e95bb3427a980a064c57/torch-0.4.0-cp36-cp36m-manylinux1_x86_64.whl (484.0MB)
[K    100% |████████████████████████████████| 484.0MB 30kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x5cb26000 @  0x7fa3d868c1c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8
[?25hInstalling collected packages: torch
Successfully installed torch-0.4.0


In [0]:
# import necessary packages

import sys
import numpy as np
import collections
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG: the data generators, find_y, and the models' weight/bias
# initialisation in this notebook all draw from np.random.
np.random.seed(100)

In [0]:
# Number of training iterations run by the experiment loop (`for i in range(epochs_)`);
# each iteration does one forward/backward pass over the whole training tensor.
epochs_ = 5000

# Utility functions

In [0]:
def generate_data(k=2, d=1, k_useful=1, n_rows=1500, print_param=False):
  """
  Generate data suitable for attention models, with no dependency between
  the sub parts of a part.

  Every row consists of `k` parts of `d` values each. For each row, `k_useful`
  parts are picked at random (without replacement) and drawn from
  N(loc_useful, scale_useful); the remaining parts are drawn from
  N(loc_non_useful, scale_non_useful). Values are rounded to 2 decimals.

  :parameters
  k - number of parts in each data point
  d - number of sub parts in each 'k' part
  k_useful - number of parts with useful information [k_useful <= k]
  n_rows - number of rows in the data set
  print_param - boolean flag; when True the generation parameters are printed

  :returns
  (data, is_useful_all, loc_useful, scale_useful, loc_non_useful, scale_non_useful)
  where data is an array of shape (n_rows, k, d), is_useful_all is a list of
  n_rows 0/1 lists of length k, and the two scales are 1-element arrays.
  """
  loc_useful = 0
  scale_useful = 1
  loc_non_useful = 2
  scale_non_useful = 1

  data = []
  is_useful_all = []

  for _ in range(n_rows):

    # each element of 'is_useful' shows whether the corresponding part is useful or not
    is_useful = np.array([0]*k)
    useful_idx = np.random.choice(range(k), size=k_useful, replace=False)
    for i in useful_idx:
      is_useful[i] = 1
    is_useful_all.append(list(is_useful))

    # NOTE: parts are sampled one at a time, in order, so that the sequence of
    # np.random draws (and therefore seeded results) stays the same.
    data_point = []
    for i in range(k):
      if is_useful[i] == 1:
        data_part = np.random.normal(loc=loc_useful, scale=scale_useful, size=d)
      else:
        data_part = np.random.normal(loc=loc_non_useful, scale=scale_non_useful, size=d)
      data_point.append(list(data_part))

    data.append(data_point)

  if print_param:
    print("number of parts: ", k)
    print("useful parts: ", is_useful_all[:5])
    # fixed: these two lines used to be labelled "loc_non_useful"/"scale_non_useful"
    print("loc_useful: ", loc_useful)
    print("scale_useful: ", scale_useful)
    print("loc_non_useful: ", loc_non_useful)
    print("scale_non_useful: ", scale_non_useful)
    print("-"*50)

  data = np.array(data)
  data = data.round(decimals=2)
  return data, is_useful_all, loc_useful, np.array([scale_useful]), loc_non_useful, np.array([scale_non_useful])
                                  
#data, is_useful_all = generate_data()

In [0]:
def generate_data_2(k=2, d=1, k_useful=1, n_rows=1500, print_param=False):
  """
  Generate data suitable for attention models, with dependency between the
  sub parts of a part.

  Every row consists of `k` parts of `d` values each. Each part is a standard
  normal vector of length `d` multiplied by a random (d, d) mixing matrix and
  shifted by a mean vector; useful and non-useful parts use different mixing
  matrices and means. Values are rounded to 2 decimals.

  :parameters
  k - number of parts in each data point
  d - number of sub parts in each 'k' part
  k_useful - number of parts with useful information [k_useful <= k]
  n_rows - number of rows in the data set
  print_param - boolean flag; when True the generation parameters are printed

  :returns
  (data, is_useful_all, mean_useful, std_dev_useful, mean_non_useful, std_dev_non_useful)
  where data is an array of shape (n_rows, k, d), is_useful_all is a list of
  n_rows 0/1 lists of length k, the means are lists of length d and the
  std_dev_* values are the (d, d) mixing matrices.
  """
  data = []

  # fixed: these used the notebook-level global `d_` instead of the `d` argument,
  # so the function only worked after the experiment cell had defined `d_`.
  mean_useful = [0]*d
  std_dev_useful = np.random.randn(d, d)
  mean_non_useful = [2]*d
  std_dev_non_useful = np.random.randn(d, d)

  is_useful_all = []

  for _ in range(n_rows):

    # each element of 'is_useful' shows whether the corresponding part is useful or not
    is_useful = np.array([0]*k)
    useful_idx = np.random.choice(range(k), size=k_useful, replace=False)
    for i in useful_idx:
      is_useful[i] = 1
    is_useful_all.append(list(is_useful))

    # Parts are sampled one at a time, in order, to keep the np.random draw sequence stable.
    data_point = []
    for i in range(k):
      if is_useful[i] == 1:
        data_part = np.matmul(np.random.randn(d), std_dev_useful) + mean_useful
      else:
        data_part = np.matmul(np.random.randn(d), std_dev_non_useful) + mean_non_useful
      data_point.append(list(data_part))

    data.append(data_point)

  if print_param:
    print("number of parts: ", k)
    print("useful parts: ", is_useful_all[:5])
    # fixed: these two lines used to be labelled "mean_non_useful"/"std_dev_non_useful"
    print("mean_useful: ", mean_useful)
    print("std_dev_useful: ", std_dev_useful)
    print("mean_non_useful: ", mean_non_useful)
    print("std_dev_non_useful: ", std_dev_non_useful)
    print("-"*50)

  data = np.array(data)
  data = data.round(decimals=2)
  return data, is_useful_all, mean_useful, std_dev_useful, mean_non_useful, std_dev_non_useful
                                     
                                  
#data, is_useful_all = generate_data_2()

In [0]:
def find_y(x, is_useful_all):

  """
  Derive a binary class label for every row from its useful parts only.

  For each row the parts flagged as useful are summed into one vector of
  length d (d is read from x.shape[2]). The summed vectors are projected
  onto a random weight vector drawn uniformly from [-1, 1) and the label is
  1 where the projection is strictly positive, else 0.

  :parameters
  x - data features, indexed as [row][part][sub part]
  is_useful_all - 0/1 flags per row and part (n x k)

  :returns
  integer array of shape (n_rows, 1) holding the 0/1 labels
  """
  n_sub = x.shape[2]

  summed_parts = []
  for parts, flags in zip(x, is_useful_all):
    acc = np.zeros(n_sub)
    for part_idx in range(len(flags)):
      if flags[part_idx] == 1:
        acc += parts[part_idx]
    summed_parts.append(acc)
  summed_parts = np.array(summed_parts)

  # (n_rows, d) @ (d, 1) -> (n_rows, 1)
  weights = np.random.uniform(low=-1, high=1, size=(n_sub, 1))
  projection = np.matmul(summed_parts, weights)
  return (projection > 0).astype(int)

#y = find_y(data, is_useful_all)
#print(y[:10])

In [0]:
def get_useful_data(data, is_useful_all):
  '''
  Collect, for every row, the first part whose flag equals 1.

  Warning: only meaningful when each row has exactly one useful part; any
  further useful parts in a row are ignored, and a row with no useful part
  contributes nothing to the result.
  '''
  picked = []

  for parts, flags in zip(data, is_useful_all):
    first_useful = next((pos for pos, flag in enumerate(flags) if flag == 1), None)
    if first_useful is not None:
      picked.append(parts[first_useful])

  return np.array(picked)

In [0]:
def display_plot(data, is_useful_all, y_pred_int, y_int, plot_type, epoch=None):

  '''
  Show predicted vs. real labels side by side as two scatter plots.

  :parameters
  data - data features
  is_useful_all - 0/1 flags per row and part, aligned with `data`
  y_pred_int - predicted labels, used to colour the left plot
  y_int - real labels, used to colour the right plot
  plot_type:
    0 (special_case - all [2d]): data reshaped to two columns, column 0 vs column 1
    1 (special_case - useful [1d]): useful value on the x axis, constant 1 on the y axis
    2 (not_special_case - useful [2d]): first two values of the useful part
  epoch - value shown in the plot titles; when omitted, the module-level loop
          counter `i` is used if it exists (this is what the function relied on
          before the parameter was added)

  :returns
  1
  '''

  if epoch is None:
    # Backward compatibility with callers that rely on the training loop's global `i`.
    epoch = globals().get("i", "n/a")

  if plot_type == 0:
    both_parts = data.reshape(-1,2)
    f1 = both_parts[:,0]
    # fixed: previously column 0 was used for both axes
    f2 = both_parts[:,1]
  elif plot_type == 1:
    data_useful = get_useful_data(data, is_useful_all)
    f1 = data_useful[:,0]
    f2 = np.ones(len(f1))
  else:
    data_useful = get_useful_data(data, is_useful_all)
    f1 = data_useful[:,0]
    f2 = data_useful[:,1]

  plt.figure(figsize=(10,5))

  plt.subplot(121)
  plt.scatter(f1, f2, c=y_pred_int, label=y_pred_int, s=3, cmap='RdYlGn', alpha=0.5)
  plt.xlabel("f1")
  plt.ylabel("f2")
  plt.title("[Predicted] Epoch " + str(epoch))

  plt.subplot(122)
  plt.scatter(f1, f2, c=y_int, label=y_int, s=3, cmap='RdYlGn', alpha=0.5)
  plt.xlabel("f1")
  plt.ylabel("f2")
  plt.title("[Real] Epoch " + str(epoch))

  plt.show()

  return 1

In [0]:
def log_loss(y, y_pred, eps=1e-15):
  """
  Summed cross-entropy term -sum(y * log(y_pred)), rounded to 2 decimals.

  :parameters
  y - target weights (e.g. 0/1 flags), same length as y_pred
  y_pred - predicted probabilities
  eps - lower bound applied to y_pred before taking the log, so that a
        probability of exactly 0 gives a large finite penalty instead of
        inf (or NaN when the matching target is 0)
  """
  y = np.asarray(y)
  # np.maximum keeps the dtype of a float array, so float32 inputs stay float32
  y_pred = np.maximum(np.asarray(y_pred), eps)
  return np.round(np.sum(-y*np.log(y_pred)), decimals=2)

# Model

In [0]:
# define the structure of NN
# both before and after: a hidden layer is applied before and after the attention step
# NOTE: softmax_res is only referenced by a commented-out line in forward() below.
softmax_res = [] 

class Model1(torch.nn.Module):
  """
  Attention network with a hidden layer both before and after the attention step.

  Pipeline: per-part Linear(d, n_nodes) -> ReLU -> Linear(n_nodes, 1) -> ReLU
  -> softmax across the parts (dim=1) -> input re-weighted by the attention ->
  flattened to k*d -> Linear(k*d, n_nodes) -> ReLU -> Linear(n_nodes, 1) -> ReLU.

  :parameters
  n_nodes - width of the two hidden layers
  k - number of parts per data point; defaults to the notebook global `k_`
  d - number of sub parts per part; defaults to the notebook global `d_`
  """

  def __init__(self, n_nodes, k=None, d=None):
    super(Model1, self).__init__()
    # The globals are only consulted when k/d are not passed, so the class can be
    # built before (or independently of) the experiment loop that defines them.
    self.k = k_ if k is None else k
    self.d = d_ if d is None else d

    ip_n = self.d
    h1_n = n_nodes
    hmid_n = 1
    h2_n = n_nodes
    op_n = 1
    flat_n = self.k*self.d

    self.h1_layer = torch.nn.Linear(ip_n, h1_n)
    self.hmid_layer = torch.nn.Linear(h1_n, hmid_n)
    self.h2_layer = torch.nn.Linear(flat_n, h2_n)
    self.op_layer = torch.nn.Linear(h2_n, op_n)
    self.relu = torch.nn.ReLU()

    # Uniform init in [-b, b) with b = 1/sqrt(out_features) of each layer.
    layers = [self.h1_layer, self.hmid_layer, self.h2_layer, self.op_layer]
    bounds = [1./np.sqrt(layer.out_features) for layer in layers]

    # All weights are drawn first, then all biases: same np.random draw order as
    # the original hand-written init, so seeded runs are unchanged.
    for layer, bound in zip(layers, bounds):
      layer.weight.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=(layer.out_features, layer.in_features)))
    for layer, bound in zip(layers, bounds):
      layer.bias.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=layer.out_features))


  def forward(self, x):
    """
    :parameters
    x - float tensor; the experiment passes shape (n_rows, k, d)

    :returns
    (attention_capture, y_pred): the softmax attention weights as a flat numpy
    array (detached from the graph) and the prediction tensor of shape (n_rows, 1)
    """
    op_h1_layer = self.relu(self.h1_layer(x))
    op_hmid_layer = self.relu(self.hmid_layer(op_h1_layer))
    # normalise the per-part scores across dim 1 (the parts)
    op_softmax = F.softmax(op_hmid_layer, dim=1)

    attention_capture = op_softmax.detach().cpu().numpy().flatten()

    attention = (x*op_softmax).reshape(-1, self.k*self.d)
    op_h2_layer = self.relu(self.h2_layer(attention))
    # NOTE(review): the final ReLU clamps predictions to >= 0 — confirm this is intended.
    y_pred = self.relu(self.op_layer(op_h2_layer))

    return attention_capture, y_pred
    
#model = Model1(n_nodes=2)

In [0]:
# define the structure of NN
# only before: a hidden layer is applied before the attention step, none after it
# NOTE: softmax_res is only referenced by a commented-out line in forward() below.
softmax_res = [] 

class Model2(torch.nn.Module):
  """
  Attention network with a hidden layer only before the attention step.

  Pipeline: per-part Linear(d, n_nodes) -> ReLU -> Linear(n_nodes, 1) -> ReLU
  -> softmax across the parts (dim=1) -> input re-weighted by the attention ->
  flattened to k*d -> Linear(k*d, 1) -> ReLU.

  :parameters
  n_nodes - width of the hidden layer
  k - number of parts per data point; defaults to the notebook global `k_`
  d - number of sub parts per part; defaults to the notebook global `d_`
  """

  def __init__(self, n_nodes, k=None, d=None):
    super(Model2, self).__init__()
    # The globals are only consulted when k/d are not passed, so the class can be
    # built before (or independently of) the experiment loop that defines them.
    self.k = k_ if k is None else k
    self.d = d_ if d is None else d

    ip_n = self.d
    h1_n = n_nodes
    hmid_n = 1
    op_n = 1
    flat_n = self.k*self.d

    self.h1_layer = torch.nn.Linear(ip_n, h1_n)
    self.hmid_layer = torch.nn.Linear(h1_n, hmid_n)
    self.op_layer = torch.nn.Linear(flat_n, op_n)
    self.relu = torch.nn.ReLU()

    # Uniform init in [-b, b) with b = 1/sqrt(out_features) of each layer.
    layers = [self.h1_layer, self.hmid_layer, self.op_layer]
    bounds = [1./np.sqrt(layer.out_features) for layer in layers]

    # All weights are drawn first, then all biases: same np.random draw order as
    # the original hand-written init, so seeded runs are unchanged.
    for layer, bound in zip(layers, bounds):
      layer.weight.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=(layer.out_features, layer.in_features)))
    for layer, bound in zip(layers, bounds):
      layer.bias.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=layer.out_features))


  def forward(self, x):
    """
    :parameters
    x - float tensor; the experiment passes shape (n_rows, k, d)

    :returns
    (attention_capture, y_pred): the softmax attention weights as a flat numpy
    array (detached from the graph) and the prediction tensor of shape (n_rows, 1)
    """
    op_h1_layer = self.relu(self.h1_layer(x))
    op_hmid_layer = self.relu(self.hmid_layer(op_h1_layer))
    # normalise the per-part scores across dim 1 (the parts)
    op_softmax = F.softmax(op_hmid_layer, dim=1)

    attention_capture = op_softmax.detach().cpu().numpy().flatten()

    attention = (x*op_softmax).reshape(-1, self.k*self.d)
    # NOTE(review): the final ReLU clamps predictions to >= 0 — confirm this is intended.
    y_pred = self.relu(self.op_layer(attention))

    return attention_capture, y_pred
    
#model = Model2(n_nodes=2)

In [0]:
# define the structure of NN
# only after: no hidden layer before the attention step, one hidden layer after it
# NOTE: softmax_res is only referenced by a commented-out line in forward() below.
softmax_res = [] 

class Model3(torch.nn.Module):
  """
  Attention network with a hidden layer only after the attention step.

  Pipeline: per-part Linear(d, 1) -> ReLU -> softmax across the parts (dim=1)
  -> input re-weighted by the attention -> flattened to k*d ->
  Linear(k*d, n_nodes) -> ReLU -> Linear(n_nodes, 1) -> ReLU.

  :parameters
  n_nodes - width of the hidden layer
  k - number of parts per data point; defaults to the notebook global `k_`
  d - number of sub parts per part; defaults to the notebook global `d_`
  """

  def __init__(self, n_nodes, k=None, d=None):
    super(Model3, self).__init__()
    # The globals are only consulted when k/d are not passed, so the class can be
    # built before (or independently of) the experiment loop that defines them.
    self.k = k_ if k is None else k
    self.d = d_ if d is None else d

    ip_n = self.d
    hmid_n = 1
    h2_n = n_nodes
    op_n = 1
    flat_n = self.k*self.d

    self.hmid_layer = torch.nn.Linear(ip_n, hmid_n)
    self.h2_layer = torch.nn.Linear(flat_n, h2_n)
    self.op_layer = torch.nn.Linear(h2_n, op_n)
    self.relu = torch.nn.ReLU()

    # Uniform init in [-b, b) with b = 1/sqrt(out_features) of each layer.
    layers = [self.hmid_layer, self.h2_layer, self.op_layer]
    bounds = [1./np.sqrt(layer.out_features) for layer in layers]

    # All weights are drawn first, then all biases: same np.random draw order as
    # the original hand-written init, so seeded runs are unchanged.
    for layer, bound in zip(layers, bounds):
      layer.weight.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=(layer.out_features, layer.in_features)))
    for layer, bound in zip(layers, bounds):
      layer.bias.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=layer.out_features))


  def forward(self, x):
    """
    :parameters
    x - float tensor; the experiment passes shape (n_rows, k, d)

    :returns
    (attention_capture, y_pred): the softmax attention weights as a flat numpy
    array (detached from the graph) and the prediction tensor of shape (n_rows, 1)
    """
    op_hmid_layer = self.relu(self.hmid_layer(x))
    # normalise the per-part scores across dim 1 (the parts)
    op_softmax = F.softmax(op_hmid_layer, dim=1)

    attention_capture = op_softmax.detach().cpu().numpy().flatten()

    attention = (x*op_softmax).reshape(-1, self.k*self.d)
    op_h2_layer = self.relu(self.h2_layer(attention))
    # NOTE(review): the final ReLU clamps predictions to >= 0 — confirm this is intended.
    y_pred = self.relu(self.op_layer(op_h2_layer))

    return attention_capture, y_pred
    
#model = Model3(n_nodes=2)

In [0]:
# define the structure of NN
# no before and after: no hidden layer on either side of the attention step
# NOTE: softmax_res is only referenced by a commented-out line in forward() below.
softmax_res = [] 

class Model4(torch.nn.Module):
  """
  Attention network with no hidden layer before or after the attention step.

  Pipeline: per-part Linear(d, 1) -> ReLU -> softmax across the parts (dim=1)
  -> input re-weighted by the attention -> flattened to k*d ->
  Linear(k*d, 1) -> ReLU.

  :parameters
  n_nodes - accepted for a uniform constructor signature across the four
            models; not used by this variant
  k - number of parts per data point; defaults to the notebook global `k_`
  d - number of sub parts per part; defaults to the notebook global `d_`
  """

  def __init__(self, n_nodes, k=None, d=None):
    super(Model4, self).__init__()
    # The globals are only consulted when k/d are not passed, so the class can be
    # built before (or independently of) the experiment loop that defines them.
    self.k = k_ if k is None else k
    self.d = d_ if d is None else d

    ip_n = self.d
    hmid_n = 1
    op_n = 1
    flat_n = self.k*self.d

    self.hmid_layer = torch.nn.Linear(ip_n, hmid_n)
    self.op_layer = torch.nn.Linear(flat_n, op_n)
    self.relu = torch.nn.ReLU()

    # Uniform init in [-b, b) with b = 1/sqrt(out_features) of each layer.
    layers = [self.hmid_layer, self.op_layer]
    bounds = [1./np.sqrt(layer.out_features) for layer in layers]

    # All weights are drawn first, then all biases: same np.random draw order as
    # the original hand-written init, so seeded runs are unchanged.
    for layer, bound in zip(layers, bounds):
      layer.weight.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=(layer.out_features, layer.in_features)))
    for layer, bound in zip(layers, bounds):
      layer.bias.data = torch.Tensor(
          np.random.uniform(low=-bound, high=bound, size=layer.out_features))


  def forward(self, x):
    """
    :parameters
    x - float tensor; the experiment passes shape (n_rows, k, d)

    :returns
    (attention_capture, y_pred): the softmax attention weights as a flat numpy
    array (detached from the graph) and the prediction tensor of shape (n_rows, 1)
    """
    op_hmid_layer = self.relu(self.hmid_layer(x))
    # normalise the per-part scores across dim 1 (the parts)
    op_softmax = F.softmax(op_hmid_layer, dim=1)

    attention_capture = op_softmax.detach().cpu().numpy().flatten()

    attention = (x*op_softmax).reshape(-1, self.k*self.d)
    # NOTE(review): the final ReLU clamps predictions to >= 0 — confirm this is intended.
    y_pred = self.relu(self.op_layer(attention))

    return attention_capture, y_pred
    
#model = Model4(n_nodes=2)

# Experiment

In [14]:
# Grid of experiments: for every (k, d) data shape, hidden width, learning rate and
# before/after architecture, train one model with full-batch SGD on MSE loss and
# record loss / accuracy / attention-loss curves plus the test accuracy.
# The slices ([:2], [:1]) restrict the grid to the currently selected subset.
result = []
cnt = 0
attention_capture = None

k_d_list = [(2,1), (25,2), (25,100)] # list of tuples in (k, d) format
for k_, d_ in k_d_list[:2]: ##I1
  # NOTE: k_ and d_ are deliberately module-level names; the Model classes read them.
  is_special_case = (k_==2 and d_==1)

  n_ = max(k_*d_*30, 1500)

  data, is_useful_all, mean_useful, std_dev_useful, mean_non_useful, std_dev_non_useful = generate_data_2(k=k_, d=d_, n_rows=n_)
  y = find_y(data, is_useful_all)

  # The row indices are split alongside the data so per-row metadata can be re-aligned.
  x_train, x_test, y_train, y_test, idx_train, idx_test = train_test_split(data, y, range(n_), test_size=0.33, random_state=100, stratify=y)

  x_train_tensor = torch.from_numpy(x_train).type(torch.FloatTensor)
  x_test_tensor = torch.from_numpy(x_test).type(torch.FloatTensor)
  y_train_tensor = torch.from_numpy(y_train).type(torch.FloatTensor)
  y_test_tensor = torch.from_numpy(y_test).type(torch.FloatTensor)

  # fixed: train_test_split shuffles the rows, so the useful-part flags must be
  # selected with idx_train. Taking the first len(y_train) rows (as before) compared
  # the attention weights against flags belonging to different rows.
  is_useful_train = np.asarray(is_useful_all)[idx_train].flatten()

  # Loop-invariant integer labels used for the accuracy computations.
  y_train_int = y_train_tensor.squeeze().type(torch.IntTensor).data.numpy()
  y_test_int = y_test_tensor.squeeze().type(torch.IntTensor).data.numpy()

  labels_flat = y.flatten().tolist()
  class_count = {'0': labels_flat.count(0), '1': labels_flat.count(1)}

  n_nodes_list = [2, 4, 8, 16] # number of nodes in every layer
  for n_nodes in n_nodes_list[:1]: ##I2

    lr_list = [0.1, 0.01, 0.001, 0.0001]
    for lr in lr_list[:1]: ## I3

      before_after_list = [[True, True], [True, False], [False, True], [False, False]]
      for before, after in before_after_list[:1]: #I4

        if before and after:
          model = Model1(n_nodes=n_nodes)
        elif before:
          model = Model2(n_nodes=n_nodes)
        elif after:
          model = Model3(n_nodes=n_nodes)
        else:
          model = Model4(n_nodes=n_nodes)

        loss_method = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

        # Epochs at which training could be visualised with display_plot
        # (the visualisation itself is currently disabled).
        display_steps = np.linspace(0,epochs_, num=5, dtype=int)
        display_steps = np.append(display_steps, [epochs_-1])

        # Plain lists: appending is O(1), unlike np.append which copies every time.
        loss_history = []
        acc_history = []
        attention_loss_history = []

        # go forward and backward over the network and update parameters
        for i in range(epochs_):

          attention_capture, y_train_pred = model(x_train_tensor)

          attention_loss = log_loss(is_useful_train, attention_capture)
          attention_loss_history.append(float(attention_loss))

          loss = loss_method(y_train_pred, y_train_tensor)
          loss_history.append(loss.item())

          # predictions are thresholded at 0.5 to obtain class labels
          y_train_pred_int = (y_train_pred>=0.5).squeeze().type(torch.IntTensor).data.numpy()
          accuracy = float(np.mean(y_train_pred_int == y_train_int))
          acc_history.append(accuracy)

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

        loss_array = np.array(loss_history)
        acc_array = np.array(acc_history)
        attention_loss_array = np.array(attention_loss_history)

        # Evaluation only: no gradients needed.
        with torch.no_grad():
          attention_capture_test, y_test_pred = model(x_test_tensor)
        y_test_pred_int = (y_test_pred>=0.5).squeeze().type(torch.IntTensor).data.numpy()
        test_accuracy = float(np.mean(y_test_pred_int == y_test_int))

        # One JSON-serialisable record per experiment; curves are sub-sampled every 50 epochs.
        result_each = {}
        result_each['epochs_'] = epochs_
        result_each['k_'] = k_
        result_each['d_'] = d_
        result_each['mean_useful'] = mean_useful
        result_each['std_dev_useful'] = std_dev_useful.tolist()
        result_each['mean_non_useful'] = mean_non_useful
        result_each['std_dev_non_useful'] = std_dev_non_useful.tolist()
        result_each['class_count'] = dict(class_count)
        result_each['n_nodes'] = n_nodes
        result_each['lr'] = lr
        result_each['before'] = before
        result_each['after'] = after
        result_each['loss_array'] = loss_array.tolist()[::50]
        result_each['acc_array'] = acc_array.tolist()[::50]
        result_each['attention_loss_array'] = attention_loss_array.tolist()[::50]
        result_each['test_accuracy'] = test_accuracy

        result.append(result_each)

        cnt += 1
        print("Experiment: " + str(cnt) + " completed")
      



1
2
3
4
5
6
7
8


In [16]:
# Dump the collected experiment records (the experiment cell builds `result` as a list of dicts).
# NOTE(review): the saved output below is a nested dict keyed by '(2, 1)', which does not
# match that list — it appears to come from an earlier version of the experiment cell.
print(result)

{'(2, 1)': {'2': {'0.1': {'(True, True)': {'loss_array': [0.18500478565692902, 0.09176315367221832, 0.04538392275571823, 0.022300517186522484, 0.010885084047913551, 0.005284485407173634, 0.0025567386765033007, 0.0012360999826341867, 0.0005996367544867098, 0.0002939421101473272, 0.00014747315435670316, 7.741533045191318e-05, 4.3944492063019425e-05, 2.796554508677218e-05, 2.0339506590971723e-05, 1.6699277693987824e-05, 1.4959948202886153e-05, 1.4126910173217766e-05, 1.3725873031944502e-05, 1.3530503565561958e-05, 1.3433284948405344e-05, 1.3382699762587436e-05, 1.335461547569139e-05, 1.3336941265151836e-05, 1.3324347492016386e-05, 1.331423845840618e-05, 1.330523537035333e-05, 1.3296848919708282e-05, 1.3288566151459236e-05, 1.3280599887366407e-05, 1.3272586329549085e-05, 1.326459641859401e-05, 1.3256710190034937e-05, 1.3248701179691125e-05, 1.3240814951132052e-05, 1.3232995115686208e-05, 1.322505522693973e-05, 1.3217215382610448e-05, 1.3209310054662637e-05, 1.3201402907725424e-05, 1.319353

# Result as JSON

In [0]:
import json

# Persist the experiment records so they can be analysed outside this notebook.
with open('result2.json', 'w') as fp:
    json.dump(result, fp)

# Offer the file as a browser download when running on Google Colab; elsewhere the
# google.colab package does not exist and the file simply stays on local disk.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('result2.json')