In [0]:
import numpy as np
import json
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so the np.random draws made by the cells below are repeatable across runs.
np.random.seed(100)

In [0]:
def generate_data(k=2, d=1, k_useful=1, n_rows=1500, print_param=False):
  """
  Generate synthetic data for attention models, where only some parts of
  each data point carry useful information.

  Every data point consists of `k` parts with `d` values each. For each row,
  `k_useful` distinct parts are picked at random and marked as useful. A part
  is sampled as randn(d) @ std_dev + mean, using mean 0 / identity scale for
  useful parts and mean 3 / scale 2 for the remaining parts.

  :parameters
  k - number of parts in each data point
  d - number of sub parts in each 'k' part
  k_useful - number of parts with useful information [k_useful <= k]
  n_rows - number of rows in the data set
  print_param - boolean flag controlling whether the parameters are printed

  :returns
  data - numpy array of shape (n_rows, k, d) (for n_rows, k, d > 0), rounded to 2 decimals
  is_useful_all - list of n_rows lists holding k ints each (1 = useful part, 0 = not)
  mean_useful, std_dev_useful - list of d floats and d x d array used for useful parts
  mean_non_useful, std_dev_non_useful - list of d floats and d x d array used for the other parts
  """
  mean_useful = [0.]*d
  std_dev_useful = np.eye(d)
  mean_non_useful = [3.]*d
  std_dev_non_useful = 2.*np.eye(d)

  data = []
  is_useful_all = []

  for _ in range(n_rows):

    # each element of 'is_useful' shows whether the corresponding part is useful or not;
    # plain Python ints are stored so the flags stay JSON-serialisable
    useful_idx = np.random.choice(range(k), size=k_useful, replace=False)
    is_useful = [0]*k
    for i in useful_idx:
      is_useful[i] = 1
    is_useful_all.append(is_useful)

    data_point = []
    for i in range(k):
      # sample with the function's own `d` (not a notebook-level global) so the
      # function works on a fresh kernel and for any caller
      if is_useful[i] == 1:
        data_part = np.matmul(np.random.randn(d), std_dev_useful) + mean_useful
      else:
        data_part = np.matmul(np.random.randn(d), std_dev_non_useful) + mean_non_useful
      data_point.append(list(data_part))

    data.append(data_point)

  if print_param:
    print("number of parts: ", k)
    print("useful parts: ", is_useful_all[:5])
    print("mean_useful: ", mean_useful)
    print("std_dev_useful: ", std_dev_useful)
    print("mean_non_useful: ", mean_non_useful)
    print("std_dev_non_useful: ", std_dev_non_useful)
    print("-"*50)

  data = np.array(data)
  data = data.round(decimals=2)

  return data, is_useful_all, mean_useful, std_dev_useful, mean_non_useful, std_dev_non_useful

In [0]:
def find_y(x, is_useful_all):
  """
  Derive a binary class label for every row from its useful parts only.

  For each row the parts flagged with 1 are added together, the resulting
  vector is projected onto a random weight vector drawn uniformly from
  [-1, 1), and the label is 1 when that projection is positive, else 0.

  :parameters
  x - data features; indexed as x[row][part] and x.shape[2] is the size of one part
  is_useful_all - one sequence of 0/1 flags per row, one flag per part

  :returns
  y - int array with one row per data point and a single column (the class)
  w - the random weight vector of shape (d, 1) used for the projection
  """
  d = x.shape[2]

  summed_rows = []
  for sample, flags in zip(x, is_useful_all):
    # accumulate the flagged parts in index order
    acc = np.zeros(d)
    selected = [pos for pos, flag in enumerate(flags) if flag == 1]
    for pos in selected:
      acc += sample[pos]
    summed_rows.append(acc)

  x_useful = np.array(summed_rows)
  w = np.random.uniform(low=-1, high=1, size=(d, 1))

  projection = np.matmul(x_useful, w)
  y = (projection > 0).astype(int)

  return y, w

In [0]:
all_data = []

k_d_list = [(2,1), (9,2), (9,32)] # list of tuples in (k, d) format

# one data set is generated per (k, d) pair; cnt numbers them from 1 and is printed as a progress marker
for cnt, (k_, d_) in enumerate(k_d_list, start=1):

  # at least 1500 rows, or three rows per value when k*d is large
  n_ = max(3*k_*d_, 1500)

  (data, is_useful_all,
   mean_useful, std_dev_useful,
   mean_non_useful, std_dev_non_useful) = generate_data(k=k_, d=d_, n_rows=n_)
  y, w = find_y(data, is_useful_all)

  # the row indices are split together with the data so every sample can be
  # traced back to its entry in is_useful_all
  x_train, x_test, y_train, y_test, idx_train, idx_test = train_test_split(
      data, y, range(n_), test_size=0.33, random_state=100, stratify=y)

  # number of rows per class, keyed by the class label as a string
  class_count = {str(label): y.flatten().tolist().count(label) for label in (0, 1)}

  # numpy arrays are converted to nested lists so the record can be written as JSON
  each_data = {
      'mean_useful': mean_useful,
      'std_dev_useful': std_dev_useful.tolist(),
      'mean_non_useful': mean_non_useful,
      'std_dev_non_useful': std_dev_non_useful.tolist(),
      'x_train': x_train.tolist(),
      'x_test': x_test.tolist(),
      'y_train': y_train.tolist(),
      'y_test': y_test.tolist(),
      'w': w.tolist(),
      'class_count': class_count,
      'is_useful_all': is_useful_all,
      'idx_train': idx_train,
      'idx_test': idx_test,
  }
  all_data.append(each_data)

  print(cnt)

In [0]:
# write every generated data set into a single JSON file in the working directory
with open('data.json', mode='w') as json_file:
    json.dump(obj=all_data, fp=json_file)

In [0]:
# Offer data.json as a browser download when the notebook runs inside Google Colab.
# The google.colab package only exists in the Colab runtime, so the import is
# guarded: elsewhere the step is skipped instead of failing the whole run.
try:
  from google.colab import files
except ImportError:
  print("google.colab is not available; skipping the download of data.json")
else:
  files.download('data.json')