## Imports
Run this script in the py_36 virtual environment.
Check we indeed have python 3.6:

In [1]:
import sys
# Print the interpreter version string so it can be compared with the expected build.
# should be: 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]
print(sys.version)

3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]


Package imports and version checks:

In [None]:
# AWS Sagemaker changes:
! pip install tensorflow==1.4.0
! pip install pandas==0.23.4
! pip install scipy==1.1.0
! pip install matplotlib==3.0.0
! pip install seaborn==0.9.0
! pip install scikit-learn==0.20.0
! pip install numpy==1.15.2

In [10]:
# Import the third-party packages and print each installed version next to the
# version this notebook expects, so mismatches are visible in the cell output.
import numpy as np
print("Numpy version should be 1.15.2:", np.__version__)
# print floats without scientific notation, rounded to 3 decimals
np.set_printoptions(suppress=True, precision=3)
import tensorflow as tf
print("Tensorflow version should be 1.4.0:", tf.__version__)
import scipy
print("Scipy version should be 1.1.0:", scipy.__version__)
import sklearn
print("Sklearn version should be 0.20.0:", sklearn.__version__) # 0.20.1 okay too
import matplotlib
print("Matplotlib version should be 3.0.0:", matplotlib.__version__)
import pandas as pd
print("Pandas version should be 0.23.4:", pd.__version__)
import seaborn as sns; sns.set(style="ticks", color_codes=True)
print("Seaborn version should be 0.9.0:", sns.__version__)

Numpy version should be 1.15.2: 1.15.2
Tensorflow version should be 1.4.0: 1.4.0
Scipy version should be 1.1.0: 1.1.0
Sklearn version should be 0.20.0: 0.20.1
Matplotlib version should be 3.0.0: 3.0.0
Pandas version should be 0.23.4: 0.23.4
Seaborn version should be 0.9.0: 0.9.0


In [11]:
# Remaining imports (standard library, scientific stack, project-local utils.*) and global options.
from scipy.stats import bernoulli
import json
import os
import random
from random import sample
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
from sklearn.metrics import mean_squared_error
# NOTE(review): mutual_info_regression is already imported a few lines above;
# this line additionally brings in mutual_info_classif.
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
# NOTE(review): 'agg' is presumably matplotlib's non-interactive backend, so figures
# would not render inline in the notebook - confirm this is intended.
plt.switch_backend('agg')
# short alias for the TF 1.x contrib distributions module (tf is imported in the previous cell)
tfd = tf.contrib.distributions
# project-local helper modules
import utils.process as process
import utils.params as params
import utils.data_prep as data_prep
import utils.active_learning as active_learning
from types import SimpleNamespace
import time
pd.set_option('display.max_columns', None) # to see all cols when viewing db
# show the working directory: the "../..." paths used in later cells are relative to it
print(os.getcwd())

C:\Users\jahutter\Documents\Programming\Thesis code\VAEM_main


In [9]:
# Log to which devices operations & tensors are assigned:
#tf.debugging.set_log_device_placement(True)
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# NOTE(review): `sess` is not referenced again in the cells visible here and is never closed;
# confirm whether creating this session is still needed.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # to log which device is used for model training

In [12]:
# Print the devices reported by TensorFlow's device_lib (the recorded output lists a single CPU device).
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17506357567851034394
]


## Selecting & loading real dataset

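***NOTE:*** the target column must be the last entry of the column list it belongs to (the columns are reordered so that categorical columns come before numerical ones). The cell below currently appends the target to `categorical_cols`; the variant that appends it to `integer_cols` is commented out.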


In [13]:
# ---- Dataset selection ----
random_seed = 42
# Other dataset names that have been used here:
# "Adult", "Sachs", "Asia", "Sachs2", "Cancer2", "Cancer", "Bank", "Intrusion"
dataset_name = "Census"
fake_file_root = "../Fake_Datasets/"
# Adult/Census/Bank live under Real_Datasets, every other dataset under BN_Datasets.
real_file_root = "../Real_Datasets/" if dataset_name in ["Adult", "Census", "Bank"] else "../BN_Datasets/"

# Common path prefix "<root>/<name>/<name>" shared by the metadata and data files.
# (alternative flat layout: real_file_root + dataset_name)
real_path = real_file_root + dataset_name + "/" + dataset_name

# ---- Column metadata ----
with open(real_path + "_meta.json") as json_file:
    meta = json.load(json_file)

target = meta["target"]

# The target is handled as a categorical column and placed last in that list.
# (It used to be appended to integer_cols instead, which treated it as numeric.)
categorical_cols = meta["categorical_cols"]
categorical_cols.append(target)
print("categorical_cols:", categorical_cols)

integer_cols = meta["numerical_cols"]
print("integer_cols:", integer_cols)

# discrete_cols = subset of integer_cols that take integer values (month, day, ...) or are
# continuous but discretized (salary, ...). Normally meta["discrete_cols"]; temporarily
# overwritten with an empty list to work around a slice index bug.
discrete_cols = []
print("discrete_cols:", discrete_cols)

# ---- Data ----
# (alternative input: real_path + "_train_processed.csv")
real_path += ".csv"
real_data = pd.read_csv(real_path)

categorical_cols: ['work_class', 'industry', 'occupation', 'education', 'enroll_in_edu', 'marital_status', 'major_industry', 'major_occupation', 'race', 'hispanic_origin', 'gender', 'union_member', 'unemployment_reason', 'employment_status', 'tax_filer_status', 'reg_prev_residence', 'state_prev_residence', 'household_status', 'household_summary', 'migr_code_msa', 'migr_code_reg', 'migr_code_in_reg', 'migr_same', 'migr_sun', 'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship', 'own_business', 'vet_qva', 'vet_ben', 'year', 'tot_pers_income']
integer_cols: ['age', 'wage', 'capital_gains', 'capital_loss', 'stock_dividends', 'n_pers_employer', 'weeks_work']
discrete_cols: []


In [14]:
# Sanity check of the loaded frame: print (rows, columns) and display the first rows.
print(real_data.shape)
real_data.head()

(50000, 41)


Unnamed: 0,age,work_class,industry,occupation,education,wage,enroll_in_edu,marital_status,major_industry,major_occupation,race,hispanic_origin,gender,union_member,unemployment_reason,employment_status,capital_gains,capital_loss,stock_dividends,tax_filer_status,reg_prev_residence,state_prev_residence,household_status,household_summary,migr_code_msa,migr_code_reg,migr_code_in_reg,migr_same,migr_sun,n_pers_employer,fam_under_18,country_father,country_mother,country_self,citizenship,own_business,vet_qva,vet_ben,weeks_work,year,tot_pers_income
0,41,4,33,26,12,1500,2,2,19,0,4,0,0,0,3,1,0,0,0,2,3,36,33,7,0,0,0,1,0,1,4,40,40,40,4,0,1,2,51,95,0
1,15,3,0,0,6,0,2,4,14,6,4,0,1,1,3,0,0,0,0,4,3,36,8,2,7,6,7,2,2,0,0,40,40,40,4,0,1,2,0,94,0
2,50,6,4,2,14,0,2,2,4,2,4,0,1,1,3,1,0,0,0,2,3,36,18,4,0,0,0,1,0,1,4,40,40,40,4,0,1,2,52,95,1
3,3,3,0,0,10,0,2,4,14,6,4,0,0,1,3,0,0,0,0,4,3,36,8,2,7,6,7,2,2,0,2,40,40,40,4,0,1,0,0,94,0
4,14,3,0,0,10,0,2,4,14,6,4,6,0,1,3,0,0,0,0,4,3,36,8,2,0,0,0,1,0,0,1,40,40,40,4,0,1,0,0,95,0


### Only for Census data:
Remove some cols of Census data to avoid OOM error

In [4]:
# Report the cardinality of every categorical column (high-cardinality columns are
# the candidates for removal in this section).
for col_name in categorical_cols:
    n_unique = len(real_data[col_name].unique())
    print("Column", col_name, "has", n_unique, "unique categories")

Column work_class has 9 unique categories
Column industry has 51 unique categories
Column occupation has 47 unique categories
Column education has 17 unique categories
Column enroll_in_edu has 3 unique categories
Column marital_status has 7 unique categories
Column major_industry has 24 unique categories
Column major_occupation has 15 unique categories
Column race has 5 unique categories
Column hispanic_origin has 10 unique categories
Column gender has 2 unique categories
Column union_member has 3 unique categories
Column unemployment_reason has 6 unique categories
Column employment_status has 8 unique categories
Column tax_filer_status has 6 unique categories
Column reg_prev_residence has 6 unique categories
Column state_prev_residence has 51 unique categories
Column household_status has 34 unique categories
Column household_summary has 8 unique categories
Column migr_code_msa has 10 unique categories
Column migr_code_reg has 9 unique categories
Column migr_code_in_reg has 10 unique c

In [5]:
# Display the current (rows, columns) of real_data.
real_data.shape

(50000, 41)

In [4]:
# Remove some cols for Census data (too large to process).
# Smaller alternative that was tried:
#   ["state_prev_residence", "reg_prev_residence", "industry", "vet_qva", "vet_ben"]
del_cols = ["country_father", "country_mother","country_self", "state_prev_residence", "reg_prev_residence", "industry", "vet_qva", "vet_ben", "migr_code_msa", "migr_code_reg","migr_code_in_reg", "household_status"]

# Only touch the data when the Census dataset is selected; for any other dataset these
# columns do not exist and the column lists must stay as loaded from the metadata.
if dataset_name == "Census":
    # drop() returns a new frame and errors="ignore" skips columns that are already gone,
    # so re-running this cell no longer raises a KeyError.
    real_data = real_data.drop(columns=del_cols, errors="ignore")
    # Filter the metadata-derived list instead of a hard-coded copy of it; filtering is
    # idempotent, so the result is the same on every run.
    new_cat_cols = [c for c in categorical_cols if c not in del_cols]
    print(new_cat_cols)
    categorical_cols = new_cat_cols

['work_class', 'occupation', 'education', 'enroll_in_edu', 'marital_status', 'major_industry', 'major_occupation', 'race', 'hispanic_origin', 'gender', 'union_member', 'unemployment_reason', 'employment_status', 'tax_filer_status', 'household_summary', 'migr_same', 'migr_sun', 'fam_under_18', 'citizenship', 'own_business', 'year', 'tot_pers_income']


In [29]:
# Print the current number of columns and save a copy of the dataset whose file name
# records that column count. (The former bare `real_data.head()` / `real_data.columns`
# statements were removed: only the last expression of a cell is displayed, so they had no effect.)
n_col = real_data.shape[1]
print(n_col)
# Alternative flat layout: real_file_root + dataset_name + "_" + str(n_col) + "col.csv"
# NOTE(review): the suffix is spelled "coll.csv" here but "col.csv" in the alternative above;
# kept unchanged because other code may read this exact file name - confirm before renaming.
real_data.to_csv(real_file_root + dataset_name+"/"+dataset_name+"_"+str(n_col)+"coll.csv", index=False)

29


In [6]:
# Number of columns listed as categorical or integer, for comparison with real_data.shape[1].
len(categorical_cols)+len(integer_cols)

29

In [18]:
# Load the previously saved "<name>_train_processed.csv" and display its number of columns.
# NOTE(review): the file is written by the "process real train & test data" cell further down,
# so it must already exist from an earlier run for this cell to work.
df = pd.read_csv(real_file_root+dataset_name+"/"+dataset_name+"_train_processed.csv")
len(df.columns)

33

## Preparing data for VAE:

In [14]:
# Run the project's data preparation on the real data. The call returns 13 objects
# (train/test matrices, masks and variable-type bookkeeping) that the training,
# sampling and post-processing cells below consume.
(
    Data_train_compressed,
    Data_train_decompressed,
    Data_train_noisy_decompressed,
    mask_train_decompressed,
    Data_test_decompressed,
    mask_test_compressed,
    mask_test_decompressed,
    cat_dims,
    DIM_FLT,
    dic_var_type,
    records_d,
    list_discrete,
    list_discrete_compressed,
) = data_prep.data_prep(
    real_data,
    categorical_cols,
    integer_cols,
    discrete_cols,
    target,
    random_seed,
)

In [17]:
# Display the shape of the decompressed training matrix returned by data_prep.
Data_train_decompressed.shape

(40000, 507)

Checks to look at the created variables:

In [7]:
# Debug output: print the column lists and the bookkeeping variables returned by data_prep
# so they can be checked by eye against each other.
print("records_d:", records_d)
#print("db cols:", real_data.columns)
print("categorical cols", categorical_cols)
print("integer cols:", integer_cols)
print("discrete cols:", discrete_cols)
print("cat_dims", cat_dims, ", sum:", sum(cat_dims))
print("list_discrete_compressed:", list_discrete_compressed)
print("list_discrete", list_discrete)
# Reminder of the manual consistency rule to verify in the values printed above.
print("TO CHECK: list discrete =  sum(cat_dims) + position of discrete vars in integer cols - 1 (correction for index start at 0)")

records_d: []
categorical cols ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
integer cols: ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
discrete cols: []
cat_dims [ 9 16  7 15  6  5  2 42  2] , sum: 104
list_discrete_compressed: []
list_discrete []
TO CHECK: list discrete =  sum(cat_dims) + position of discrete vars in integer cols - 1 (correction for index start at 0)


### Load hyperparameters:

In [15]:
# Hyperparameters, exposed as attributes (e.g. args.epochs) via SimpleNamespace.
# The VAEM notebook used 3000 epochs.
# NOTE(review): an earlier remark says batch_size was decreased from 100 to 50 for Census,
# but the value set here is 100 - confirm which one is intended.
# NOTE(review): data_name is fixed to "bank" regardless of dataset_name - confirm whether
# the training code depends on it.
args = SimpleNamespace(
    epochs=1,
    latent_dim=20,
    p=0.99,
    iteration=10,
    batch_size=100,
    K=100,
    M=10,
    repeat=1,
    data_name="bank",
    output_dir="./saved_weights/"+dataset_name+"/",
    data_dir="./"+real_file_root+dataset_name+"/",
    list_strategy=[],
    list_stage=[1,2],
)

## Training VAEM & generating synthetic data

In [16]:
# Train VAEM and sample a synthetic dataset `num_exp` times. Every run writes one fake-data
# CSV; the wall-clock time of each run (training + sampling + saving) is collected in
# `times` and written to a CSV after the loop.
num_exp = 1
times = []
print("Running ", num_exp, "experiment(s) for ", dataset_name, "with ", str(args.epochs), " epochs")
for i in range(1, num_exp + 1):
    print("PERFORMING EXPERIMENT", i)

    print("START TRAINING VAEM MODEL")
    start_time = time.time()
    vae = active_learning.p_vae_active_learning(Data_train_compressed, Data_train_noisy_decompressed, mask_train_decompressed, Data_test_decompressed, mask_test_compressed, mask_test_decompressed, cat_dims, DIM_FLT, dic_var_type, args, records_d, list_discrete)
    print("FINISHED TRAINING VAEM MODEL")

    print("SAMPLING FAKE DATA FROM MODEL")
    tf.reset_default_graph()
    # mask_train_decompressed*0 = matrix of all 0s == everything is unobserved i.e. will generate data
    # Data_fake_noisy = decoded/reconstructed/generated data mtx x
    # NOTE(review): the whole training matrix is passed in a single call; an out-of-memory
    # error has been observed at this step for Census with all columns.
    Data_fake_noisy,z_posterior,x_recon_cat_p = vae.get_imputation( Data_train_noisy_decompressed, mask_train_decompressed*0,cat_dims,dic_var_type)

    # Process real train data for comparison with fake (compress back to 1 col per categorical var):
    x_real = process.compress_data(Data_train_decompressed,cat_dims, dic_var_type)
    cols = np.concatenate((categorical_cols, integer_cols))

    # invert_noise = round values in discrete cols to nearest possible value:
    # NOTE(review): invert_noise may already be applied inside get_imputation - confirm it is
    # not applied twice.
    Data_fake = process.invert_noise(Data_fake_noisy,list_discrete_compressed,records_d)

    # Normalize fake data to [0,1] using the per-column min & max of the REAL data.
    # A column that is constant in the real data has max == min; dividing by that zero
    # range would produce NaN/inf, so such columns are divided by 1 instead.
    # (assumes x_real is a 2-D numpy array - TODO confirm)
    real_min = x_real.min(axis=0)
    real_range = x_real.max(axis=0) - real_min
    real_range = np.where(real_range == 0, 1, real_range)
    Data_std = (Data_fake - real_min) / real_range
    #Data_fake = Data_std * (max_Data - min_Data) + min_Data
    Data_fake = Data_std # ignore additional scaling

    Data_fake = pd.DataFrame(data=Data_fake, index=None, columns=cols)
    # round target column back to binary values (only needed when the target was treated as float):
    #Data_fake[target]=Data_fake[target].round()
    # save fake data to csv:
    Data_fake.to_csv(fake_file_root+dataset_name+"/"+dataset_name+"_fake_vaem_"+str(args.epochs)+"epochs_"+str(i)+".csv", index=False)

    end_time = time.time()
    print("FINISHED SAMPLING FAKE DATA AFTER", end_time-start_time, "SECONDS")
    times.append(end_time-start_time)
# One row per experiment with its total run time in seconds.
pd.DataFrame(times, columns=["Comp.times"]).to_csv("../Evaluation/Comp_time/times_"+dataset_name+"_vaem_"+str(args.epochs)+"epochs.csv", index=False)

Running  1 experiment(s) for  Census with  1  epochs
PERFORMING EXPERIMENT 1
START TRAINING VAEM MODEL
Calling process.inverse_noise_tf
Data_invert: Tensor("is/generator/mul_5:0", shape=(?, 506), dtype=float32)
Calling process.inverse_noise_tf
Data_invert: Tensor("is/generator/mul_13:0", shape=(?, 506), dtype=float32)
partial_vaem.__init__() called with vars: cat_dims= [ 9 51 47 17  3  7 24 15  5 10  2  3  6  8  6  6 51 34  8 10  9 10  3  4
  5 43 43 43  5  3  3  3  2  2]
Epoch: 0 	negative training ELBO per observed feature: 2.10, Cat_term: 0.15, Flt_term: 1.93,z_term: 0.00
Iterating over var_dict in save_generator:
key is/generator/auto_std_local
value <tf.Variable 'is/generator/auto_std_local:0' shape=(1, 7) dtype=float32_ref>
key is/generator/fc-latent-local-cat-020/weights
value <tf.Variable 'is/generator/fc-latent-local-cat-020/weights:0' shape=(9, 9) dtype=float32_ref>
key is/generator/fc-latent-local-cat-020/biases
value <tf.Variable 'is/generator/fc-latent-local-cat-020/biases

Iterating over var_dict in save_encoder:
key is/sampling_local/encoder_local/fc-latent-local-flt-010/weights
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-010/weights:0' shape=(1, 50) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-010/biases
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-010/biases:0' shape=(50,) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-020/weights
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-020/weights:0' shape=(50, 2) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-020/biases
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-020/biases:0' shape=(2,) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-011/weights
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-011/weights:0' shape=(1, 50) dtype=float32_ref>
key is/sampling_local/encode

Calling process.inverse_noise_tf
Data_invert: Tensor("is_1/generator/mul_5:0", shape=(?, 506), dtype=float32)
Calling process.inverse_noise_tf
Data_invert: Tensor("is_1/generator/mul_13:0", shape=(?, 506), dtype=float32)
INFO:tensorflow:Restoring parameters from ./saved_weights/Census/encoder.tensorflow
INFO:tensorflow:Restoring parameters from ./saved_weights/Census/generator.tensorflow
partial_vaem.__init__() called with vars: cat_dims= [ 9 51 47 17  3  7 24 15  5 10  2  3  6  8  6  6 51 34  8 10  9 10  3  4
  5 43 43 43  5  3  3  3  2  2]
Epoch: 0 	negative training ELBO per observed feature: 29439.60, Cat_term: 0.15, Flt_term: 1.98,z_term: 357.57
Iterating over var_dict in save_generator:
key is/generator/auto_std_local
value <tf.Variable 'is/generator/auto_std_local:0' shape=(1, 7) dtype=float32_ref>
key is/generator/fc-latent-local-cat-020/weights
value <tf.Variable 'is/generator/fc-latent-local-cat-020/weights:0' shape=(9, 9) dtype=float32_ref>
key is/generator/fc-latent-local-c

Iterating over var_dict in save_encoder:
key is/sampling_local/encoder_local/fc-latent-local-flt-010/weights
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-010/weights:0' shape=(1, 50) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-010/biases
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-010/biases:0' shape=(50,) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-020/weights
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-020/weights:0' shape=(50, 2) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-020/biases
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-020/biases:0' shape=(2,) dtype=float32_ref>
key is/sampling_local/encoder_local/fc-latent-local-flt-011/weights
value <tf.Variable 'is/sampling_local/encoder_local/fc-latent-local-flt-011/weights:0' shape=(1, 50) dtype=float32_ref>
key is/sampling_local/encode

FINISHED TRAINING VAEM MODEL
SAMPLING FAKE DATA FROM MODEL


ResourceExhaustedError: OOM when allocating tensor with shape[40560000,100]
	 [[Node: is_1/sampling_global/encoder_global/fully_connected/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](is_1/sampling_global/encoder_global/concat, is/sampling_global/encoder_global/fully_connected/weights/read)]]

Caused by op 'is_1/sampling_global/encoder_global/fully_connected/MatMul', defined at:
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
    app.start()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\asyncio\base_events.py", line 442, in run_forever
    self._run_once()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\asyncio\base_events.py", line 1462, in _run_once
    handle._run()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\ioloop.py", line 741, in _run_callback
    ret = callback()
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel\kernelbase.py", line 545, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tornado\gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\IPython\core\interactiveshell.py", line 2867, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\IPython\core\interactiveshell.py", line 2895, in _run_cell
    return runner(coro)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\IPython\core\interactiveshell.py", line 3072, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\IPython\core\interactiveshell.py", line 3263, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-f504613fe088>", line 9, in <module>
    vae = active_learning.p_vae_active_learning(Data_train_compressed, Data_train_noisy_decompressed, mask_train_decompressed, Data_test_decompressed, mask_test_compressed, mask_test_decompressed, cat_dims, DIM_FLT, dic_var_type, args, records_d, list_discrete)
  File "C:\Users\jahutter\Documents\Programming\Thesis code\VAEM_main\utils\active_learning.py", line 61, in p_vae_active_learning
    vae = train_p_vae(stage, Data_train, Data_train,mask_train, epochs, latent_dim,cat_dims,dim_flt,batch_size, p, K,iteration, records_d, list_discrete, args)
  File "C:\Users\jahutter\Documents\Programming\Thesis code\VAEM_main\utils\active_learning.py", line 241, in train_p_vae
    vae = model.partial_vaem(**kwargs)
  File "C:\Users\jahutter\Documents\Programming\Thesis code\VAEM_main\models\model.py", line 55, in __init__
    self._build_graph()
  File "C:\Users\jahutter\Documents\Programming\Thesis code\VAEM_main\models\model.py", line 104, in _build_graph
    self.encoded_global = self._encode._partial_encoder_global(self.z_local,self.x,self.mask)
  File "C:\Users\jahutter\Documents\Programming\Thesis code\VAEM_main\models\encoders.py", line 87, in _partial_encoder_global
    encoded = layers.fully_connected(x_aug, self._K)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\contrib\framework\python\ops\arg_scope.py", line 181, in func_with_args
    return func(*args, **current_args)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\contrib\layers\python\layers\layers.py", line 1639, in fully_connected
    outputs = layer.apply(inputs)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\layers\base.py", line 671, in apply
    return self.__call__(inputs, *args, **kwargs)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\layers\base.py", line 575, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\layers\core.py", line 162, in call
    outputs = standard_ops.matmul(inputs, self.kernel)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1891, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2436, in _mat_mul
    name=name)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op
    op_def=op_def)
  File "C:\Users\jahutter\AppData\Local\Continuum\anaconda2\envs\py_36\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[40560000,100]
	 [[Node: is_1/sampling_global/encoder_global/fully_connected/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:CPU:0"](is_1/sampling_global/encoder_global/concat, is/sampling_global/encoder_global/fully_connected/weights/read)]]


***Alternatively, run training and sampling of the fake dataset as separate steps (useful for debugging):***

In [57]:
# Training step on its own (no sampling), timed with wall-clock seconds.
print("START TRAINING VAEM MODEL")
start_time = time.time()
vae = active_learning.p_vae_active_learning(
    Data_train_compressed,
    Data_train_noisy_decompressed,
    mask_train_decompressed,
    Data_test_decompressed,
    mask_test_compressed,
    mask_test_decompressed,
    cat_dims,
    DIM_FLT,
    dic_var_type,
    args,
    records_d,
    list_discrete,
)
end_time = time.time()
print("FINISHED TRAINING VAEM MODEL AFTER", end_time - start_time, "SECONDS")

START TRAINING VAEM MODEL
BATCH SIZE: Tensor("is/strided_slice:0", shape=(), dtype=int32)
Calling process.inverse_noise_tf
Tensor("is/generator/mul_5:0", shape=(?, 108), dtype=float32)


ValueError: slice index 108 of dimension 1 out of bounds. for 'is/generator/strided_slice_20' (op: 'StridedSlice') with input shapes: [?,108], [2], [2], [2] and with computed input tensors: input[1] = <0 108>, input[2] = <0 109>, input[3] = <1 1>.

In [33]:
# Sampling step on its own: requires a trained `vae` from the training cell.
# `i` is the experiment index used in the output file name.
i=3
print("SAMPLING FAKE DATA FROM MODEL")
start_time = time.time()

tf.reset_default_graph()
# mask_train_decompressed*0 = matrix of all 0s == everything is unobserved i.e. will generate data
# Data_fake_noisy = decoded/reconstructed/generated data mtx x
Data_fake_noisy,z_posterior,x_recon_cat_p = vae.get_imputation( Data_train_noisy_decompressed, mask_train_decompressed*0,cat_dims,dic_var_type)

# Process real train data for comparison with fake (compress back to 1 col per categorical var):
x_real = process.compress_data(Data_train_decompressed,cat_dims, dic_var_type)
cols = np.concatenate((categorical_cols, integer_cols))

# invert_noise = round values in discrete cols to nearest possible value:
# NOTE(review): invert_noise may already be applied inside get_imputation - confirm it is
# not applied twice.
Data_fake = process.invert_noise(Data_fake_noisy,list_discrete_compressed,records_d)

# Normalize fake data to [0,1] using the per-column min & max of the REAL data.
# A column that is constant in the real data has max == min; dividing by that zero
# range would produce NaN/inf, so such columns are divided by 1 instead.
# (assumes x_real is a 2-D numpy array - TODO confirm)
real_min = x_real.min(axis=0)
real_range = x_real.max(axis=0) - real_min
real_range = np.where(real_range == 0, 1, real_range)
Data_std = (Data_fake - real_min) / real_range
#Data_fake = Data_std * (max_Data - min_Data) + min_Data
Data_fake = Data_std # ignore additional scaling

Data_fake = pd.DataFrame(data=Data_fake, index=None, columns=cols)
#Data_fake[target]=Data_fake[target].round() # when target was treated as float -> needed to round to get binary target
# save fake data to csv:
Data_fake.to_csv(fake_file_root+dataset_name+"/"+dataset_name+"_fake_vaem_"+str(args.epochs)+"epochs_"+str(i)+".csv", index=False)

end_time = time.time()
print("FINISHED SAMPLING FAKE DATA AFTER", end_time-start_time, "SECONDS")

SAMPLING FAKE DATA FROM MODEL
1.0
1.0
1.0
1.0
1.0
1.0
FINISHED SAMPLING FAKE DATA AFTER 22.06118369102478 SECONDS


In [18]:
# Display the maximum of the generated "y" column.
# NOTE(review): "y" is not among the Census column names listed earlier in this notebook, so
# this cell presumably belongs to a run with a different dataset - confirm before re-running.
Data_fake["y"].max() # range is 0.33 - 0.41 for 1 epoch; (0.05 - 0.088)&(0.06 - 0.1) for 30 epochs; 0.1-0.29 for 300 epochs

1.0

### Only need to process real train & test data once per dataset:

In [9]:
print("PROCESSING REAL TRAIN DATA")
# Process real train data for comparison with fake (compress back to 1 col per categorical var):
x_real = process.compress_data(Data_train_decompressed,cat_dims, dic_var_type)
cols = np.concatenate((categorical_cols, integer_cols))

# Per-column min and range of the real TRAIN data; reused for the test data below so both
# splits are scaled with the same statistics.
# A constant column has max == min; dividing by that zero range would produce NaN/inf,
# so such columns are divided by 1 instead (they then map to 0).
# (assumes x_real is a 2-D numpy array - TODO confirm)
real_min = x_real.min(axis=0)
real_range = x_real.max(axis=0) - real_min
real_range = np.where(real_range == 0, 1, real_range)

# Normalize [0,1] real data:
Data_std = (x_real - real_min) / real_range
#scaling_factor = (x_real.max(axis=0) - x_real.min(axis=0))/(max_Data - min_Data)
#Data_real = Data_std * (max_Data - min_Data) + min_Data
Data_real = Data_std
Data_real = pd.DataFrame(data=Data_real, index=None, columns=cols)
# save real (processed) data to csv:
Data_real.to_csv(real_file_root+dataset_name+"/"+dataset_name+"_train_processed.csv", index=False)

print("PROCESSING REAL TEST DATA")
test_data = process.compress_data(Data_test_decompressed, cat_dims, dic_var_type)
# Test values outside the train min/max fall outside [0,1]; that is inherent to scaling
# with train statistics and is left as is.
test_std = (test_data - real_min) / real_range
Data_test = pd.DataFrame(data=test_std, index=None, columns=cols)
# save real (processed) test data to csv:
Data_test.to_csv(real_file_root+dataset_name+"/"+dataset_name+"_test_processed.csv", index=False)

PROCESSING REAL TRAIN DATA
PROCESSING REAL TEST DATA
