In [None]:
# Dependencies
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from PIL import Image
from sklearn.metrics import confusion_matrix

from keras.utils import to_categorical
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint

from functions.ontram import ontram
from functions.fit_ontram import fit_ontram
from functions.methods import predict

from functions.load_UTKFace_data import load_UTKFace_data 
from functions.plot_results import plot_results

os.environ['CUDA_VISIBLE_DEVICES'] = '0' # which GPU is visible
%matplotlib inline

In [None]:
# Absolute, machine-specific locations: adjust both when running elsewhere.
# DATA_DIR holds the UTKFace.h5 file read below; every model variant writes its
# per-run folders (history, best-model predictions, ...) underneath OUTPUT_DIR.
DATA_DIR = '/tf/notebooks/data/UTKFace/UTKFace/'
OUTPUT_DIR = '/tf/notebooks/hezo/ordinal_regression/callbacks/UTKFace_ontram/'

### Load data

In [None]:
# Load the pre-split data: one image array (X_*) and one table per split.
# The tables are used below for the columns age_group, gender, race and x_1..x_10.
X_train, train, X_valid, valid, X_test, test = load_UTKFace_data(DATA_DIR + 'UTKFace.h5')

In [None]:
# Distribution of the age-group labels in the training split
# (7 bins; presumably one per age group -- confirm against the data).
plt.hist(train.age_group, bins = 7)

In [None]:
# Visual sanity check: display the first training image.
plt.imshow(X_train[0])

In [None]:
# One hot encoding
# to_categorical infers the number of classes from the largest label it is given,
# so encoding each split on its own could yield matrices of different width if a
# split lacks the highest age group. Derive one shared class count from all splits.
n_age_groups = int(max(train.age_group.max(), valid.age_group.max(), test.age_group.max())) + 1
Y_train = to_categorical(train.age_group, num_classes = n_age_groups)
Y_valid = to_categorical(valid.age_group, num_classes = n_age_groups)
Y_test = to_categorical(test.age_group, num_classes = n_age_groups)
print(Y_train.shape, Y_valid.shape, Y_test.shape)

# ONTRAM

### Complex intercept

In [None]:
def mod_bl(y_dim, dropout_rate = 0.3):
    """Build the CNN used as image-dependent (complex) intercept network.

    Architecture: three blocks of two 3x3 'same' convolutions (16, 32 and 32
    filters), every convolution followed by ReLU and dropout and every block
    closed by 2x2 max pooling; then dense layers of 500 and 50 units (ReLU +
    dropout) and a linear output layer named 'bl_out'.

    Parameters
    ----------
    y_dim : int
        Number of units of the linear output layer.
    dropout_rate : float
        Dropout rate applied after every hidden activation.

    Returns
    -------
    keras.Model
        Functional model whose input shape is read from the global `X_train`
        (`X_train.shape[1:]`).
    """
    in_ = keras.Input(shape = X_train.shape[1:], name = 'bl_in')

    # convolutional feature extractor (layers are created in the same order as
    # in a fully written-out version, so checkpoints stay compatible)
    x = in_
    for n_filters in (16, 32, 32):
        for _ in range(2):
            x = layers.Convolution2D(n_filters, (3, 3), padding = 'same')(x)
            x = layers.Activation('relu')(x)
            x = layers.Dropout(dropout_rate)(x)
        x = layers.MaxPooling2D(pool_size = (2, 2))(x)

    # fully connected head
    x = layers.Flatten()(x)
    for n_units in (500, 50):
        x = layers.Dense(n_units)(x)
        x = layers.Activation('relu')(x)
        x = layers.Dropout(dropout_rate)(x)
    out_ = layers.Dense(y_dim, activation = 'linear', name = 'bl_out')(x)

    return keras.Model(inputs = in_, outputs = out_)

In [None]:
# Training: five replicates of the complex-intercept model (images only)
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'RV_without_covariables/run' + str(i) + '/'

    # define model
    nn_bl = mod_bl(Y_train.shape[1]-1)
    m = ontram(nn_bl = nn_bl, response_varying = True)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train,
                      x_test_im = X_valid, y_test = Y_valid,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, y = Y_test)

    # save predictions of the best model (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

### Complex intercept + linear shift (covariables)

In [None]:
# to categorical
# to_categorical infers the number of categories from the largest code it sees, so
# encoding each split separately could give design matrices of different width.
# Fix the category counts from all three splits instead.
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
n_race = int(max(train.race.max(), valid.race.max(), test.race.max())) + 1
gender_ohe = to_categorical(train.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
race_ohe = to_categorical(train.race, num_classes = n_race)[:,1:] # White reference: 0 = White, 1 = Black, 2 = Asian, 3 = Indian, 4 = Others (like Hispanic, Latino, Middle Eastern)
X_train_gr = np.concatenate((gender_ohe, race_ohe), axis = 1)
print(X_train_gr.shape)

In [None]:
# to categorical
# to_categorical infers the number of categories from the largest code it sees, so
# encoding each split separately could give design matrices of different width.
# Fix the category counts from all three splits instead.
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
n_race = int(max(train.race.max(), valid.race.max(), test.race.max())) + 1
gender_ohe = to_categorical(valid.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
race_ohe = to_categorical(valid.race, num_classes = n_race)[:,1:] # White reference: 0 = White, 1 = Black, 2 = Asian, 3 = Indian, 4 = Others (like Hispanic, Latino, Middle Eastern)
X_valid_gr = np.concatenate((gender_ohe, race_ohe), axis = 1)
print(X_valid_gr.shape)

In [None]:
# to categorical
# to_categorical infers the number of categories from the largest code it sees, so
# encoding each split separately could give design matrices of different width.
# Fix the category counts from all three splits instead.
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
n_race = int(max(train.race.max(), valid.race.max(), test.race.max())) + 1
gender_ohe = to_categorical(test.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
race_ohe = to_categorical(test.race, num_classes = n_race)[:,1:] # White reference: 0 = White, 1 = Black, 2 = Asian, 3 = Indian, 4 = Others (like Hispanic, Latino, Middle Eastern)
X_test_gr = np.concatenate((gender_ohe, race_ohe), axis = 1)
print(X_test_gr.shape)

In [None]:
# Five replicates: complex intercept (image) + linear shift in the tabular covariates
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'RV_with_covariables/run' + str(i) + '/'

    # define model: CNN intercept plus one bias-free linear layer for the covariates
    nn_bl = mod_bl(Y_train.shape[1] - 1)
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = True)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train, x_train = X_train_gr,
                      x_test_im = X_valid, y_test = Y_valid, x_test = X_valid_gr,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, x = X_test_gr, y = Y_test)

    # save predictions of the best model (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

In [None]:
# save the estimates
# Relies on X_train_gr / X_test_gr holding the gender + race encoding; later cells
# reassign these names to a gender-only encoding, so run this cell before them.
beta_frames = []  # one small table of coefficients per run
for i in range(0,5):

    # define model (same layout as during training so the saved weights can be loaded)
    nn_bl = mod_bl(Y_train.shape[1] - 1)
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = True)

    # load history to find the best model
    run_dir = OUTPUT_DIR + 'RV_with_covariables/' + 'run' + str(i) + '/'
    dat = pd.read_csv(run_dir + 'history.csv')

    # get the best model and the predictions
    best_model = np.where(dat.test_loss == np.min(dat.test_loss))[0][0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, x = X_test_gr, y = Y_test)

    # collect betas
    weights = pred["beta_w"][0]
    beta = weights.reshape(len(weights)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "CI-LS_racegender", "run": i}))

# DataFrame.append no longer exists in pandas >= 2.0; concatenate all runs once.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds the last loop value here, so the table with the
# estimates of ALL runs is written into the folder of the last run.
dat_beta.to_csv(OUTPUT_DIR + 'RV_with_covariables/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Complex intercept + linear shift (gender)

In [None]:
# to categorical
# Fix the number of categories from all splits so that train/valid/test always get
# the same number of columns (to_categorical would otherwise infer it per split).
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
X_train_gr = to_categorical(train.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
print(X_train_gr.shape)

In [None]:
# to categorical
# Fix the number of categories from all splits so that train/valid/test always get
# the same number of columns (to_categorical would otherwise infer it per split).
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
X_valid_gr = to_categorical(valid.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
print(X_valid_gr.shape)

In [None]:
# to categorical
# Fix the number of categories from all splits so that train/valid/test always get
# the same number of columns (to_categorical would otherwise infer it per split).
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
X_test_gr = to_categorical(test.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
print(X_test_gr.shape)

In [None]:
# Five replicates: complex intercept (image) + linear shift in gender
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'RV_with_gender/run' + str(i) + '/'

    # define model: CNN intercept plus one bias-free linear layer for the covariates
    nn_bl = mod_bl(Y_train.shape[1] - 1)
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = True)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train, x_train = X_train_gr,
                      x_test_im = X_valid, y_test = Y_valid, x_test = X_valid_gr,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, x = X_test_gr, y = Y_test)

    # save predictions of the best model (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

In [None]:
# Collect the linear-shift estimates (gender) of the best checkpoint of every run.
# Relies on X_train_gr / X_test_gr holding the gender-only encoding.
beta_frames = []  # one small table of coefficients per run
for i in range(5):

    # define model (same layout as during training so the saved weights can be loaded)
    nn_bl = mod_bl(Y_train.shape[1] - 1)
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = True)

    # load history to find the best model
    run_dir = OUTPUT_DIR + 'RV_with_gender/' + 'run' + str(i) + '/'
    dat = pd.read_csv(run_dir + 'history.csv')

    # get the best model and the predictions
    best_model = np.where(dat.test_loss == np.min(dat.test_loss))[0][0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, x = X_test_gr, y = Y_test)

    # collect betas
    # NOTE(review): the label says "CS-LS" although this section fits a complex
    # intercept + linear shift and the race/gender variant uses "CI-LS_..."; left
    # unchanged because downstream code may filter on it -- confirm.
    weights = pred["beta_w"][0]
    beta = weights.reshape(len(weights)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "CS-LS_gender", "run": i}))

# DataFrame.append no longer exists in pandas >= 2.0; concatenate all runs once.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds the last loop value here, so the table with the
# estimates of ALL runs is written into the folder of the last run.
dat_beta.to_csv(OUTPUT_DIR + 'RV_with_gender/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Complex intercept + linear shift (simulated data)

In [None]:
# Simulated covariates x_1 ... x_10 of the training split as a plain array
sim_cols = ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10']
X_train_sim = np.array(train[sim_cols])
X_train_sim.shape

In [None]:
# Simulated covariates x_1 ... x_10 of the validation split as a plain array
sim_cols = ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10']
X_valid_sim = np.array(valid[sim_cols])
X_valid_sim.shape

In [None]:
# Simulated covariates x_1 ... x_10 of the test split as a plain array
sim_cols = ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10']
X_test_sim = np.array(test[sim_cols])
X_test_sim.shape

In [None]:
# Five replicates: complex intercept (image) + linear shift in the simulated covariates
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'RV_with_simulated_data/run' + str(i) + '/'

    # define model: CNN intercept plus one bias-free linear layer for the covariates
    nn_bl = mod_bl(Y_train.shape[1] - 1)
    in_ = keras.Input(shape = X_train_sim.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = True)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train, x_train = X_train_sim,
                      x_test_im = X_valid, y_test = Y_valid, x_test = X_valid_sim,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, x = X_test_sim, y = Y_test)

    # save predictions of the best model (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

In [None]:
# Collect the linear-shift estimates (simulated covariates) of the best checkpoint of every run.
beta_frames = []  # one small table of coefficients per run
for i in range(5):

    # define model (same layout as during training so the saved weights can be loaded)
    nn_bl = mod_bl(Y_train.shape[1] - 1)
    in_ = keras.Input(shape = X_train_sim.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = True)

    # load history to find the best model
    run_dir = OUTPUT_DIR + 'RV_with_simulated_data/' + 'run' + str(i) + '/'
    dat = pd.read_csv(run_dir + 'history.csv')

    # get the best model and the predictions
    best_model = np.where(dat.test_loss == np.min(dat.test_loss))[0][0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, bl = X_test, x = X_test_sim, y = Y_test)

    # collect betas
    # NOTE(review): the label says "CS-LS" although this section fits a complex
    # intercept + linear shift and the race/gender variant uses "CI-LS_..."; left
    # unchanged because downstream code may filter on it -- confirm.
    weights = pred["beta_w"][0]
    beta = weights.reshape(len(weights)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "CS-LS_sim", "run": i}))

# DataFrame.append no longer exists in pandas >= 2.0; concatenate all runs once.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds the last loop value here, so the table with the
# estimates of ALL runs is written into the folder of the last run.
dat_beta.to_csv(OUTPUT_DIR + 'RV_with_simulated_data/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Simple intercept + complex shift (image)

In [None]:
def mod_bl(y_dim):
    """Build the simple (image-independent) intercept network.

    A single bias-free linear Dense layer named 'bl_out' on a one-dimensional
    input named 'bl_in', producing `y_dim - 1` outputs.

    Note: this redefinition shadows the CNN-based `mod_bl(y_dim, dropout_rate)`
    from the "Complex intercept" sections. Unlike that version it subtracts 1
    from `y_dim` itself, so the earlier sections must not be re-run after this
    cell without re-running the CNN definition first.
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
def mod_im(dropout_rate = 0.3):
    """Build the CNN that maps an image to a single linear output ('im_out').

    Architecture: three blocks of two 3x3 'same' convolutions (16, 32 and 32
    filters), every convolution followed by ReLU and dropout and every block
    closed by 2x2 max pooling; then dense layers of 500 and 50 units (ReLU +
    dropout) and one linear output unit.

    Parameters
    ----------
    dropout_rate : float
        Dropout rate applied after every hidden activation.

    Returns
    -------
    keras.Model
        Functional model whose input shape is read from the global `X_train`
        (`X_train.shape[1:]`).
    """
    in_ = keras.Input(shape = X_train.shape[1:], name = 'im_in')

    # convolutional feature extractor (layers are created in the same order as
    # in a fully written-out version, so checkpoints stay compatible)
    x = in_
    for n_filters in (16, 32, 32):
        for _ in range(2):
            x = layers.Convolution2D(n_filters, (3, 3), padding = 'same')(x)
            x = layers.Activation('relu')(x)
            x = layers.Dropout(dropout_rate)(x)
        x = layers.MaxPooling2D(pool_size = (2, 2))(x)

    # fully connected head
    x = layers.Flatten()(x)
    for n_units in (500, 50):
        x = layers.Dense(n_units)(x)
        x = layers.Activation('relu')(x)
        x = layers.Dropout(dropout_rate)(x)
    out_ = layers.Dense(1, activation = 'linear', name = 'im_out')(x)

    return keras.Model(inputs = in_, outputs = out_)

In [None]:
# Five replicates: simple intercept + complex shift from the image
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'CS_without_covariables/run' + str(i) + '/'

    # define model
    nn_bl = mod_bl(Y_train.shape[1])
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, response_varying = False)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train,
                      x_test_im = X_valid, y_test = Y_valid,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, y = Y_test)

    # pieces of the prediction tables (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    eta_df = pd.DataFrame(pred['eta'], columns = ['eta'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])

    # save predictions of the best model
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

    # save predictions of the best model with eta
    out = test.join(pdf_df).join(eta_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions_eta.csv', index=False)

### Simple intercept + complex shift (image) + linear shift (covariables)

In [None]:
# to categorical
# to_categorical infers the number of categories from the largest code it sees, so
# encoding each split separately could give design matrices of different width.
# Fix the category counts from all three splits instead.
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
n_race = int(max(train.race.max(), valid.race.max(), test.race.max())) + 1
gender_ohe = to_categorical(train.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
race_ohe = to_categorical(train.race, num_classes = n_race)[:,1:] # White reference: 0 = White, 1 = Black, 2 = Asian, 3 = Indian, 4 = Others (like Hispanic, Latino, Middle Eastern)
X_train_gr = np.concatenate((gender_ohe, race_ohe), axis = 1)
print(X_train_gr.shape)

In [None]:
# to categorical
# to_categorical infers the number of categories from the largest code it sees, so
# encoding each split separately could give design matrices of different width.
# Fix the category counts from all three splits instead.
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
n_race = int(max(train.race.max(), valid.race.max(), test.race.max())) + 1
gender_ohe = to_categorical(valid.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
race_ohe = to_categorical(valid.race, num_classes = n_race)[:,1:] # White reference: 0 = White, 1 = Black, 2 = Asian, 3 = Indian, 4 = Others (like Hispanic, Latino, Middle Eastern)
X_valid_gr = np.concatenate((gender_ohe, race_ohe), axis = 1)
print(X_valid_gr.shape)

In [None]:
# to categorical
# to_categorical infers the number of categories from the largest code it sees, so
# encoding each split separately could give design matrices of different width.
# Fix the category counts from all three splits instead.
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
n_race = int(max(train.race.max(), valid.race.max(), test.race.max())) + 1
gender_ohe = to_categorical(test.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
race_ohe = to_categorical(test.race, num_classes = n_race)[:,1:] # White reference: 0 = White, 1 = Black, 2 = Asian, 3 = Indian, 4 = Others (like Hispanic, Latino, Middle Eastern)
X_test_gr = np.concatenate((gender_ohe, race_ohe), axis = 1)
print(X_test_gr.shape)

In [None]:
def mod_bl(y_dim):
    """Build the simple (image-independent) intercept network.

    A single bias-free linear Dense layer named 'bl_out' on a one-dimensional
    input named 'bl_in', producing `y_dim - 1` outputs.

    Note: the same simple-intercept `mod_bl` is already defined earlier in this
    notebook; both shadow the CNN-based `mod_bl(y_dim, dropout_rate)` of the
    "Complex intercept" sections.
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
def mod_im(dropout_rate = 0.3):
    """Build the CNN that maps an image to a single linear output ('im_out').

    Architecture: three blocks of two 3x3 'same' convolutions (16, 32 and 32
    filters), every convolution followed by ReLU and dropout and every block
    closed by 2x2 max pooling; then dense layers of 500 and 50 units (ReLU +
    dropout) and one linear output unit.

    Note: an identical `mod_im` is already defined earlier in this notebook.

    Parameters
    ----------
    dropout_rate : float
        Dropout rate applied after every hidden activation.

    Returns
    -------
    keras.Model
        Functional model whose input shape is read from the global `X_train`
        (`X_train.shape[1:]`).
    """
    in_ = keras.Input(shape = X_train.shape[1:], name = 'im_in')

    # convolutional feature extractor (layers are created in the same order as
    # in a fully written-out version, so checkpoints stay compatible)
    x = in_
    for n_filters in (16, 32, 32):
        for _ in range(2):
            x = layers.Convolution2D(n_filters, (3, 3), padding = 'same')(x)
            x = layers.Activation('relu')(x)
            x = layers.Dropout(dropout_rate)(x)
        x = layers.MaxPooling2D(pool_size = (2, 2))(x)

    # fully connected head
    x = layers.Flatten()(x)
    for n_units in (500, 50):
        x = layers.Dense(n_units)(x)
        x = layers.Activation('relu')(x)
        x = layers.Dropout(dropout_rate)(x)
    out_ = layers.Dense(1, activation = 'linear', name = 'im_out')(x)

    return keras.Model(inputs = in_, outputs = out_)

In [None]:
# Five replicates: simple intercept + complex shift (image) + linear shift (covariates)
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'CS_with_covariables/run' + str(i) + '/'

    # define model
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, nn_x = nn_x, response_varying = False)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train, x_train = X_train_gr,
                      x_test_im = X_valid, y_test = Y_valid, x_test = X_valid_gr,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, x = X_test_gr, y = Y_test)

    # pieces of the prediction tables (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    eta_df = pd.DataFrame(pred['eta'], columns = ['eta'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])

    # save predictions of the best model
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

    # save predictions of the best model with eta
    out = test.join(pdf_df).join(eta_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions_eta.csv', index=False)

In [None]:
# Collect the linear-shift estimates (gender + race) of the best checkpoint of every run.
# Relies on X_train_gr / X_test_gr holding the gender + race encoding; later cells
# reassign these names to a gender-only encoding, so run this cell before them.
beta_frames = []  # one small table of coefficients per run
for i in range(5):

    # define model (same layout as during training so the saved weights can be loaded)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, nn_x = nn_x, response_varying = False)

    # load history to find the best model
    run_dir = OUTPUT_DIR + 'CS_with_covariables/' + 'run' + str(i) + '/'
    dat = pd.read_csv(run_dir + 'history.csv')

    # get the best model and the predictions
    best_model = np.where(dat.test_loss == np.min(dat.test_loss))[0][0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, x = X_test_gr, y = Y_test)

    # collect betas
    weights = pred["beta_w"][0]
    beta = weights.reshape(len(weights)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "CI-CS-LS_racegender", "run": i}))

# DataFrame.append no longer exists in pandas >= 2.0; concatenate all runs once.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds the last loop value here, so the table with the
# estimates of ALL runs is written into the folder of the last run.
dat_beta.to_csv(OUTPUT_DIR + 'CS_with_covariables/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Simple intercept + complex shift (image) + linear shift (gender)

In [None]:
# to categorical
# Fix the number of categories from all splits so that train/valid/test always get
# the same number of columns (to_categorical would otherwise infer it per split).
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
X_train_gr = to_categorical(train.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
print(X_train_gr.shape)

In [None]:
# to categorical
# Fix the number of categories from all splits so that train/valid/test always get
# the same number of columns (to_categorical would otherwise infer it per split).
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
X_valid_gr = to_categorical(valid.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
print(X_valid_gr.shape)

In [None]:
# to categorical
# Fix the number of categories from all splits so that train/valid/test always get
# the same number of columns (to_categorical would otherwise infer it per split).
n_gender = int(max(train.gender.max(), valid.gender.max(), test.gender.max())) + 1
X_test_gr = to_categorical(test.gender, num_classes = n_gender)[:,1:] # male reference: 0 = male, 1 = female
print(X_test_gr.shape)

In [None]:
def mod_bl(y_dim):
    """Build the simple (image-independent) intercept network.

    A single bias-free linear Dense layer named 'bl_out' on a one-dimensional
    input named 'bl_in', producing `y_dim - 1` outputs.

    Note: the same simple-intercept `mod_bl` is already defined earlier in this
    notebook; both shadow the CNN-based `mod_bl(y_dim, dropout_rate)` of the
    "Complex intercept" sections.
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
def mod_im(dropout_rate = 0.3):
    """Build the CNN that maps an image to a single linear output ('im_out').

    Architecture: three blocks of two 3x3 'same' convolutions (16, 32 and 32
    filters), every convolution followed by ReLU and dropout and every block
    closed by 2x2 max pooling; then dense layers of 500 and 50 units (ReLU +
    dropout) and one linear output unit.

    Note: an identical `mod_im` is already defined earlier in this notebook.

    Parameters
    ----------
    dropout_rate : float
        Dropout rate applied after every hidden activation.

    Returns
    -------
    keras.Model
        Functional model whose input shape is read from the global `X_train`
        (`X_train.shape[1:]`).
    """
    in_ = keras.Input(shape = X_train.shape[1:], name = 'im_in')

    # convolutional feature extractor (layers are created in the same order as
    # in a fully written-out version, so checkpoints stay compatible)
    x = in_
    for n_filters in (16, 32, 32):
        for _ in range(2):
            x = layers.Convolution2D(n_filters, (3, 3), padding = 'same')(x)
            x = layers.Activation('relu')(x)
            x = layers.Dropout(dropout_rate)(x)
        x = layers.MaxPooling2D(pool_size = (2, 2))(x)

    # fully connected head
    x = layers.Flatten()(x)
    for n_units in (500, 50):
        x = layers.Dense(n_units)(x)
        x = layers.Activation('relu')(x)
        x = layers.Dropout(dropout_rate)(x)
    out_ = layers.Dense(1, activation = 'linear', name = 'im_out')(x)

    return keras.Model(inputs = in_, outputs = out_)

In [None]:
# Five replicates: simple intercept + complex shift (image) + linear shift (gender)
for i in range(5):
    print('Run ', i)

    # all artefacts of this replicate are written below this folder
    run_dir = OUTPUT_DIR + 'CS_with_gender/run' + str(i) + '/'

    # define model
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, nn_x = nn_x, response_varying = False)

    # train; the validation split is passed through the *_test arguments
    hist = fit_ontram(m,
                      x_train_im = X_train, y_train = Y_train, x_train = X_train_gr,
                      x_test_im = X_valid, y_test = Y_valid, x_test = X_valid_gr,
                      batch_size = 32,
                      epochs = 30,
                      output_dir = run_dir)

    # save history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # first epoch with the lowest 'test_loss' identifies the checkpoint to restore
    is_best = (dat.test_loss == dat.test_loss.min()).to_numpy()
    best_model = np.flatnonzero(is_best)[0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, x = X_test_gr, y = Y_test)

    # pieces of the prediction tables (joined to the test table on the row index)
    pdf_df = pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])
    eta_df = pd.DataFrame(pred['eta'], columns = ['eta'])
    resp_df = pd.DataFrame(pred['response'], columns = ['pred'])

    # save predictions of the best model
    out = test.join(pdf_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

    # save predictions of the best model with eta
    out = test.join(pdf_df).join(eta_df).join(resp_df)
    out.to_csv(run_dir + 'test_predictions_eta.csv', index=False)

In [None]:
# Collect the linear-shift estimates (gender) of the best checkpoint of every run.
# Relies on X_train_gr / X_test_gr holding the gender-only encoding.
beta_frames = []  # one small table of coefficients per run
for i in range(5):

    # define model (same layout as during training so the saved weights can be loaded)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, nn_x = nn_x, response_varying = False)

    # load history to find the best model
    run_dir = OUTPUT_DIR + 'CS_with_gender/' + 'run' + str(i) + '/'
    dat = pd.read_csv(run_dir + 'history.csv')

    # get the best model and the predictions
    best_model = np.where(dat.test_loss == np.min(dat.test_loss))[0][0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, x = X_test_gr, y = Y_test)

    # collect betas
    weights = pred["beta_w"][0]
    beta = weights.reshape(len(weights)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "CI-CS-LS_gender", "run": i}))

# DataFrame.append no longer exists in pandas >= 2.0; concatenate all runs once.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds the last loop value here, so the table with the
# estimates of ALL runs is written into the folder of the last run.
dat_beta.to_csv(OUTPUT_DIR + 'CS_with_gender/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Simple intercept + complex shift (image) + linear shift (simulated data)

In [None]:
# Simulated covariates x_1 ... x_10 of the training split as a plain array
sim_cols = ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10']
X_train_sim = np.array(train[sim_cols])
X_train_sim.shape

In [None]:
# Simulated covariates x_1 ... x_10 of the validation split as a plain array
sim_cols = ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10']
X_valid_sim = np.array(valid[sim_cols])
X_valid_sim.shape

In [None]:
# Simulated covariates x_1 ... x_10 of the test split as a plain array
sim_cols = ['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10']
X_test_sim = np.array(test[sim_cols])
X_test_sim.shape

In [None]:
def mod_bl(y_dim):
    """Build the simple (image-independent) intercept network.

    A single bias-free linear Dense layer named 'bl_out' on a one-dimensional
    input named 'bl_in', producing `y_dim - 1` outputs.

    Note: the same simple-intercept `mod_bl` is already defined earlier in this
    notebook; both shadow the CNN-based `mod_bl(y_dim, dropout_rate)` of the
    "Complex intercept" sections.
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
def mod_im(dropout_rate = 0.3):
    """Build the CNN that maps an image to a single linear output.

    Three convolutional stages with 16, 32 and 32 filters: each stage applies
    (3x3 'same' convolution -> ReLU -> dropout) twice and ends with 2x2 max
    pooling. The flattened features then pass through dense layers of 500 and
    50 units (each followed by ReLU and dropout) and a final one-unit linear
    layer named 'im_out'.

    Parameters
    ----------
    dropout_rate : float
        Rate used by every Dropout layer of the network.

    Returns
    -------
    keras.Model
        Model whose input shape is read from the global ``X_train``
        (``X_train.shape[1:]``) and whose output has one unit.
    """
    in_ = keras.Input(shape = X_train.shape[1:], name = 'im_in')

    # Convolutional feature extractor
    x = in_
    for n_filters in (16, 32, 32):
        for _ in range(2):
            x = layers.Convolution2D(n_filters, (3, 3), padding = 'same')(x)
            x = layers.Activation('relu')(x)
            x = layers.Dropout(dropout_rate)(x)
        x = layers.MaxPooling2D(pool_size = (2, 2))(x)

    # Fully connected head
    x = layers.Flatten()(x)
    for n_units in (500, 50):
        x = layers.Dense(n_units)(x)
        x = layers.Activation('relu')(x)
        x = layers.Dropout(dropout_rate)(x)
    out_ = layers.Dense(1, activation = 'linear', name = 'im_out')(x)

    return keras.Model(inputs = in_, outputs = out_)

In [None]:
# Five independent fits of: simple intercept + image shift + linear shift on the
# simulated covariates. Each run writes its artefacts into its own folder.
for i in range(5):
    print('Run ', i)
    run_dir = OUTPUT_DIR + 'CS_with_simulated_data/' + 'run' + str(i) + '/'

    # Assemble the sub-networks (intercept, linear shift, image shift)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_sim.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, nn_x = nn_x, response_varying = False)

    # Train; the validation split is passed through the *_test arguments
    hist = fit_ontram(
        m,
        x_train_im = X_train, y_train = Y_train, x_train = X_train_sim,
        x_test_im = X_valid, y_test = Y_valid, x_test = X_valid_sim,
        batch_size = 32,
        epochs = 30,
        output_dir = run_dir,
    )

    # Persist the per-epoch training history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # Reload the checkpoint of the first epoch that reached the minimal test_loss
    lowest_loss = np.min(dat.test_loss)
    best_model = np.where(dat.test_loss == lowest_loss)[0][0]
    m.model.load_weights(run_dir + 'model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, x = X_test_sim, y = Y_test)

    # Store the test set together with class probabilities and predicted class
    prob_frame = pd.DataFrame(pred['pdf'], columns = ['p_' + str(k) for k in range(7)])
    resp_frame = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(prob_frame)
    out = out.join(resp_frame)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

    # Disabled variant that additionally exports eta:
    # out = test.join(pd.DataFrame(pred['pdf'], columns = ['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6'])).join(pd.DataFrame(pred['eta'], columns = ['eta']))
    # out = out.join(pd.DataFrame(pred['response'], columns = ['pred']))
    # out.to_csv(OUTPUT_DIR + 'CS_with_simulated_data/' + 'run' + str(i) + '/test_predictions_eta.csv', index=False)

In [None]:
# Rebuild every trained run, reload the checkpoint with the lowest test_loss
# recorded in its history.csv, and gather the linear-shift coefficients of all
# runs in a single table.
beta_frames = []
for i in range(5):

    # define model (same architecture as in the training cell so the weights fit)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_sim.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    nn_im = mod_im()
    m = ontram(nn_bl = nn_bl, nn_im = nn_im, nn_x = nn_x, response_varying = False)

    # load history to find the best model
    dat = pd.read_csv(OUTPUT_DIR + 'CS_with_simulated_data/' + 'run' + str(i) + '/history.csv')

    # get the best model (first epoch with minimal test_loss) and the predictions
    best_model = np.where(dat.test_loss == np.min(dat.test_loss))[0][0]
    m.model.load_weights(OUTPUT_DIR + 'CS_with_simulated_data/' + 'run' + str(i) + '/model-' + str(best_model) + '.hdf5')
    pred = predict(m, x_im = X_test, x = X_test_sim, y = Y_test)

    # collect the betas of this run
    n = len(pred["beta_w"][0])
    beta = pred["beta_w"][0].reshape((n)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "CI-CS-LS_sim", "run": i}))

# DataFrame.append was removed in pandas 2.0; concatenate all runs once instead.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds its last loop value here, so the combined table
# is written into the final run's folder (run4) -- confirm this is intended.
dat_beta.to_csv(OUTPUT_DIR + 'CS_with_simulated_data/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Simple intercept + linear shift (simulated data) = POLR

In [None]:
# Training-split design matrix: the ten columns x_1 ... x_10 of `train`
X_train_sim = np.array(train.loc[:, ['x_' + str(k) for k in range(1, 11)]])
X_train_sim.shape

In [None]:
# Validation-split design matrix: the ten columns x_1 ... x_10 of `valid`
X_valid_sim = np.array(valid.loc[:, ['x_' + str(k) for k in range(1, 11)]])
X_valid_sim.shape

In [None]:
# Test-split design matrix: the ten columns x_1 ... x_10 of `test`
X_test_sim = np.array(test.loc[:, ['x_' + str(k) for k in range(1, 11)]])
X_test_sim.shape

In [None]:
def mod_bl(y_dim):
    """Build the simple-intercept network.

    A single bias-free linear Dense layer maps an input of shape (1,) to
    ``y_dim - 1`` outputs.

    Parameters
    ----------
    y_dim : int
        Number of outcome classes; the layer gets ``y_dim - 1`` units.

    Returns
    -------
    keras.Sequential
        Model named "nn_bl" with layers "bl_in" and "bl_out".
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
# Five independent fits of: simple intercept + linear shift on the simulated
# covariates (no image branch). Each run writes into its own folder.
for i in range(5):
    print('Run ', i)
    run_dir = OUTPUT_DIR + 'simulated_data_only/' + 'run' + str(i) + '/'

    # Assemble the sub-networks (intercept and linear shift)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_sim.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

    # Train; the validation split is passed through the *_test arguments
    hist = fit_ontram(
        m,
        y_train = Y_train, x_train = X_train_sim,
        y_test = Y_valid, x_test = X_valid_sim,
        batch_size = 32,
        epochs = 200,
        output_dir = run_dir,
    )

    # Persist the per-epoch training history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # Predict with the model as it stands after training
    # (no checkpoint is reloaded in this cell)
    pred = predict(m, x = X_test_sim, y = Y_test)

    # Store the test set together with class probabilities and predicted class
    prob_frame = pd.DataFrame(pred['pdf'], columns = ['p_' + str(k) for k in range(7)])
    resp_frame = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(prob_frame)
    out = out.join(resp_frame)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

In [None]:
# Rebuild every trained run, reload its model-199 checkpoint and gather the
# linear-shift coefficients of all runs in a single table.
beta_frames = []
for i in range(5):

    # define model (same architecture as in the training cell so the weights fit)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_sim.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

    # load the checkpoint numbered 199 and get the predictions
    # (presumably the last of the 200 training epochs -- verify against fit_ontram)
    m.model.load_weights(OUTPUT_DIR + 'simulated_data_only/' + 'run' + str(i) + '/model-' + str(199) + '.hdf5')
    pred = predict(m, x = X_test_sim, y = Y_test)

    # collect the betas of this run.
    # Fix: `beta` is now computed on every iteration. Previously run 0 skipped
    # this step and stored whatever `beta` an earlier cell had left behind
    # (or raised NameError on a fresh kernel).
    n = len(pred["beta_w"][0])
    beta = pred["beta_w"][0].reshape((n)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "SI-LS_sim", "run": i}))

# DataFrame.append was removed in pandas 2.0; concatenate all runs once instead.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds its last loop value here, so the combined table
# is written into the final run's folder (run4) -- confirm this is intended.
dat_beta.to_csv(OUTPUT_DIR + 'simulated_data_only/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Simple intercept + linear shift (covariables) = POLR

In [None]:
# Dummy-code gender and race of the training split; the first one-hot column of
# each variable is dropped so that it acts as the reference level.
#   gender: 0 = male (reference), 1 = female
#   race:   0 = White (reference), 1 = Black, 2 = Asian, 3 = Indian,
#           4 = Others (like Hispanic, Latino, Middle Eastern)
gender_ohe = to_categorical(train.gender)[:, 1:]
race_ohe = to_categorical(train.race)[:, 1:]
X_train_gr = np.hstack((gender_ohe, race_ohe))
print(X_train_gr.shape)

In [None]:
# Dummy-code gender and race of the validation split; the first one-hot column
# of each variable is dropped so that it acts as the reference level.
#   gender: 0 = male (reference), 1 = female
#   race:   0 = White (reference), 1 = Black, 2 = Asian, 3 = Indian,
#           4 = Others (like Hispanic, Latino, Middle Eastern)
gender_ohe = to_categorical(valid.gender)[:, 1:]
race_ohe = to_categorical(valid.race)[:, 1:]
X_valid_gr = np.hstack((gender_ohe, race_ohe))
print(X_valid_gr.shape)

In [None]:
# Dummy-code gender and race of the test split; the first one-hot column of
# each variable is dropped so that it acts as the reference level.
#   gender: 0 = male (reference), 1 = female
#   race:   0 = White (reference), 1 = Black, 2 = Asian, 3 = Indian,
#           4 = Others (like Hispanic, Latino, Middle Eastern)
gender_ohe = to_categorical(test.gender)[:, 1:]
race_ohe = to_categorical(test.race)[:, 1:]
X_test_gr = np.hstack((gender_ohe, race_ohe))
print(X_test_gr.shape)

In [None]:
def mod_bl(y_dim):
    """Build the simple-intercept network.

    A single bias-free linear Dense layer maps an input of shape (1,) to
    ``y_dim - 1`` outputs.

    Parameters
    ----------
    y_dim : int
        Number of outcome classes; the layer gets ``y_dim - 1`` units.

    Returns
    -------
    keras.Sequential
        Model named "nn_bl" with layers "bl_in" and "bl_out".
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
# Five independent fits of: simple intercept + linear shift on the dummy-coded
# gender and race covariates. Each run writes into its own folder.
for i in range(5):
    print('Run ', i)
    run_dir = OUTPUT_DIR + 'covariables_only/' + 'run' + str(i) + '/'

    # Assemble the sub-networks (intercept and linear shift)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

    # Train; the validation split is passed through the *_test arguments
    hist = fit_ontram(
        m,
        y_train = Y_train, x_train = X_train_gr,
        y_test = Y_valid, x_test = X_valid_gr,
        batch_size = 32,
        epochs = 200,
        output_dir = run_dir,
    )

    # Persist the per-epoch training history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # Predict with the model as it stands after training
    # (no checkpoint is reloaded in this cell)
    pred = predict(m, x = X_test_gr, y = Y_test)

    # Store the test set together with class probabilities and predicted class
    prob_frame = pd.DataFrame(pred['pdf'], columns = ['p_' + str(k) for k in range(7)])
    resp_frame = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(prob_frame)
    out = out.join(resp_frame)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

In [None]:
# Rebuild every trained run, reload its model-199 checkpoint and gather the
# linear-shift coefficients of all runs in a single table.
beta_frames = []
for i in range(5):

    # define model (same architecture as in the training cell so the weights fit)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

    # load the checkpoint numbered 199 and get the predictions
    # (presumably the last of the 200 training epochs -- verify against fit_ontram)
    m.model.load_weights(OUTPUT_DIR + 'covariables_only/' + 'run' + str(i) + '/model-' + str(199) + '.hdf5')
    pred = predict(m, x = X_test_gr, y = Y_test)

    # collect the betas of this run.
    # Fix: `beta` is now computed on every iteration. Previously run 0 skipped
    # this step and stored whatever `beta` an earlier cell had left behind
    # (or raised NameError on a fresh kernel).
    n = len(pred["beta_w"][0])
    beta = pred["beta_w"][0].reshape((n)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "SI-LS_racegender", "run": i}))

# DataFrame.append was removed in pandas 2.0; concatenate all runs once instead.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds its last loop value here, so the combined table
# is written into the final run's folder (run4) -- confirm this is intended.
dat_beta.to_csv(OUTPUT_DIR + 'covariables_only/' + 'run' + str(i) + '/beta_estimates.csv', index=False)

### Simple intercept + linear shift (gender) = POLR

In [None]:
# Gender-only design matrix for the training split: one-hot encode and drop the
# first column so it acts as the reference (0 = male reference, 1 = female).
X_train_gr = to_categorical(train.gender)[:, 1:]
print(X_train_gr.shape)

In [None]:
# Gender-only design matrix for the validation split: one-hot encode and drop
# the first column so it acts as the reference (0 = male reference, 1 = female).
X_valid_gr = to_categorical(valid.gender)[:, 1:]
print(X_valid_gr.shape)

In [None]:
# Gender-only design matrix for the test split: one-hot encode and drop the
# first column so it acts as the reference (0 = male reference, 1 = female).
X_test_gr = to_categorical(test.gender)[:, 1:]
print(X_test_gr.shape)

In [None]:
def mod_bl(y_dim):
    """Build the simple-intercept network.

    A single bias-free linear Dense layer maps an input of shape (1,) to
    ``y_dim - 1`` outputs.

    Parameters
    ----------
    y_dim : int
        Number of outcome classes; the layer gets ``y_dim - 1`` units.

    Returns
    -------
    keras.Sequential
        Model named "nn_bl" with layers "bl_in" and "bl_out".
    """
    return keras.Sequential(
        [
            keras.Input(shape = (1, ), name = "bl_in"),
            layers.Dense(y_dim - 1, activation = "linear", use_bias = False, name = "bl_out"),
        ],
        name = "nn_bl",
    )

In [None]:
# Five independent fits of: simple intercept + linear shift on the dummy-coded
# gender covariate. Each run writes into its own folder.
for i in range(5):
    print('Run ', i)
    run_dir = OUTPUT_DIR + 'gender_only/' + 'run' + str(i) + '/'

    # Assemble the sub-networks (intercept and linear shift)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

    # Train; the validation split is passed through the *_test arguments
    hist = fit_ontram(
        m,
        y_train = Y_train, x_train = X_train_gr,
        y_test = Y_valid, x_test = X_valid_gr,
        batch_size = 32,
        epochs = 200,
        output_dir = run_dir,
    )

    # Persist the per-epoch training history
    dat = pd.DataFrame({key: hist[key] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')})
    dat.to_csv(run_dir + 'history.csv', index = False)

    # Predict with the model as it stands after training
    # (no checkpoint is reloaded in this cell)
    pred = predict(m, x = X_test_gr, y = Y_test)

    # Store the test set together with class probabilities and predicted class
    prob_frame = pd.DataFrame(pred['pdf'], columns = ['p_' + str(k) for k in range(7)])
    resp_frame = pd.DataFrame(pred['response'], columns = ['pred'])
    out = test.join(prob_frame)
    out = out.join(resp_frame)
    out.to_csv(run_dir + 'test_predictions.csv', index=False)

In [None]:
# Rebuild every trained run, reload its model-199 checkpoint and gather the
# linear-shift coefficients of all runs in a single table.
beta_frames = []
for i in range(5):

    # define model (same architecture as in the training cell so the weights fit)
    nn_bl = mod_bl(Y_train.shape[1])
    in_ = keras.Input(shape = X_train_gr.shape[1:], name = 'x_in')
    out_ = layers.Dense(1, activation = 'linear', name = 'x_out', use_bias = False)(in_)
    nn_x = keras.Model(inputs = in_, outputs = out_)
    m = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

    # load the checkpoint numbered 199 and get the predictions
    # (presumably the last of the 200 training epochs -- verify against fit_ontram)
    m.model.load_weights(OUTPUT_DIR + 'gender_only/' + 'run' + str(i) + '/model-' + str(199) + '.hdf5')
    pred = predict(m, x = X_test_gr, y = Y_test)

    # collect the betas of this run.
    # Fix: `beta` is now computed on every iteration. Previously run 0 skipped
    # this step and stored whatever `beta` an earlier cell had left behind
    # (or raised NameError on a fresh kernel).
    n = len(pred["beta_w"][0])
    beta = pred["beta_w"][0].reshape((n)) # make it 1D
    beta_frames.append(pd.DataFrame({"beta": beta, "model": "SI-LS_gender", "run": i}))

# DataFrame.append was removed in pandas 2.0; concatenate all runs once instead.
dat_beta = pd.concat(beta_frames)

# NOTE(review): `i` still holds its last loop value here, so the combined table
# is written into the final run's folder (run4) -- confirm this is intended.
dat_beta.to_csv(OUTPUT_DIR + 'gender_only/' + 'run' + str(i) + '/beta_estimates.csv', index=False)