In [None]:
#@title
!pip install tensorflow-addons
!pip install -U tensorboard-plugin-profile

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.2 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorboard-plugin-profile
  Downloading tensorboard_plugin_profile-2.8.0-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 596 kB/s 
Collecting gviz-api>=1.9.0
  Downloading gviz_api-1.10.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: gviz-api, tensorboard-plugin-profile
Successfully installed gviz-api-1.10.0 tensorboard-plugin-profile-2.8.0


# Intro
After some research I found that denoising autoencoders (DAEs) can do really well at missing data imputation. I have also never made one, so I decided to give it a go! Adaptations for missing values were implemented from [this](https://arxiv.org/abs/2002.08338) paper. Some key takeaways from the paper are:
- All data are used for training since there is no need for a validation or test set
- DAEs are sensitive to the initial imputation; this is addressed using metamorphic truth and feedback. In the first step the imputed values (and all other values in the row) are used as ground truth for the optimizer, which is obviously a problem. So, metamorphic truth takes the predicted value to use as ground truth for the next step, while feedback uses the predicted values instead of the original imputed values as input.
- Training is split into two stages: the first stage has no feedback, so the inputs for missing values are just the original imputation (the paper suggests 10-20 epochs). The model is then 'fine-tuned' using the aforementioned feedback every *n* steps (the paper suggests 1-2).

# Importing Libraries

In [None]:
import os
import numpy as np
import random
import math
import yaml
import pickle
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, train_test_split

import datetime
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K, layers
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, BatchNormalization, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.utils import plot_model
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' ## hide tf warnings

from drive.MyDrive.Kaggle.June_2022_na_imputation.src.functions import *

In [None]:
# Main data table, indexed by its 'row_id' column.
data = pd.read_csv(
    '/content/drive/MyDrive/Kaggle/June_2022_na_imputation/src/data/data.csv',
    index_col='row_id',
)
# Sample submission table, indexed by its 'row-col' column.
sample = pd.read_csv(
    '/content/drive/MyDrive/Kaggle/June_2022_na_imputation/src/data/sample_submission.csv',
    index_col='row-col',
)

In [None]:
# Load the run configuration. `yaml.safe_load` is used because calling
# `yaml.load` without an explicit Loader is deprecated in PyYAML 5.1+ and
# raises a TypeError in PyYAML 6.0+; the safe loader also refuses to build
# arbitrary Python objects from the file.
with open('/content/drive/MyDrive/Kaggle/June_2022_na_imputation/src/config.YAML', 'r') as f:
    config = yaml.safe_load(f)

# Seed with the configured value before any random operation further down.
set_seed(config['SEED'])
# Column groupings derived from the data; F4 is the group used to build the
# datasets later in the notebook.
col_list, F1, F2, F3, F4, missing_cols = get_lists(data)

In [None]:
# Load the TensorBoard notebook extension, then open it on the `logs/fit`
# log directory.
%load_ext tensorboard
%tensorboard --logdir logs/fit

# Defining Masks

In [None]:
def random_masking(shape, binomial_P=0.05):
    """Build a random 0/1 mask with at least one masked entry per row.

    Args:
        shape: ``(n, k)`` tuple giving the number of rows and columns.
        binomial_P: Probability with which each entry is additionally masked,
            drawn independently per entry.

    Returns:
        Float ``ndarray`` of shape ``(n, k)`` in which 0 marks a masked entry
        and 1 marks a kept entry.
    """
    n, k = shape
    mask = np.ones(shape)
    # Guarantee a minimum of one masked entry per row: pick one random column
    # for every row and zero it out.
    mask[np.arange(n), np.random.randint(0, k, n)] = 0
    # Independently mask further entries with probability ``binomial_P``.
    binomial_mask = np.random.binomial(1, 1 - binomial_P, (n, k))
    # An entry is kept only when both masks keep it.
    return mask * binomial_mask

def validation_mask(shape, n_missing):
    """Build a 0/1 mask with exactly ``n_missing`` masked entries in every row.

    Args:
        shape: ``(n, k)`` tuple giving the number of rows and columns.
        n_missing: Number of entries to mask (set to 0) in each row; must lie
            in ``[0, k]``.

    Returns:
        Float ``ndarray`` of shape ``(n, k)`` in which 0 marks a masked entry
        and 1 marks a kept entry.

    Raises:
        ValueError: If ``n_missing`` is negative or larger than ``k``.
    """
    n, k = shape
    if not 0 <= n_missing <= k:
        raise ValueError(
            f"n_missing must be between 0 and {k}, got {n_missing}"
        )

    # Argsorting a matrix of independent uniform draws gives every row its own
    # random permutation of the column indices; the first ``n_missing`` of
    # them are therefore distinct, randomly chosen columns for that row.
    col_idx = np.random.rand(n, k).argsort(axis=1)[:, :n_missing].ravel()
    # Each row index is repeated once per masked column so that it lines up
    # with the flattened column indices.
    row_idx = np.arange(n).repeat(n_missing)

    mask = np.ones((n, k))
    mask[row_idx, col_idx] = 0
    return mask

# Defining Datasets

In [None]:
# True for rows that have at least one missing value among the F4 columns.
nan_bool = data[F4].isna().sum(axis=1) > 0

# Split the F4 values into complete rows and rows containing missing values.
X_nonan = data.loc[~nan_bool, F4].values
X_nan = data.loc[nan_bool, F4].values

# Hold out part of the complete rows for validation (default split size).
# NOTE(review): no random_state is passed, so the split presumably relies on
# the earlier set_seed call for reproducibility; confirm.
X_train_nonan, X_val = train_test_split(X_nonan)

# Training set: the remaining complete rows followed by every incomplete row.
X_train = np.concatenate([X_train_nonan, X_nan], axis=0)

# Indicator aligned row-for-row with X_train: 0 for the complete rows, and for
# the incomplete rows 1 where the value was missing and 0 otherwise.
nan_source = np.concatenate([
    np.zeros(X_train_nonan.shape),
    data.loc[nan_bool, F4].isna().astype(np.uint8).values,
])

scaler = StandardScaler()
# Standardize, then replace the NaNs that remain with 0.0. The fill value is
# passed by keyword because the second positional argument of np.nan_to_num
# is `copy`, not the replacement value.
X_train = np.nan_to_num(scaler.fit_transform(X_train), nan=0.0)
X_val = scaler.transform(X_val)

# Defining Network

In [None]:
# Sanity check: casting integers to bool maps 0 to False and every other
# value, negative ones included, to True.
np.array([-1, 0, 1, 5]).astype(bool)

array([ True, False,  True,  True])

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between predictions and targets.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values, compatible in shape with
            ``y_true``.

    Returns:
        The square root of the mean of the squared differences, with the mean
        taken over all elements.
    """
    residual = y_pred - y_true
    mean_squared_residual = K.mean(K.square(residual))
    return K.sqrt(mean_squared_residual)

In [None]:
class MLP(layers.Layer):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.dense = Dense(input_size)