In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model

In [None]:
def AutoEncoder(n_input):
    """Build and compile a symmetric, fully connected autoencoder.

    The input passes through Dense -> BatchNormalization -> ReLU stages of
    width 256, 128, 70 (the bottleneck), 128 and 256, followed by a linear
    Dense layer that maps back to ``n_input`` units.

    Args:
        n_input: Number of features per sample. Used for the input shape and
            for the width of the output layer.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 0.0005) and
        mean-squared-error loss.
    """
    # Encoder widths, the bottleneck, then the mirrored decoder widths.
    stage_widths = (256, 128, 70, 128, 256)

    X_in = Input(shape=(n_input,))

    hidden = X_in
    for width in stage_widths:
        # Keep the Dense / BatchNorm / ReLU creation order per stage: the
        # prediction section looks a layer up by its auto-generated name
        # ('re_lu_2').
        hidden = Dense(width)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = ReLU()(hidden)

    # Linear reconstruction head with one unit per input feature.
    X_out = Dense(n_input, activation='linear')(hidden)

    model = Model(inputs=X_in, outputs=X_out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        loss='mse',
    )
    return model

In [None]:
# Columns treated as categorical; they are dropped so that only numeric
# features are fed to the autoencoder.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

# Fixed seed so the shuffled row order is reproducible across runs.
SHUFFLE_SEED = 42

# Stack the train and test pickles row-wise: the autoencoder is fitted on
# both splits together.
train = pd.read_pickle('Data/train_data.pkl')
test = pd.read_pickle('Data/test_data.pkl')
data = pd.concat([train, test], axis=0)

# The source frames are not needed once concatenated; release them before
# the remaining (memory-hungry) steps.
del train, test
gc.collect()

data.drop('S_2', axis=1, inplace=True)
data.drop(cat_features, axis=1, inplace=True)

# Move the index into a regular column; the code below expects it to surface
# as 'customer_ID'.
data.reset_index(drop=False, inplace=True)

num_columns = data.columns.to_list()
num_columns.remove('customer_ID')

# Median-impute every numeric column.
# The filled column is assigned back explicitly: calling
# `data[col].fillna(..., inplace=True)` acts on a temporary Series and, under
# pandas Copy-on-Write, never updates `data` (the corresponding warning is
# hidden by the notebook's `warnings.filterwarnings("ignore")`).
for col in num_columns:
    data[col] = data[col].fillna(data[col].median())

# Shuffle all rows and rebuild a clean 0..n-1 index.
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
data.shape

In [None]:
BATCH_SIZE = 1024
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Materialise the numeric feature matrix once. An autoencoder's target is its
# own input, so the (input, target) pair is produced by the `map` below
# instead of handing two separate copies of the frame to
# `from_tensor_slices`, which kept the whole matrix in memory twice.
features = data[num_columns].to_numpy()
del data; _ = gc.collect()

train_dataset = (
    tf.data.Dataset.from_tensor_slices(features)
    .batch(BATCH_SIZE)
    # Each element becomes (inputs, targets) with targets identical to inputs.
    .map(lambda batch: (batch, batch))
    .prefetch(buffer_size=AUTOTUNE)
)

# Only the dataset is needed from here on.
del features; _ = gc.collect()

In [None]:
# One input (and one reconstructed output) unit per numeric column.
model = AutoEncoder(n_input=len(num_columns))

In [None]:
EPOCHS = 500      # upper bound; early stopping normally ends training sooner
PATIENCE = 20     # epochs without improvement in training loss before stopping
MODEL_DIR = 'Models'
MODEL_PATH = MODEL_DIR + '/AutoEncoder_bottleneck70_learningrate0005_ShuffeledData.h5'

# Create the output directory *before* the long training run so that the
# final `model.save` cannot fail on a missing folder and throw the fit away.
# `makedirs` succeeds silently when the directory already exists.
tf.io.gfile.makedirs(MODEL_DIR)

# No validation data is passed to `fit`, so early stopping monitors the
# training loss; `restore_best_weights=True` asks Keras to roll the model
# back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=PATIENCE,
    restore_best_weights=True,
)

history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    callbacks=[early_stopping],
)
model.save(MODEL_PATH)

In [None]:
# Training-loss curve: one point per completed epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.legend()
plt.show()

In [None]:
j = 4  # row of the first batch to inspect

# Fetch the first batch once and reuse it for both the ground truth and the
# prediction (previously the dataset was iterated twice to obtain the same
# batch). Each dataset element is an (inputs, targets) pair; index 0 is the
# input tensor.
first_batch = next(iter(train_dataset.take(1)))
batch_inputs = first_batch[0]

value = batch_inputs.numpy()[j]
pred = model.predict(batch_inputs)

# One point per feature of row `j`; points on the diagonal are features that
# were reconstructed exactly.
plt.scatter(pred[j], value)
plt.xlabel('reconstructed value')
plt.ylabel('input value')
plt.show()

## Predict

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model

In [None]:
def AutoEncoder(n_input):
    """Build and compile the dense autoencoder used for prediction.

    This repeats the definition from the training section of the notebook:
    Dense -> BatchNormalization -> ReLU stages of width 256, 128, 70
    (bottleneck), 128 and 256, then a linear Dense layer with ``n_input``
    units.

    Args:
        n_input: Number of features per sample (input and output width).

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 0.0005) and
        mean-squared-error loss.
    """

    def _stage(tensor, units):
        # One Dense -> BatchNorm -> ReLU stage. Layers are instantiated in
        # this fixed order because a later cell retrieves the bottleneck
        # activation by its auto-generated name ('re_lu_2').
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        return ReLU()(tensor)

    X_in = Input(shape=(n_input,))

    # Encoder.
    X = _stage(X_in, 256)
    X = _stage(X, 128)

    # Bottleneck.
    X = _stage(X, 70)

    # Decoder (mirror of the encoder).
    X = _stage(X, 128)
    X = _stage(X, 256)

    # Linear reconstruction of the original features.
    X_out = Dense(n_input, activation='linear')(X)

    model = Model(inputs=X_in, outputs=X_out)
    adam = tf.keras.optimizers.Adam(learning_rate=0.0005)
    model.compile(optimizer=adam, loss='mse')

    return model

In [None]:
# Columns treated as categorical; dropped so only numeric features remain.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

# NOTE: despite its name, `train` holds the *test* split here. The name is
# kept because the following cells refer to it.
train = pd.read_pickle('Data/test_data.pkl')
train.drop('S_2', axis=1, inplace=True)
train.drop(cat_features, axis=1, inplace=True)

# Move the index into a regular column; the code below expects it to surface
# as 'customer_ID'.
train.reset_index(drop=False, inplace=True)

num_columns = train.columns.to_list()
num_columns.remove('customer_ID')

# Median-impute every numeric column.
# The filled column is assigned back explicitly: calling
# `train[col].fillna(..., inplace=True)` acts on a temporary Series and, under
# pandas Copy-on-Write, never updates `train` (the corresponding warning is
# hidden by the notebook's `warnings.filterwarnings("ignore")`).
# NOTE(review): these medians come from this file only, whereas the training
# section imputes with medians of train+test combined — confirm the
# difference is acceptable.
for col in num_columns:
    train[col] = train[col].fillna(train[col].median())


train.shape

In [None]:
# Rebuild the architecture, then populate it with previously trained weights.
# NOTE(review): the training section saves its model to
# 'Models/AutoEncoder_bottleneck70_learningrate0005_ShuffeledData.h5', not to
# the file loaded here — confirm this is the intended checkpoint.
model = AutoEncoder(n_input=len(num_columns))
model.load_weights('Models/AutoEncoder_70.h5')

In [None]:
# Feature matrix in `num_columns` order, followed by the autoencoder's
# reconstruction of every row.
sample = train.loc[:, num_columns].to_numpy()
pred = model.predict(sample)

In [None]:
# Per-row reconstruction error: squared differences summed over the features
# and divided by the number of features.
pred_df = pd.DataFrame(pred, columns=num_columns)
n_features = pred_df.shape[1]
error = (pred_df - sample).pow(2).sum(axis=1) / n_features

In [None]:
# Drop the large reconstruction intermediates; only `error` is kept.
del pred, pred_df, sample
gc.collect()

In [None]:
# Select the bottleneck activation by its position among the ReLU layers
# instead of by the auto-generated name 're_lu_2'. Keras numbers such names
# with a counter shared by every model built in the session, so that name
# only exists when `model` is the first network created in the kernel.
BOTTLENECK_RELU_INDEX = 2  # third ReLU: after the 256-, 128- and 70-unit Dense layers

relu_layers = [layer for layer in model.layers if isinstance(layer, ReLU)]

inputs = model.input
outputs = relu_layers[BOTTLENECK_RELU_INDEX].output

# Sub-model that maps raw features to the bottleneck representation.
encoder = Model(inputs, outputs)

In [None]:
# Re-extract the feature matrix (the earlier copy was deleted) and encode
# every row with the encoder sub-model.
sample = train.loc[:, num_columns].to_numpy()
enc_pred = encoder.predict(sample)

In [None]:
# Name the encoded dimensions 'enc_0', 'enc_1', ... in column order.
latent_names = ['enc_' + str(k) for k in range(enc_pred.shape[1])]
enc_df = pd.DataFrame(enc_pred, columns=latent_names)

# Attach the per-row reconstruction error and the customer identifier (both
# aligned on the row index), then index the result by customer.
enc_df['error'] = error
enc_df['customer_ID'] = train['customer_ID']
enc_df.set_index('customer_ID', inplace=True)

In [None]:
# Persist the encoded features plus reconstruction error, indexed by customer_ID.
enc_df.to_pickle(path='Data/enc_test.pkl')