In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Create the synthetic dataset

In [2]:
from sklearn.datasets import make_classification


# Total number of feature columns in the synthetic dataset.
num_columns = 20

# Binary classification problem in which every column is informative
# (no redundant or repeated features) and each class forms a single cluster.
# 3333 rows divide evenly by three, which the later np.split(X, 3) requires.
dataset_options = dict(
    n_samples=3333,
    n_features=num_columns,
    n_informative=num_columns,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    flip_y=0.01,
    class_sep=0.1,
    hypercube=False,
    shuffle=True,
    random_state=0,  # fixed seed so the generated data is the same on every run
)
X, y = make_classification(**dataset_options)

In [3]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score


def print_cross_val_results(estimator, X, y):
    """Cross-validate ``estimator`` on ``(X, y)`` and print the mean ROC AUC.

    Parameters
    ----------
    estimator
        Model passed unchanged to ``cross_val_score``.
    X
        Feature matrix passed unchanged to ``cross_val_score``.
    y
        Target labels passed unchanged to ``cross_val_score``.

    Returns
    -------
    None
        The mean of the per-fold scores is printed with two decimals;
        nothing is returned.
    """
    # Cross-validation itself runs in a single job (n_jobs=1).
    fold_scores = cross_val_score(estimator, X, y, scoring='roc_auc', n_jobs=1)
    message = 'Mean AUC score: {:.2f}'.format(np.mean(fold_scores))
    print(message)


# Classifier shared by every experiment in this notebook.
# n_jobs=-1 is passed to XGBoost (by convention: use all available cores).
estimator = XGBClassifier(n_jobs=-1)

# Reference score on the complete, unmodified dataset.
print_cross_val_results(estimator=estimator, X=X, y=y)

Mean AUC score: 0.90


In [4]:
# Cut features and labels into three equally sized, consecutive parts.
# np.split raises if the rows cannot be divided evenly.
num_parts = 3
(X_1, X_2, X_3), (y_1, y_2, y_3) = (np.split(arr, num_parts) for arr in (X, y))

In [5]:
# Score each third of the dataset on its own, before any modification.
for X_, y_ in ((X_1, y_1), (X_2, y_2), (X_3, y_3)):
    print_cross_val_results(estimator=estimator, X=X_, y=y_)

Mean AUC score: 0.88
Mean AUC score: 0.89
Mean AUC score: 0.89


### First modification -- keep only the first half of the columns

In [6]:
# Keep only the first half of the columns of the first part.
kept_columns = range(num_columns // 2)
X_1_damaged = X_1[:, kept_columns]

print_cross_val_results(estimator=estimator, X=X_1_damaged, y=y_1)

Mean AUC score: 0.77


### Second modification -- dimensionality reduction *via* PCA

In [7]:
from sklearn.decomposition import PCA


# Project the second part onto its first num_columns // 2 principal components.
# random_state is pinned because, depending on the scikit-learn version, PCA's
# default svd_solver='auto' may select the randomized solver for data of this
# shape, which would make the projection (and the score below) vary per run.
pca = PCA(n_components=num_columns // 2, random_state=0)
X_2_damaged = pca.fit_transform(X_2)

print_cross_val_results(estimator, X_2_damaged, y_2)

Mean AUC score: 0.83


### Third modification -- random rotation, then keep only the first half of the columns

In [8]:
from scipy.stats import special_ortho_group


# Fixed-seed random rotation matrix of size num_columns x num_columns.
rot_mat = special_ortho_group.rvs(dim=num_columns, random_state=42)

# Rotate the third part, then keep only the first half of the rotated columns.
X_3_rotated = np.dot(X_3, rot_mat)
kept_columns = range(num_columns // 2)
X_3_damaged = X_3_rotated[:, kept_columns]

print_cross_val_results(estimator=estimator, X=X_3_damaged, y=y_3)

Mean AUC score: 0.79


In [None]:
# Column-shuffled copy of the first part.
# np.random.choice(range(num_columns), num_columns) samples WITH replacement,
# so it duplicates some columns and drops others instead of permuting them,
# and it draws from the unseeded global RNG. A seeded permutation gives a true,
# reproducible reordering of all columns.
X_1_perm = X_1[:, np.random.RandomState(0).permutation(num_columns)]

# Frames (and their matching labels) handed to the unifying autoencoder.
# This cell used to assign `frames`/`ys` three times in a row; only the last
# assignment ever took effect, so that selection is the one kept here.
# Other combinations of X_1_damaged / X_2_damaged / X_3_damaged can be tried
# by editing these two lists together.
frames = [X_2_damaged, X_3_damaged]
ys = [y_2, y_3]

### Applying the unifying autoencoder

In [None]:
from sklearn.preprocessing import StandardScaler
from unifying_autoencoder import UnifyingAutoEncoder
from sklearn.model_selection import train_test_split


# Standardize every frame on its own. fit_transform refits the scaler on each
# call, so afterwards `scaler` holds the statistics of the last frame only.
scaler = StandardScaler()
scaled_frames = [scaler.fit_transform(frame) for frame in frames]

# One input width (number of columns) per frame.
inp_shapes = [scaled.shape[1] for scaled in scaled_frames]

# NOTE(review): the meaning of these hyper-parameters is defined in the
# project-local unifying_autoencoder module -- confirm there before tuning.
autoencoder_options = dict(
    inp_shapes=inp_shapes,
    unifying_dim=100,
    num_neurons=40,
    num_epochs=5,
    num_layers=3,
)
uniae = UnifyingAutoEncoder(**autoencoder_options)

num_rounds = 20
# The labels do not change between rounds, so concatenate them once.
all_labels = np.concatenate(ys)

for i in range(num_rounds):
    # Score first, fit second: the first printed score therefore comes from
    # the autoencoder before any fit call made in this cell.
    undamaged_frames = uniae.unify(scaled_frames)
    X_undamaged = np.concatenate(undamaged_frames)
    print_cross_val_results(estimator, X_undamaged, all_labels)
    uniae.fit(scaled_frames, verbose=0)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


Mean AUC score: 0.82
Mean AUC score: 0.81
Mean AUC score: 0.82
Mean AUC score: 0.83
Mean AUC score: 0.83
Mean AUC score: 0.83
Mean AUC score: 0.83
Mean AUC score: 0.81
Mean AUC score: 0.81
Mean AUC score: 0.83
Mean AUC score: 0.82
Mean AUC score: 0.83
Mean AUC score: 0.82
Mean AUC score: 0.82
Mean AUC score: 0.83
Mean AUC score: 0.83
Mean AUC score: 0.83
Mean AUC score: 0.83
