In [1]:
import scipy
import numpy as np
import pandas as pd
from scipy.special import erfinv
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import QuantileTransformer
import sklearn
sklearn.__version__

'0.19.1'

### Loading data

In [3]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20.
# `sklearn.model_selection` provides the same helper and is the module the KFold
# import later in this notebook already uses.
from sklearn.model_selection import train_test_split



In [4]:
# Read the training table; the path is relative to the notebook's working directory.
X_train = pd.read_csv('data/train.csv')
# pop() removes the 'target' column from X_train in place and returns it as the label Series,
# so re-running this cell alone (without re-reading the CSV) would fail on the missing column.
y_train = X_train.pop('target')

In [5]:
# Quick sanity check: feature-matrix shape, number of positive labels, positive rate.
# print(...) with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(X_train.shape)
print(y_train.sum())
print(y_train.mean())

(595212, 58)
21694
0.0364475178592


### Preprocessing

'Basically I removed \*calc, added 1-hot to \*cat features. That's all I've done.'

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Column groups are chosen purely by substring matches on the column names.
all_columns = X_train.columns

# Dropped entirely: every column whose name contains 'calc' or 'id'.
bad_columns = all_columns[all_columns.str.contains('calc|id')]

# One-hot encoded later: every column whose name contains 'cat'.
categorical_columns = all_columns[all_columns.str.contains('cat')]

# Kept as-is: 'bin' columns that are not already in bad_columns.
# (An alternative would be to detect binary columns by checking that all of
# their unique values lie in {0, 1}; the name-based rule is what is used here.)
bin_named_columns = all_columns[all_columns.str.contains('bin')]
binary_cols = np.setdiff1d(bin_named_columns, bad_columns)

# Quantile-scaled: whatever is neither dropped nor binary.
# NOTE(review): categorical_columns are not excluded here, so the 'cat' columns are
# both quantile-scaled and one-hot encoded -- confirm this duplication is intended.
excluded_from_scaling = np.union1d(bad_columns, binary_cols)
cols_to_scale = np.setdiff1d(X_train.columns, excluded_from_scaling)

In [8]:
# Slice the training frame into the three feature groups defined by the column lists.
X_train_categorical = X_train.loc[:, categorical_columns]
X_train_to_scale = X_train.loc[:, cols_to_scale]
X_train_binary = X_train.loc[:, binary_cols]

#### Transform numerical columns

In [9]:
# Map every column in cols_to_scale onto a normal-shaped distribution via its empirical quantiles.
# QuantileTransformer estimates the quantiles on a random subsample of rows when the input
# has more rows than its `subsample` setting, so the seed is pinned to make re-runs reproducible.
quantile_scaler = QuantileTransformer(output_distribution='normal', random_state=0)
X_train_scaled = quantile_scaler.fit_transform(X_train_to_scale)

#### Transform categorical columns

In [10]:
# sparse=False makes the encoder return a dense array rather than a sparse matrix;
# the encoded block is later combined with the other feature arrays via np.hstack.
ohe = OneHotEncoder(sparse=False)

In [11]:
from sklearn.base import TransformerMixin
class ShiftCategoricalsTransformer(TransformerMixin):
    """Add 1 to every column that contained a negative value at fit time.

    ``fit`` records which columns of the fitted frame hold at least one value
    below zero; ``transform`` returns a copy of its input with exactly those
    columns incremented by one. All other columns are left untouched.
    """

    def fit(self, X, y=None, **kwargs):
        """Remember the columns of ``X`` that contain any negative entry.

        ``y`` and ``kwargs`` are accepted for API compatibility and ignored.
        Returns ``self``.
        """
        def has_negative(column_name):
            return np.any(X[column_name] < 0)

        negative_mask = X.columns.to_series().apply(has_negative)
        self.cols_to_add = X.columns[negative_mask]
        return self

    def transform(self, X, y=None, **kwargs):
        """Return a copy of ``X`` with the recorded columns shifted up by one."""
        shifted = X.copy()
        shifted[self.cols_to_add] = shifted[self.cols_to_add] + 1
        return shifted

In [12]:
# Shift up by one every categorical column that contains negative values (the -1 entries),
# fitting and applying the transformer in two explicit steps.
shift_categoricals = ShiftCategoricalsTransformer()
shift_categoricals.fit(X_train_categorical)
X_train_categorical_shifted = shift_categoricals.transform(X_train_categorical)

In [13]:
# Fit the encoder on the shifted categorical frame and one-hot encode it in one step.
X_train_categorical_ohe = ohe.fit_transform(X_train_categorical_shifted)

In [14]:
# Final feature matrix; column order is: quantile-scaled block, one-hot block, binary block.
X_train_full = np.concatenate(
    [X_train_scaled, X_train_categorical_ohe, X_train_binary],
    axis=1,
)

In [15]:
# Sanity check: (number of rows, number of assembled feature columns).
X_train_full.shape

(595212, 221)

### Noise generator

In [16]:
class SwapNoiseGenerator():
    """Endless mini-batch generator that corrupts inputs with "swap noise".

    Each cell of a sampled batch is, with probability ``input_swap_noise``,
    replaced by the value found in the same column of a randomly chosen row of
    the full matrix. The uncorrupted batch is yielded alongside as the target.
    """

    def __init__(self, data, input_swap_noise=0.15):
        # ``data`` is only stored; batch_generator works on the ``X`` passed to it.
        self.data = data
        self.input_swap_noise = input_swap_noise

    def batch_generator(self, X, y=None, batch_size=32, return_y=False):
        """Yield ``(noisy_batch, clean_batch)`` pairs forever.

        Parameters
        ----------
        X : 2-D numpy array
            Rows are samples, columns are features.
        y : array-like, optional
            Labels aligned row-for-row with ``X``. Required when ``return_y``
            is true; selected positionally.
        batch_size : int
            Rows per batch. Rows are drawn without replacement, so this must
            not exceed ``X.shape[0]`` (numpy raises ValueError otherwise).
        return_y : bool
            When true, each item is ``([noisy_batch, y_batch], clean_batch)``;
            otherwise ``(noisy_batch, clean_batch)``.

        Raises
        ------
        ValueError
            If ``return_y`` is true but ``y`` is None.
        """
        if return_y and y is None:
            raise ValueError('y must be provided when return_y=True')
        labels = np.asarray(y) if return_y else None

        nrow = X.shape[0]
        ncol = X.shape[1]
        # Loop-invariant: broadcasts against the (batch_size, ncol) row indices so that
        # every replacement value is taken from the same column it is written into.
        col_indices = np.arange(ncol)

        while True:
            batch_indices = np.random.choice(nrow, batch_size, replace=False)
            X_batch_output = X[batch_indices, :]

            # True where a cell should be swapped for a value from another row.
            replacement_mask = np.random.random(size=X_batch_output.shape) < self.input_swap_noise
            replacement_row_indices = np.random.choice(nrow, (batch_size, ncol), replace=True)
            replacement_matrix = X[replacement_row_indices, col_indices]

            X_batch_input = np.where(replacement_mask, replacement_matrix, X_batch_output)
            if return_y:
                yield [X_batch_input, labels[batch_indices]], X_batch_output
            else:
                # Without this branch the generator would also emit an unlabeled pair
                # after every labeled one, alternating between two output structures.
                yield X_batch_input, X_batch_output
        

### AutoEncoder

In [16]:
from keras.layers import Input, Dense, Dropout
from keras.models import Model
import keras
from keras import optimizers

Using TensorFlow backend.


In [18]:

# Denoising autoencoder: three 1500-unit ReLU layers followed by a linear reconstruction layer.
# The input/output width is read from the feature matrix instead of being hard-coded, so the
# network stays in sync if the preprocessing changes the number of columns.
n_features = X_train_full.shape[1]

input_ = Input(shape=(n_features,))

x1 = Dense(1500, activation='relu')(input_)
x2 = Dense(1500, activation='relu')(x1)
x3 = Dense(1500, activation='relu')(x2)
output = Dense(n_features, activation='linear')(x3)

autoencoder = Model(inputs=input_, outputs=output)

In [19]:
# Reconstruction objective: mean squared error between the network output and the target batch,
# optimised with Adam at its default settings (the commented-out alternatives are earlier experiments).
autoencoder.compile(
    optimizer='adam',#optimizers.adam(decay=0.95), #optimizers.SGD(lr=0.03, decay=0.95),
    loss='mean_squared_error'
)

In [21]:
BATCH_SIZE = 128
# Floor division keeps this an integer on Python 3 as well (plain `/` yields a float there);
# on Python 2 the value is unchanged. One epoch is roughly one pass over the rows.
STEPS_PER_EPOCH = X_train_full.shape[0] // BATCH_SIZE
MAX_EPOCHS = 70

In [22]:
# Swap probability of 0.15 per cell; the first constructor argument (data) is not used by the generator.
swap_noise_generator = SwapNoiseGenerator(None, 0.15)
# batch_size must be passed by keyword: the second positional parameter of
# batch_generator is `y`, so a positional BATCH_SIZE was bound to `y` and the
# generator silently fell back to its default batch size of 32.
batch_generator = swap_noise_generator.batch_generator(X_train_full, batch_size=BATCH_SIZE)

In [23]:
# Stop once the training loss has failed to improve by at least 1e-4 for 8 epochs in a row.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.0001,
    patience=8,
    verbose=1,
    mode='auto',
)

autoencoder.fit_generator(
    generator=batch_generator,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=MAX_EPOCHS,
    callbacks=[early_stopping],
)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 00056: early stopping


<keras.callbacks.History at 0x7f35dc644150>

In [24]:
# Feature extractor built on the same layers as the autoencoder: it maps an input row to
# the activations of all three hidden layers, returned as a list [x1, x2, x3].
concatenated_encoder = Model(inputs=input_, outputs=[x1, x2, x3])

In [25]:
# Persist both models to disk so the encoder can be reloaded later without retraining.
# NOTE(review): absolute, user-specific paths -- these will likely not exist on another machine.
autoencoder.save(filepath='/home/model/btopolski/DAE/autoencoder_model')
concatenated_encoder.save(filepath='/home/model/btopolski/DAE/concatenated_encoder')

### Load autoencoder

In [17]:
from keras.models import load_model
# Reload the multi-output encoder from the path it was saved to, replacing the in-memory model.
concatenated_encoder = load_model('/home/model/btopolski/DAE/concatenated_encoder')



In [None]:
# from keras import backend as K
# def get_encoding_function(model, layer_indices):
#     prediction_function = K.function(
#         [model.input, K.learning_phase()], 
#         [model.layers[i].output for i in layer_indices]
#     )
    
#     def prediction_concatenating_function(X):
#         return np.hstack(prediction_function((X, False)))
    
#     return prediction_concatenating_function

In [None]:
# encoding_function = get_encoding_function(autoencoder, [1,2,3])

### Model

In [18]:
from sklearn import metrics
from sklearn.model_selection import KFold
from keras import regularizers

In [19]:
def train_model_and_get_score(X_train, X_valid, y_train, y_valid, metrics=[metrics.log_loss, metrics.roc_auc_score],
                              batch_size=128, max_epochs=70):
    """Train a two-hidden-layer classifier on one split and score it on the validation part.

    Parameters
    ----------
    X_train, X_valid : 2-D arrays of features; the network input width is taken
        from ``X_train.shape[1]``.
    y_train, y_valid : binary labels aligned with the corresponding feature rows.
    metrics : list of callables ``f(y_true, y_pred)``; each is evaluated on the
        validation predictions. The list is only read, never modified.
    batch_size : mini-batch size passed to ``fit``.
    max_epochs : upper bound on epochs; early stopping on ``val_loss`` may end
        training sooner.

    Returns
    -------
    dict mapping each metric function's ``__name__`` to its validation score.
    """
    # Width comes from the data so the function is not tied to one encoder size.
    input_ = Input(shape=(X_train.shape[1],))
    x = Dropout(rate=0.1)(input_)
    # Feed the dropped-out tensor forward; wiring this layer to `input_` would
    # leave the input dropout disconnected from the model.
    x = Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.05))(x)
    x = Dropout(rate=0.5)(x)
    x = Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.05))(x)
    x = Dropout(rate=0.5)(x)
    output = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.05))(x)

    predictor = Model(inputs=input_, outputs=output)

    predictor.compile(
        optimizer=optimizers.Adam(decay=0.005),
        loss='binary_crossentropy',
    )

    # Stop when the validation loss has not improved by at least 2e-4 for 5 epochs.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=0.0002, patience=5, verbose=1, mode='auto'
    )

    predictor.fit(
        x=X_train,
        y=y_train,
        batch_size=batch_size,  # previously accepted by the function but never forwarded
        epochs=max_epochs,
        callbacks=[early_stopping],
        validation_data=(X_valid, y_valid)
    )

    predictions = predictor.predict(X_valid)

    return {fun.__name__: fun(y_valid, predictions) for fun in metrics}
    

In [20]:
# Shuffled 5-fold split; the seed is pinned so the fold assignment (and therefore the
# cross-validation scores) can be reproduced across runs.
cv_5fold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Run the encoder over every row: predict() returns one array per model output, and
# np.hstack joins them column-wise into a single encoded feature matrix.
# NOTE(review): the result is dense with one column per hidden unit across all encoder
# outputs -- check memory headroom before running on the full training set.
X_train_encoded = np.hstack(concatenated_encoder.predict(X_train_full))

In [None]:
# Cross-validate the classifier on the encoded features, one model per fold.
results = []
for train_index, test_index in cv_5fold.split(X_train_encoded):
    X_train_cv = X_train_encoded[train_index]
    X_test_cv = X_train_encoded[test_index]
    # KFold yields row positions, so select the labels positionally. Plain y_train[...]
    # is label-based on a Series and only matches positions while the index is 0..n-1.
    y_train_cv = y_train.iloc[train_index]
    y_test_cv = y_train.iloc[test_index]
    result = train_model_and_get_score(X_train_cv, X_test_cv, y_train_cv, y_test_cv, max_epochs=70)
    results.append(result)
    # The CSV is rewritten after every fold, presumably so that finished folds
    # are kept if the run is interrupted.
    res_df = pd.DataFrame(results)
    res_df.to_csv('/home/model/btopolski/DAE/cv_results.csv')

In [None]:
# Per-fold scores: a list with one {metric name: value} dict per fold.
results


In [None]:
# Rebuild the results table from the list of per-fold dicts (one row per fold, one column per metric).
res_df = pd.DataFrame(results)

In [None]:
# Mean validation log-loss across the folds.
res_df.log_loss.mean()