In [20]:
import sys

In [21]:
sys.path.append('../')

In [22]:
import pandas as pd
import numpy as np
from scipy.special import erfinv
import matplotlib.pyplot as plt


import torch
from torch import nn
from torch.utils.data import * 
from torch.optim import *
from fastai.model import *
from fastai.column_data import *
from fastai.dataloader import *

%matplotlib inline

In [23]:
train = pd.read_feather('./data/train.feather')
test = pd.read_feather('./data/test.feather')

In [24]:
train.shape

(595212, 59)

In [25]:
train_test = pd.concat([train, test]).reset_index()

In [26]:
train_test = train_test.sample(frac=0.5)

In [27]:
train_test.reset_index(inplace=True)

### Feature Engineering

I dislike this part most, my creativity is too low for an average competition lifetime, also luck plays huge role here. Therefore I like representation learning, its also an step towards AI. Basically I removed *calc, added 1-hot to *cat features. Thats all I've done. No missing value replacement or something. This is featureset "f0" in the table. This ends up in exactly 221 dense features. With single precision floats its 1.3GB RAM (1e-9*4*221*(595212+892816)). Thanks to the public kernels (wheel of fortune eg.) that suggest to remove *calc features, I'm too blind and probably would not have figured this out by myself. I never remove features.

In [28]:
def preprocess(data):
    usecols = [False if "calc" in n else True for n in data.columns]
    data = data.loc[:,usecols]
    cat_features = [c for c in data.columns if 'cat' in c]
    add_df = data[cat_features]
    data = pd.get_dummies(data, columns=cat_features)
    data = pd.concat([data, add_df], axis= 1).drop('index', 1)
    return data

In [29]:
data = preprocess(train_test)

### Normalization (?)

Michael suggest creating uniform 0, 1 but it doesn't give gaussian??

Input normalization for gradient-based models such as neural nets is critical. For lightgbm/xgb it does not matter. The best what I found during the past and works straight of the box is "RankGauss". Its based on rank transformation. First step is to assign a linspace to the sorted features from 0..1, then apply the inverse of error function ErfInv to shape them like gaussians, then I substract the mean. Binary features are not touched with this trafo (eg. 1-hot ones). This works usually much better than standard mean/std scaler or min/max.

Generate Gaussian distributed random numbers using uniformly distributed random numbers. To convert a uniformly distributed random number  to a Gaussian distributed random number , use the transform

y = sqrt(2)*inverf(x)



In [30]:
def to_gauss(x): return np.sqrt(2)*erfinv(x) 

def normalize(data, exclude=None):
    norm_cols = [n for n, c in data.drop(exclude, 1).items() if len(np.unique(c)) > 2]
    n = data.shape[0]
    for col in norm_cols:
        sorted_idx = data[col].sort_values().index.tolist()
        uniform = np.linspace(start=-0.99, stop=0.99, num=n)
        normal = to_gauss(uniform)
        normalized_col = pd.Series(index=sorted_idx, data=normal)
        data[col] = normalized_col
    return data

In [31]:
norm_data = normalize(data, exclude=['id', 'target'])

### Unsupervised Learning with Data Augmentation

Denoising autoencoders (DAE) are nice to find a better representation of the numeric data for later neural net supervised learning. One can use train+test features to build the DAE. The larger the testset, the better :) An autoencoder tries to reconstruct the inputs features. So features = targets. Linear output layer. Minimize MSE. A denoising autoencoder tries to reconstruct the noisy version of the features. It tries to find some representation of the data to better reconstruct the clean one. With modern GPUs we can put much computing power to solve this task by touching peak floating point performance with huge layers. Sometimes I saw over 300W power consumption by checking nvidia-smi. So why manually constructing 2,3,4-way interactions, use target encoding, search for count features, impute features, when a model can find something similar by itself? The critical part here is to invent the noise. In tabular datasets we cannot just flip, rotate, sheer like people are doing this in images. Adding gaussian or uniform additive / multiplicative noise is not optimal since features have different scale or a discrete set of values that some noise just didnt make sense. **I found a noise schema called "swap noise". Here I sample from the feature itself with a certain probability "inputSwapNoise" in the table above. 0.15 means 15% of features replaced by values from another row. Two different topologies are used by myself.** Deep stack, where the new features are the values of the activations on all hidden layers. Second, bottleneck, where one middle layer is used to grab the activations as new dataset. This DAE step usually blows the input dimensionality to 1k..10k range.

I recommend linear activation in the middle layer of bottleneck setup because relu truncate the values <0. Yes just concat to a long feature vector. Here for a deep stack DAE 221-1500-1500-1500-221 you get new dataset with 4500 features.

In [32]:
dropcols= ['id', 'target']
X = np.array(norm_data.drop(dropcols, 1))

In [33]:
del norm_data
del train
del test
del train_test
del data

In [34]:
### Takes a numpy array and swaps a row of each 
### feature with another value from the same column with probability p
def inputSwapNoise(arr, p):
    n, m = arr.shape
    idx = range(n)
    swap_n = round(n*p)
    for i in range(m):
        col_vals = np.random.permutation(arr[:, i])
        swap_idx = np.random.choice(idx, size= swap_n)
        arr[swap_idx, i] = np.random.choice(col_vals, size = swap_n)
    return arr

In [57]:
# separate train, val
np.random.seed(42)
n = X.shape[1]
val_ratio = 0.3
val_idxs = np.random.permutation(range(n))[:round(n*0.3)]
trn_idxs = np.random.permutation(range(n))[round(n*0.3):]

In [90]:
class AEDataset(Dataset):
    ### AutoEncoder dataset class
    ### Takes denoising function
    ### Some kind of denoising function is recommended
    ### if hidden layer sizes are smaller then input dimension
    ### since there is risk of finding identity
    def __init__(self, arr, denoise_func=None, p=None):
        self.x = arr.copy()
        if denoise_func: self.x_tilde = denoise_func(arr.copy(), p)
        else: self.x_tilde = arr.copy()
            
    def __len__(self): return self.x.shape[0]
    
    def __getitem__(self, idx): return [self.x[idx], self.x_tilde[idx]]

In [91]:
class AEModelData(ModelData):
    def __init__(self, path, trn_ds, val_ds, bs, test_ds=None):
        super().__init__(path, DataLoader(trn_ds, bs, shuffle=True, num_workers=1),
            DataLoader(val_ds, bs*2, shuffle=False, num_workers=1))
    
    @classmethod
    def from_arrays(cls, path, arr, val_idxs, trn_idxs, denoise_func, p, bs):
        arr_trn = arr[val_idxs]
        arr_val = arr[trn_idxs]
        return cls(path, AEDataset(arr_trn, denoise_func, p),
                    AEDataset(arr_val, denoise_func, p), bs)


In [92]:
data = AEModelData.from_arrays('./tmp', X, val_idxs, trn_idxs, inputSwapNoise, 0.15, 128)

In [93]:
class AutoEncoder(nn.Module):
    ### Give encoding layers for example 
    ### if you have a 100, 15 tensor and want to learn two feature layers
    ### with sizes 30, 20
    ### then encoding_layers = [20, 20]
    
    ### After training calling compute_activations 
    ### will allow to get activations (new features) of the input data
    
    ### You can get features at any point of training by calling
    ### get features but if couple tries will be made
    ### it's more efficient to set self.compute_activations = False
    ### after intermediate calls of get_features
    
    def __init__(self, layers, compute_activations = False):
        super().__init__()
        self.layers = layers
        self.compute_activations = compute_activations
        for i in range(len(layers)-1):
            setattr(self, f"fc{i}", nn.Linear(layers[i], layers[i + 1]))
        
    def forward(self, x):
        self.activations = []
        for i in range(len(self.layers)-1):
            dotprod = getattr(self, f"fc{i}")
            x = dotprod(x)
            if self.compute_activations:
                self.activations += [x]
        return x

    def get_activations(self, x):
        self.forward(x)
        return self.activations
        
    def get_features(self,x):
        self.compute_activations = True
        self.get_activations(x)
        features = torch.cat([act.data for act in activations], 1)
        return features

In [94]:
data.trn_dl.dataset.x_tilde

array([[ 1.36026, -1.04715, -2.12373, ...,  0.1339 , -0.19071,  0.33044],
       [ 0.39406, -0.29776,  0.91621, ...,  1.50313, -0.19089,  0.33116],
       [-1.94254, -0.29801,  0.49812, ...,  0.13408,  0.84276,  0.33062],
       ..., 
       [-0.69143,  0.72075, -0.62501, ...,  1.48631, -0.19117,  0.33031],
       [ 1.49713,  0.72103,  1.02998, ...,  0.13402, -0.19105,  0.33058],
       [ 0.33487,  0.72117,  0.00435, ...,  0.13415, -0.19101,  0.33075]])

In [95]:
ae = AutoEncoder([222, 300, 300, 300, 222])

In [99]:
ae

AutoEncoder (
  (fc0): Linear (222 -> 300)
  (fc1): Linear (300 -> 300)
  (fc2): Linear (300 -> 300)
  (fc3): Linear (300 -> 222)
)

In [96]:
ae_model = BasicModel(ae)

In [97]:
learn = StructuredLearner(data, ae_model)

In [98]:
learn.lr_find()

A Jupyter Widget

  0%|          | 0/1 [00:00<?, ?it/s]


TypeError: torch.addmm received an invalid combination of arguments - got (int, torch.FloatTensor, int, torch.DoubleTensor, torch.FloatTensor, out=torch.FloatTensor), but expected one of:
 * (torch.FloatTensor source, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (torch.FloatTensor source, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (float beta, torch.FloatTensor source, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (torch.FloatTensor source, float alpha, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (float beta, torch.FloatTensor source, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (torch.FloatTensor source, float alpha, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
 * (float beta, torch.FloatTensor source, float alpha, torch.FloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
      didn't match because some of the arguments have invalid types: ([32;1mint[0m, [32;1mtorch.FloatTensor[0m, [32;1mint[0m, [31;1mtorch.DoubleTensor[0m, [32;1mtorch.FloatTensor[0m, [32;1mout=torch.FloatTensor[0m)
 * (float beta, torch.FloatTensor source, float alpha, torch.SparseFloatTensor mat1, torch.FloatTensor mat2, *, torch.FloatTensor out)
      didn't match because some of the arguments have invalid types: ([32;1mint[0m, [32;1mtorch.FloatTensor[0m, [32;1mint[0m, [31;1mtorch.DoubleTensor[0m, [32;1mtorch.FloatTensor[0m, [32;1mout=torch.FloatTensor[0m)
