In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sys import getsizeof
import time
import gc
import tqdm

%matplotlib inline



In [2]:
import pickle

def save_pickle(x, filename):
    '''
    Serialize object `x` to the file `filename` (created or overwritten),
    using the highest pickle protocol available to the running interpreter.
    '''
    with open(filename, 'wb') as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(x)

def read_pickle(filename):
    '''
    Load and return the object stored in the pickle file `filename`.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    '''
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [15]:
# Load the date table from the zipped CSV; the first CSV column becomes the index.
# NOTE(review): float16 needs half the memory of float32 but carries only about
# 3 significant decimal digits (largest finite value 65504) - confirm this
# precision is sufficient for the date values before relying on differences.
train_date = pd.read_csv('data/train_date.csv.zip', index_col=0, dtype=np.float16)

  mask |= (ar1 == a)


### It turns out that a NumPy array is not more memory-efficient than a pandas DataFrame (both come out at about 2 bytes per value)

In [17]:
# Load the pickled training features (the file is produced outside the cells shown
# here - presumably with save_pickle; verify). Only unpickle files you trust.
x_train = read_pickle('x_train_numeric_date_0.pickle')

In [22]:
# Memory held by the array's data buffer in GB (10**9 bytes), and its shape.
x_train.nbytes*10**-9, x_train.shape

(1.44417134, (1183747, 610))

In [23]:
# Memory used by the DataFrame in GB (10**9 bytes), summed over all columns plus
# the index, and its shape - for comparison with the array above.
train_date.memory_usage(deep=True).sum()*10**-9, train_date.shape

(2.7462930400000003, (1183747, 1156))

In [None]:
def _propagate_failure_value(marked, forward):
    '''
    Give every row the nearest non-zero entry of `marked` found on one side of it.
    marked: Series in sorted order, holding a value on failure rows and 0 elsewhere
    forward=False: nearest non-zero entry strictly before the row, 0 if there is none
    forward=True: nearest non-zero entry strictly after the row, 9999 if there is none
    '''
    if forward:
        shifted = marked.shift(-1).fillna(9999)
    else:
        shifted = marked.shift(1).fillna(0)
    # 0 is the "nothing here" sentinel: blank those entries and fill them from the
    # neighbouring side. Entries with nothing to fill from fall back to 0, which is
    # what Series.replace(0, method=...) used to leave in place.
    gaps = shifted.mask(shifted == 0)
    filled = gaps.bfill() if forward else gaps.ffill()
    return filled.fillna(0)


def bayesian_time_diff(x_train, x_test, suffix, shift, feat):
    '''
    Add a feature with the mean distance, measured in `feat`, from each row to its
    neighboring failures (rows with Response == 1).

    Train and test rows are stacked and sorted by `feat`. For k = 1..|shift| a helper
    column res<k> is built by following the chain of failures one more step, and the
    new column `feat + suffix` is the row-wise mean of res1..res<|shift|>.

    shift=n, n>0: n next failures
    shift=n, n<0: n previous failures
    x_train: has Response column
    x_test: does not have Response column, other columns should be the same as x_train
    suffix: appended to `feat` to name the new column
    feat: feature used for sorting and calculating distance

    Returns (x_train0, x_test0) in the original row order; x_test0 has no Response column.
    Visible in the result: NaNs in every column are replaced by 0 and Response is cast
    to int. Columns named 'idx' or 'res<k>' are used as scratch space and dropped.
    Raises AssertionError when shift == 0.
    '''
    assert(shift!=0), 'shift cannot equal 0!'

    forward = shift > 0
    shift_abs = abs(shift)
    n_train = len(x_train)
    res_cols = ['res' + str(n) for n in range(1, shift_abs + 1)]

    # Stack train and test; test rows get Response = 0 through the fillna.
    x = pd.concat([x_train, x_test]).fillna(0)
    x['idx'] = np.arange(len(x), dtype=np.int32)  # remembers the original row order
    x['Response'] = x['Response'].astype(int)
    x = x.sort_values(feat, axis=0)

    # Positional boolean mask, so nothing below depends on index-label alignment.
    failed = (x['Response'] == 1).values
    feat_values = x[feat].astype(np.float64)

    # 1st distance: gap between the row and the nearest failure on the chosen side.
    # Rows with no earlier failure are measured from 0, rows with no later failure
    # up to 9999. A failure whose `feat` is 0 is indistinguishable from "no failure".
    nearest = _propagate_failure_value(feat_values.where(failed, 0), forward)
    x['res1'] = (nearest - feat_values) if forward else (feat_values - nearest)

    # Higher orders: take res<n-1> of the nearest failure and add the row's own res<n-1>.
    # NOTE(review): for n >= 3 this adds the row's res<n-1> rather than res1, so res<n>
    # is larger than the plain distance to the n-th failure - kept as is, confirm intent.
    for prev_col, col in zip(res_cols, res_cols[1:]):
        carried = _propagate_failure_value(x[prev_col].where(failed, 0), forward)
        x[col] = carried + x[prev_col]

    x[feat + suffix] = x[res_cols].mean(axis=1)
    x = x.drop(res_cols, axis=1).sort_values('idx', axis=0).drop('idx', axis=1)

    x_train0 = x.iloc[:n_train]
    # Non-inplace drop: the test part is a slice of x and must not be mutated in place.
    x_test0 = x.iloc[n_train:].drop('Response', axis=1)

    return x_train0, x_test0