In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import glob
import cPickle as pickle
from collections import Counter
from time import time

from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit

import gc



In [None]:
# Load the pickled collection of flag column names. The file is opened in a
# `with` block so the handle is closed deterministically instead of being
# left to the garbage collector.
# NOTE(review): pickle can execute arbitrary code on load; only use this with
# a trusted, locally produced file.
with open('../saved/flags_columns.p', 'rb') as flags_file:
    flag_cols = pickle.load(flags_file)

# Base name of every flag column: the text before its last underscore.
flag_cols_base = {col.rsplit('_', 1)[0] for col in flag_cols}

# Names (file name without extension) of the encodings already on disk.
saved_enc = {
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob('../saved/stupid_encode/VAR*.npy')
}

# Keep only the base columns that do not have a saved encoding yet.
flag_cols_base = flag_cols_base - saved_enc

In [None]:
# xgboost settings used for every single-column encoder model.
params = {
    'objective': "binary:logistic",
    'eta': 0.1,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 1.0,
    # 'max_delta_step': 1,
    'silent': 1,
    'max_depth': 3,  # 7
    'eval_metric': 'auc',
}


def fpreproc(dtrain, dtest, param):
    """Set `scale_pos_weight` from the class balance of the training labels.

    The weight is (number of labels equal to 0) / (number of labels equal
    to 1), computed from `dtrain.get_label()`. `param` is updated in place.

    Returns the tuple `(dtrain, dtest, param)`; `dtrain` and `dtest` are
    passed through untouched.

    NOTE(review): presumably meant as the `fpreproc` callback of `xgb.cv`;
    it is not called in the visible cells -- confirm.
    """
    labels = dtrain.get_label()
    n_negative = np.sum(labels == 0)
    n_positive = np.sum(labels == 1)
    param['scale_pos_weight'] = float(n_negative) / n_positive
    return (dtrain, dtest, param)


# The same settings as a list of (name, value) pairs.
plst = list(params.items())

# Upper bound on the number of boosting rounds per encoder model.
max_rounds = 50

In [None]:
PATH_STORE = '../saved/storage.h5'
mode = 'train'

# col_to_enc = flag_cols_base[0]

for col_ii, col_to_enc in enumerate(flag_cols_base):
    tic = time()
    print '%d/%d' % (col_ii, len(flag_cols_base)),
    
    columns = [col_to_enc] + ['target']
    df = pd.read_hdf(PATH_STORE, mode, columns=columns)
    print col_to_enc, 
    
    # Setting up the rows used for encoding and the rows to hold out
    x_train = df[col_to_enc][:, None]
    y_train = np.array(df.target)
    del(df)
    
    if os.path.exists('../saved/stupid_encode/enc_ind.npy'):
        enc_ind = np.load('../saved/stupid_encode/enc_ind.npy')
        val_ind = np.load('../saved/stupid_encode/val_ind.npy')
    else:
        enc_ind, val_ind = iter(StratifiedShuffleSplit(y_train, random_state=322)).next()
        np.save('../saved/stupid_encode/enc_ind.npy', enc_ind)
        np.save('../saved/stupid_encode/val_ind.npy', val_ind)
        
    denc = xgb.DMatrix(data=x_train[enc_ind], label=y_train[enc_ind])
    dval = xgb.DMatrix(data=x_train[val_ind], label=y_train[val_ind])
    
    # Train encoding xgboost model
    watchlist = [(denc, 'train'),(dval, 'val')]
    bst = xgb.train(params, denc, max_rounds, evals=watchlist, early_stopping_rounds=20, verbose_eval=False)
    bst.save_model('../saved/stupid_encode/%s.model' % col_to_enc)
    
    # Predict (encode)
    dall = xgb.DMatrix(x_train)
    encoded = bst.predict(dall, ntree_limit=bst.best_iteration)
    np.save('../saved/stupid_encode/%s.npy' % col_to_enc, encoded)
    
    print 'Time: %g' % (time() - tic)
    
    del(x_train)
    del(y_train)
    del(denc)
    del(dval)
    del(dall)
    gc.collect()
#     break

# The mass conversion

In [None]:
%%time

# Load the complete frame for the selected dataset from the HDF5 store.
PATH_STORE = '../saved/storage.h5'
# `mode` doubles as the HDF5 key; switch the two lines below to pick the dataset.
# mode = 'train'
mode = 'test'

df = pd.read_hdf(PATH_STORE, mode)
# Quick check of the loaded frame's (rows, columns) shape.
print df.shape

In [None]:
%%time

if mode == 'train':
    for col in saved_enc:
        loaded = np.load('../saved/stupid_encode/%s.npy' % col)
        df[col] = loaded
        gc.collect()
elif mode == 'test':
    for ii, col in enumerate(saved_enc):
        tic = time()
        print ii,
        encoder = xgb.Booster(model_file='../saved/stupid_encode/%s.model' % col)
        dm = xgb.DMatrix(df[col][:, None])
        df[col] = encoder.predict(dm)
        gc.collect()
        
#         print '%d sec' % (time() - tic),
    
print df.shape

In [None]:
# Record the frame's shape in a small log file (overwritten on each run).
with open('/tmp/xgbenclog.log', 'w') as log_file:
    shape_text = str(df.shape)
    log_file.write(shape_text)

In [None]:
%%time
# HDF5
# Store the encoded frame in table format under the key held in `mode`
# ('train' / 'test'). `DataFrame.to_hdf` is used instead of `pd.get_store`,
# which is deprecated and no longer available in later pandas releases; it
# opens the file in append mode and puts the frame under the key, matching
# the previous `store.put(mode, df, format='table')`.
# Alternative destination: '../saved/storage_xgbenc.h5'
df.to_hdf('/tmp/storage_xgbenc.h5', key=mode, format='table')

In [None]:
# Report the current (rows, columns) shape of `df`.
print df.shape

### Convert to libsvm

In [1]:
%%time
# pandas / numpy are imported again in this cell.
# NOTE(review): presumably so the libsvm conversion can be run on its own
# from a fresh kernel -- confirm.
import pandas as pd
import numpy as np
# libsvm
# `mode` doubles as the HDF5 key; switch the two lines below to pick the dataset.
# mode = 'train'
mode = 'test'

# Read back the encoded frame (alternative location kept commented out).
# df = pd.read_hdf('../saved/storage_xgbenc.h5', mode)
df = pd.read_hdf('/tmp/storage_xgbenc.h5', mode)

# Quick check of the loaded frame's (rows, columns) shape.
print df.shape

(145232, 2152)
CPU times: user 6.87 s, sys: 2.24 s, total: 9.11 s
Wall time: 27.5 s


In [4]:
# Fill missing values with 0 in place, then separate the labels from the
# features: in train mode the `target` column is removed from `df` and kept
# as `labels`; otherwise `labels` is an all-zero vector with one entry per row.
df.fillna(0, inplace=True)
labels = df.pop('target') if mode == 'train' else np.zeros(len(df))

In [None]:
%%time
# Destination of the svmlight/libsvm text file for the selected dataset.
PATH_SAVE_LIBSVM = '/tmp/{0}_xgbenc.libsvm'.format(mode)

from sklearn.datasets import dump_svmlight_file, load_svmlight_file
# from svmlight_loader import dump_svmlight_file, load_svmlight_file

# Write the features together with their labels to PATH_SAVE_LIBSVM.
dump_svmlight_file(df, labels, PATH_SAVE_LIBSVM)

# Scratch

In [None]:
enc_ind = np.load('../saved/stupid_encode/enc_ind.npy')
val_ind = np.load('../saved/stupid_encode/val_ind.npy')
print enc_ind.shape
print val_ind.shape

loaded = np.load('../saved/stupid_encode/VAR_0002.npy')
print loaded.shape

In [None]:
# Report the current (rows, columns) shape of `df`.
print df.shape

In [None]:
# Run a garbage-collection pass to release objects that are no longer referenced.
gc.collect()