In [21]:
import pandas as pd
import numpy as np
import itertools
import xgboost as xgb
import os
import glob
import gc
import shutil

from svmlight_loader import dump_svmlight_file, load_svmlight_file
# from sklearn.datasets import dump_svmlight_file, load_svmlight_file

# Dataset selector. It is used as the key of the table read from the HDF store,
# as the prefix of the generated file names, and to decide whether real targets
# are loaded. Switch datasets by (un)commenting one of the two lines below.
# mode = 'train'
mode = 'test'

In [13]:
%%time
PATH_STORE = '../saved/storage.h5'
cols_df = pd.read_hdf(PATH_STORE, mode, 
                       where=["index==0"])
if mode == 'train':
    cols = cols_df.columns.drop('target')
else:
    cols = cols_df.columns

store = pd.HDFStore(PATH_STORE)
n_rows = store.get_storer(mode).nrows
store.close()

print '# rows: %d' % n_rows
print 'Columns:', cols

# rows: 145232
Columns: Index([u'VAR_0002', u'VAR_0003', u'VAR_0004', u'VAR_0006', u'VAR_0007', u'VAR_0013', u'VAR_0014', u'VAR_0015', u'VAR_0016', u'VAR_0017', u'VAR_0033', u'VAR_0034', u'VAR_0035', u'VAR_0036', u'VAR_0037', u'VAR_0045', u'VAR_0046', u'VAR_0047', u'VAR_0048', u'VAR_0049', u'VAR_0050', u'VAR_0051', u'VAR_0052', u'VAR_0053', u'VAR_0054', u'VAR_0055', u'VAR_0056', u'VAR_0057', u'VAR_0058', u'VAR_0059', u'VAR_0060', u'VAR_0061', u'VAR_0062', u'VAR_0063', u'VAR_0064', u'VAR_0065', u'VAR_0066', u'VAR_0067', u'VAR_0068', u'VAR_0069', u'VAR_0070', u'VAR_0071', u'VAR_0072', u'VAR_0074', u'VAR_0076', u'VAR_0077', u'VAR_0078', u'VAR_0079', u'VAR_0080', u'VAR_0081', u'VAR_0082', u'VAR_0083', u'VAR_0084', u'VAR_0085', u'VAR_0086', u'VAR_0087', u'VAR_0088', u'VAR_0089', u'VAR_0090', u'VAR_0091', u'VAR_0092', u'VAR_0093', u'VAR_0094', u'VAR_0095', u'VAR_0096', u'VAR_0097', u'VAR_0098', u'VAR_0099', u'VAR_0100', u'VAR_0101', u'VAR_0102', u'VAR_0103', u'VAR_0104', u'VAR_0105', u'VAR_0

In [14]:
if mode == 'train':
    targets_df = pd.read_pickle('../saved/train_targets.p')
    targets = np.array(targets_df['target'])
else:
    targets = -1*np.ones(n_rows)
print targets.shape

(145232,)


# Rasco Params

In [15]:
# Fraction of all feature columns drawn into each random subspace.
feat_ratio = 0.7
# Number of random subspaces that get generated (one index set per estimator).
n_estimators = 5
# NOTE(review): not used in the visible cells -- presumably the iteration cap
# of the co-training loop; confirm.
max_iters = 20
# NOTE(review): not used in the visible cells -- presumably how many samples
# are transferred per iteration; confirm.
n_xfer = 10

# Number of feature columns available for the selected mode.
n_feats = len(cols)
# Size of each subspace; int() truncates the fractional part.
n_feats_subsp = int(feat_ratio * n_feats)


In [17]:
np.random.seed(33)
sub_sps_inds = [np.random.permutation(n_feats)[:n_feats_subsp]
                for _ in range(n_estimators)]

span = len(set(itertools.chain(*sub_sps_inds))) / float(n_feats)
print 'Span factor :', span

sub_sps_cols = [cols[inds] for inds in sub_sps_inds]

Span factor : 1.0


## Making libsvm and xgb DMatrices for subspaces

In [18]:
%%time
PATH_SAVE_LIBSVM_PART = '../saved/rasco/%s_sub%d_%d.libsvm'  # mode sub_i and part
chunksize = 20000

for sub_i, sub_sp_cols in enumerate(sub_sps_cols):
       
    print 'Processing subspace # %d' % sub_i
    
    for chunk_i in xrange(n_rows//chunksize + 1):
        print 'Chunk %d / %d' % (chunk_i, n_rows//chunksize + 1),
        
        X = pd.read_hdf(PATH_STORE, mode, columns=sub_sp_cols,
                        start=chunk_i*chunksize, stop=(chunk_i+1)*chunksize)
        
        print ' | Shape:', X.shape
        
        dump_svmlight_file(
            X=X, y=targets[chunk_i*chunksize:chunk_i*chunksize + X.shape[0]], 
            f=PATH_SAVE_LIBSVM_PART % (mode, sub_i, chunk_i))
        
        del X
        gc.collect()
    print 'Done with subspace #', sub_i
    
#     break


Processing subspace # 0
Chunk 0 / 8  | Shape: (20000, 1506)
Chunk 1 / 8  | Shape: (20000, 1506)
Chunk 2 / 8  | Shape: (20000, 1506)
Chunk 3 / 8  | Shape: (20000, 1506)
Chunk 4 / 8  | Shape: (20000, 1506)
Chunk 5 / 8  | Shape: (20000, 1506)
Chunk 6 / 8  | Shape: (20000, 1506)
Chunk 7 / 8  | Shape: (5232, 1506)
Done with subspace # 0
Processing subspace # 1
Chunk 0 / 8  | Shape: (20000, 1506)
Chunk 1 / 8  | Shape: (20000, 1506)
Chunk 2 / 8  | Shape: (20000, 1506)
Chunk 3 / 8  | Shape: (20000, 1506)
Chunk 4 / 8  | Shape: (20000, 1506)
Chunk 5 / 8  | Shape: (20000, 1506)
Chunk 6 / 8  | Shape: (20000, 1506)
Chunk 7 / 8  | Shape: (5232, 1506)
Done with subspace # 1
Processing subspace # 2
Chunk 0 / 8  | Shape: (20000, 1506)
Chunk 1 / 8  | Shape: (20000, 1506)
Chunk 2 / 8  | Shape: (20000, 1506)
Chunk 3 / 8  | Shape: (20000, 1506)
Chunk 4 / 8  | Shape: (20000, 1506)
Chunk 5 / 8  | Shape: (20000, 1506)
Chunk 6 / 8  | Shape: (20000, 1506)
Chunk 7 / 8  | Shape: (5232, 1506)
Done with subspace # 

In [25]:
%%time
# Merging libsvm parts
PATH_SAVE_LIBSVM_FULL = PATH_SAVE_LIBSVM_PART.rsplit('_', 1)[0] + '.libsvm'

for sub_i, sub_sp_cols in enumerate(sub_sps_cols):
    print 'Merging libsvm parts for subspace # %d' % sub_i
    with open(PATH_SAVE_LIBSVM_FULL % (mode, sub_i), 'w') as outfile:
        fnames = glob.glob(PATH_SAVE_LIBSVM_PART.rsplit('_', 1)[0] % (mode, sub_i) + '_*')
        for fname in fnames:
            with open(fname, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    gc.collect()

    

Merging libsvm parts for subspace # 0
Merging libsvm parts for subspace # 1
Merging libsvm parts for subspace # 2
Merging libsvm parts for subspace # 3
Merging libsvm parts for subspace # 4
CPU times: user 3.34 s, sys: 3min 36s, total: 3min 39s
Wall time: 12min 42s


In [15]:
%%time
PATH_SAVE_BIN = '../saved/rasco/%s_sub%d.bin'

for sub_i, sub_sp_cols in enumerate(sub_sps_cols):
    print 'xgb DMatrix binary save for subspace # %d' % sub_i
    
    d_sub = xgb.DMatrix(PATH_SAVE_LIBSVM_FULL % sub_i)
    d_sub.save_binary(PATH_SAVE_BIN % (mode, sub_i))
    print 'Done:', sub_i

    del d_sub
    gc.collect()
#     break

xgb DMatrix binary save for subspace # 0
Done: 0
xgb DMatrix binary save for subspace # 1
Done: 1
xgb DMatrix binary save for subspace # 2
Done: 2
xgb DMatrix binary save for subspace # 3
Done: 3
xgb DMatrix binary save for subspace # 4
Done: 4
CPU times: user 2min 27s, sys: 3min 38s, total: 6min 6s
Wall time: 11min 50s


# Merging train & test libsvm files 


In [30]:
%%time
# Merging libsvm train & test

for sub_i, sub_sp_cols in enumerate(sub_sps_cols):
    print 'Merging libsvm train and test for subspace # %d' % sub_i
    with open(PATH_SAVE_LIBSVM_FULL % ('both', sub_i), 'w') as outfile:
        fnames = glob.glob(PATH_SAVE_LIBSVM_FULL % ('*', sub_i))
        for fname in fnames:
            with open(fname, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    gc.collect()

Merging libsvm train and test for subspace # 0
Merging libsvm train and test for subspace # 1
Merging libsvm train and test for subspace # 2
Merging libsvm train and test for subspace # 3
Merging libsvm train and test for subspace # 4
CPU times: user 4.61 s, sys: 7min 4s, total: 7min 9s
Wall time: 35min 49s


In [29]:
# Display the merged-file path template; it is formatted with (mode, subspace index).
PATH_SAVE_LIBSVM_FULL

'../saved/rasco/%s_sub%d.libsvm'

# Saving xgb DMatrix binaries for the combined train & test files

In [None]:
PATH_LIBSVM = '../saved/rasco/both_sub%d.libsvm'

n_subsp = len(glob.glob(PATH_LIBSVM.replace('%d', '%s') % '*'))
print n_subsp


In [None]:
%%time

PATH_SAVE_BIN = '/tmp/both_sub%d.bin'

for sub_i in range(n_subsp):
    print 'xgb DMatrix binary save for subspace # %d' % sub_i
    
    d_sub = xgb.DMatrix(PATH_LIBSVM % sub_i)
    d_sub.save_binary(PATH_SAVE_BIN % sub_i)
    print 'Done:', sub_i

    del d_sub
    gc.collect()
