In [11]:
import pandas as pd
import numpy as np
import itertools
import xgboost as xgb
import os
import glob

from svmlight_loader import dump_svmlight_file, load_svmlight_file
# from sklearn.datasets import dump_svmlight_file, load_svmlight_file

In [2]:
# Location of the pickled object that holds the training labels.
PATH_TARGETS = '../saved/train_targets.p'

# Keep the loaded object around and also expose its 'target' column as a
# NumPy array, which the later cells slice positionally.
targets_df = pd.read_pickle(PATH_TARGETS)
targets = np.array(targets_df['target'])

In [3]:
%%time
PATH_STORE = '../saved/storage.h5'
cols_df = pd.read_hdf(PATH_STORE, 'train', 
                       where=["index==0"])
cols = cols_df.columns.drop('target')

store = pd.HDFStore(PATH_STORE)
n_rows = store.get_storer('train').nrows
store.close()

print '# rows: %d' % n_rows

# rows: 145175
CPU times: user 86.7 ms, sys: 50 ms, total: 137 ms
Wall time: 323 ms


# Rasco Params

In [4]:
# --- Tunable settings ------------------------------------------------------
# Fraction of the feature columns that goes into each random subspace.
feat_ratio = 0.7
# Number of random subspaces that get generated.
n_estimators = 5
# NOTE(review): the next two are not used by the visible cells; presumably an
# iteration cap and a per-iteration transfer count -- TODO confirm.
max_iters = 20
n_xfer = 10

# --- Derived sizes ---------------------------------------------------------
# Total number of feature columns, and how many of them each subspace keeps
# (truncated towards zero by int()).
n_feats = len(cols)
n_feats_subsp = int(n_feats * feat_ratio)


In [5]:
np.random.seed(33)
sub_sps_inds = [np.random.permutation(n_feats)[:n_feats_subsp]
                for _ in range(n_estimators)]

span = len(set(itertools.chain(*sub_sps_inds))) / float(n_feats)
print 'Span factor :', span

sub_sps_cols = [cols[inds] for inds in sub_sps_inds]

Span factor : 1.0


## Making libsvm and xgb DMatrices for subspaces

In [6]:
%%time
PATH_SAVE_LIBSVM_PART = '../saved/rasco/train_sub%d_%d.libsvm'  # sub_i and part
chunksize = 20000

for sub_i, sub_sp_cols in enumerate(sub_sps_cols):
    print 'Processing subspace # %d' % sub_i
    
    for chunk_i in xrange(n_rows//chunksize + 1):
        print 'Chunk %d / %d' % (chunk_i, n_rows//chunksize + 1),
        
        X = pd.read_hdf(PATH_STORE, 'train', columns=sub_sp_cols,
                        start=chunk_i*chunksize, stop=(chunk_i+1)*chunksize)
        
        print ' | Shape:', X.shape
        
        dump_svmlight_file(
            X=X, y=targets[chunk_i*chunksize:chunk_i*chunksize + X.shape[0]], 
            f=PATH_SAVE_LIBSVM_PART % (sub_i, chunk_i))
        
        del X
    print 'Done with subspace #', sub_i
    
    break


Processing subspace # 0
Chunk 0 / 8  | Shape: (20000, 1506)
Chunk 1 / 8  | Shape: (20000, 1506)
Chunk 2 / 8  | Shape: (20000, 1506)
Chunk 3 / 8  | Shape: (20000, 1506)
Chunk 4 / 8  | Shape: (20000, 1506)
Chunk 5 / 8  | Shape: (20000, 1506)
Chunk 6 / 8  | Shape: (20000, 1506)
Chunk 7 / 8  | Shape: (5175, 1506)
Done with subspace # 0
CPU times: user 2min 14s, sys: 46.3 s, total: 3min 1s
Wall time: 4min 42s


In [24]:
%%time
# Merging libsvm parts
PATH_SAVE_LIBSVM_FULL = PATH_SAVE_LIBSVM_PART.rsplit('_', 1)[0] + '.libsvm'


def _chunk_index(fname):
    """Return the integer chunk number encoded at the end of a part file name.

    Part files are named like the PATH_SAVE_LIBSVM_PART template, i.e. they
    end in '_<chunk>.libsvm'.
    """
    stem = os.path.splitext(os.path.basename(fname))[0]
    return int(stem.rsplit('_', 1)[1])


for sub_i, sub_sp_cols in enumerate(sub_sps_cols):

    # glob returns matches in arbitrary order. Sort by the numeric chunk index
    # (a plain string sort would put '_10' before '_2') so the merged file
    # keeps the row order in which the chunks were dumped.
    fnames = sorted(
        glob.glob(PATH_SAVE_LIBSVM_PART.rsplit('_', 1)[0] % sub_i + '_*'),
        key=_chunk_index)

    # Concatenate the parts, line by line, into one libsvm file per subspace.
    with open(PATH_SAVE_LIBSVM_FULL % sub_i, 'w') as outfile:
        for fname in fnames:
            with open(fname) as infile:
                outfile.writelines(infile)

    # NOTE(review): only the first subspace is merged; this looks like a
    # deliberate trial-run limit -- remove the break to merge all subspaces.
    break
    
    

CPU times: user 513 ms, sys: 41.7 s, total: 42.3 s
Wall time: 2min 7s


In [None]:
%%time
PATH_SAVE_BIN = '../saved/rasco/train_sub%d.bin'

for sub_i, sub_sp_cols in enumerate(sub_sps_cols):
    d_sub = xgb.DMatrix(PATH_SAVE_LIBSVM_FULL % sub_i)
    d_sub.save_binary(PATH_SAVE_BIN % sub_i)
    print 'Done:', sub_i

    break