In [1]:
import pandas
from six.moves import cPickle as pickle
import numpy as np
import optparse
import os
import h5py

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

In [2]:
# Directory prefix (relative to the notebook) that is prepended to every
# HDF5 file name opened below; the trailing slash is required because the
# paths are built by plain string concatenation.
dirLoc = './Data/'

In [3]:
# Per-fold training data and per-fold test-set predictions.
# Both files are only ever read by this notebook, so they are opened
# read-only ("r") instead of read-write ("r+"): this rules out accidental
# modification of the inputs and also works when the files are write-protected.
trainingData = h5py.File(dirLoc + 'train.hdf5', "r")
testingData = h5py.File(dirLoc + 'testing.hdf5', "r")

In [4]:
# Output file that will receive one group per fold of pseudo-labelled
# training data. Mode "w" creates the file and truncates any existing one.
pseudo_trainingData = h5py.File(dirLoc + 'pseudo_train.hdf5', mode="w")

In [5]:
# Upper bound on the number of pseudo-labelled test events added per fold,
# expressed as a fraction of that fold's training-set size.
FRAC = 0.1

In [6]:
# Build a pseudo-labelled training set for every fold:
#   1. load the fold's original training events,
#   2. pick the test events whose prediction is furthest from 0.5,
#   3. label them with their rounded prediction and give them the mean
#      original weight of the matching training class,
#   4. append them to the training events, shuffle, renormalise the
#      per-class weights and write everything to the output file.
# NOTE(review): predictions are presumably probabilities in [0, 1] - confirm.

# Seeded generator so the shuffle is reproducible on Restart & Run All.
shuffle_rng = np.random.RandomState(0)

for fold in trainingData:
    print(fold)

    #Load original training data
    train_inputs = np.array(trainingData[fold + '/inputs'])
    # shape[1] (rather than len(inputs[0])) also works for an empty fold
    train_features = [f'in_{i}' for i in range(train_inputs.shape[1])]
    train = pandas.DataFrame(train_inputs, columns=train_features)
    train['gen_weight_original'] = np.array(trainingData[fold + '/orig_weights'])
    train['gen_weight'] = train['gen_weight_original']
    train['gen_target'] = np.array(trainingData[fold + '/targets'])
    print('Training data contains', len(train), 'events')

    #Get well classified testing data
    test_inputs = np.array(testingData[fold + '/inputs'])
    test_features = [f'in_{i}' for i in range(test_inputs.shape[1])]
    test = pandas.DataFrame(test_inputs, columns=test_features)
    test['preds'] = np.array(testingData[fold + '/pred'])
    # Confidence = distance of the prediction from the 0.5 decision boundary
    test['conf'] = np.abs(test['preds'] - 0.5)

    #Build pseudo data
    n_label = min(int(FRAC*len(train)), len(test))
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the sorted one (avoids SettingWithCopyWarning).
    test = test.sort_values(by='conf', ascending=False).iloc[:n_label].copy()
    test['gen_target'] = np.round(test['preds'].astype('float32'))
    test['gen_weight_original'] = 0.0  # float from the start; class means are assigned below
    for label in (0, 1):
        class_mean = train.loc[train.gen_target == label, 'gen_weight_original'].mean()
        test.loc[test.gen_target == label, 'gen_weight_original'] = class_mean
    test['gen_weight'] = test['gen_weight_original']
    print('Adding', len(test), 'test events')

    #Combine & save
    # sort=False keeps the column order of `train`. The implicit sort that
    # DataFrame.append applied orders names lexicographically ('in_10' before
    # 'in_2'), which would reorder the features relative to the input file.
    pseudo = pandas.concat(
        [train, test[test_features + ['gen_weight_original', 'gen_weight', 'gen_target']]],
        ignore_index=True,
        sort=False,
    )
    # sample() returns a new frame; the result must be assigned for the
    # shuffle to take effect.
    pseudo = pseudo.sample(frac=1, random_state=shuffle_rng).reset_index(drop=True)
    print('Pseudo data now contains', len(pseudo), 'events')

    # Normalise the weights so that each class sums to one
    for label in (0, 1):
        in_class = pseudo.gen_target == label
        pseudo.loc[in_class, 'gen_weight'] = (
            pseudo.loc[in_class, 'gen_weight'] / pseudo.loc[in_class, 'gen_weight'].sum()
        )

    # Drop a group left over from a previous run of this cell so the cell
    # can be re-executed without "name already exists" errors.
    if fold in pseudo_trainingData:
        del pseudo_trainingData[fold]
    grp = pseudo_trainingData.create_group(fold)

    X = pseudo[[c for c in pseudo.columns if c.startswith('in_')]].values.astype('float32')
    grp.create_dataset("inputs", data=X, dtype='float32')

    y = pseudo['gen_target'].values.astype('int')
    grp.create_dataset("targets", data=y, dtype='int')

    X_weights = pseudo['gen_weight'].values.astype('float32')
    grp.create_dataset("weights", data=X_weights, dtype='float32')

    X_orig_weights = pseudo['gen_weight_original'].values.astype('float32')
    grp.create_dataset("orig_weights", data=X_orig_weights, dtype='float32')

# Push everything to disk; the file handle is left open for later cells.
pseudo_trainingData.flush()

fold_0
Training data contains 20001 events
Adding 2000 test events


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Pseudo data now contains 22001 events
fold_1
Training data contains 20001 events
Adding 2000 test events
Pseudo data now contains 22001 events
fold_2
Training data contains 20001 events
Adding 2000 test events
Pseudo data now contains 22001 events
fold_3
Training data contains 20000 events
Adding 2000 test events
Pseudo data now contains 22000 events
fold_4
Training data contains 20000 events
Adding 2000 test events
Pseudo data now contains 22000 events
fold_5
Training data contains 20000 events
Adding 2000 test events
Pseudo data now contains 22000 events
fold_6
Training data contains 20000 events
Adding 2000 test events
Pseudo data now contains 22000 events
fold_7
Training data contains 19999 events
Adding 1999 test events
Pseudo data now contains 21998 events
fold_8
Training data contains 19999 events
Adding 1999 test events
Pseudo data now contains 21998 events
fold_9
Training data contains 19999 events
Adding 1999 test events
Pseudo data now contains 21998 events
