In [None]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import BallTree
import lightgbm as lgb
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm

In [None]:
# Read every training chunk and concatenate them in one go.
# `DataFrame.append` was removed in pandas 2.0 and re-copied the growing frame
# on every iteration; a single `pd.concat` avoids both problems.
# The paths are sorted so the row order does not depend on directory listing order.
train_files = sorted(glob.glob('./data/dm4/open*.h5'))
if train_files:
    train = pd.concat((pd.read_hdf(f) for f in tqdm(train_files)), ignore_index=True)
else:
    # pd.concat raises on an empty input; keep the previous "empty frame" outcome.
    train = pd.DataFrame()

In [None]:
# Preview the first rows of the combined training frame.
train.head()

In [None]:
# List the distinct data_ind values present in the training data.
train['data_ind'].unique()

In [None]:
# Keep only the rows whose data_ind is 9 for the exploratory cells that follow.
indiv = train.loc[train['data_ind'] == 9]
indiv.shape

In [None]:
# Overlay the hits of every event (the first id returned by unique() is skipped)
# in a single 3-D scatter plot.
# The axes are created once up front: `plt.gca(projection='3d')` relied on
# keyword arguments that Matplotlib deprecated in 3.4 and removed in 3.6.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for event_id in indiv.event_id.unique()[1:]:
    ev = indiv[indiv.event_id == event_id]
    ax.scatter3D(ev.X, ev.Y, ev.Z)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z');

In [None]:
# Distinct event ids of the selected subset, in ascending order.
sorted(indiv['event_id'].unique())

In [None]:
# Pull out the rows of one specific event (id 186145) from the subset.
event = indiv.loc[indiv['event_id'] == 186145]

In [None]:
# Distinct Z values of the subset as an ascending NumPy array.
z = np.array(sorted(indiv.Z.unique()))

In [None]:
# Gap between each pair of consecutive Z levels.
np.diff(z)

Here we'll quickly check whether 1293 is indeed the usual Z-distance between consecutive slices:

In [None]:
# Tally every gap between consecutive Z levels; the most frequent one becomes `dist`.
values, counts = np.unique(np.diff(z), return_counts=True)
dist = values[counts.argmax()]

In [None]:
# Show the most frequent slice spacing selected above.
print(dist)

In [None]:
# Number of rows belonging to each event id in the subset.
np.unique(indiv['event_id'], return_counts=True)

Our prediction model will generate its data as a first-order (linear)
approximation along the Z axis, using the `dist` variable computed above.

In [None]:
# Columns used for pairing; colsButZ is the same list with the Z column left out.
cols = ['TX', 'TY', 'X', 'Y', 'Z']
colsButZ = [name for name in cols if name != 'Z']

In [None]:
def gen_linear(df, metric='minkowski'):
    """Pair every row with its nearest neighbour in the next Z slice.

    For each ``data_ind`` group the distinct ``Z`` values are visited in
    ascending order. ``TX`` and ``TY`` are multiplied by the notebook-level
    ``dist`` in every slice, and each row of a slice is matched to the closest
    row of the following slice with a ``BallTree`` built on the ``colsButZ``
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``data_ind`` and every column listed in ``cols``. When an
        ``event_id`` column is present a ``same_pair`` flag is produced too.
    metric : str, default 'minkowski'
        Distance metric forwarded to ``BallTree``.

    Returns
    -------
    pandas.DataFrame
        The input rows (original index preserved, ``TX``/``TY`` scaled) with a
        ``<col>_pair`` and a ``d<col>`` column for every entry of ``cols``.
        Rows of the last slice of a group have no successor, so their pair,
        delta and ``same_pair`` values are NaN.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``df`` yields no slice at all.

    Notes
    -----
    Relies on the notebook-level names ``dist``, ``cols`` and ``colsButZ``.
    """
    out = []

    for data_ind in tqdm(df.data_ind.unique()):
        ind = df[df.data_ind == data_ind]

        # Copy and scale every slice exactly once, in ascending Z order.
        slices = []
        for z_val in sorted(ind.Z.unique()):
            layer = ind[ind.Z == z_val].copy()
            layer[['TX', 'TY']] *= dist
            slices.append(layer)

        if not slices:
            # Nothing matched this data_ind (e.g. a NaN key never equals itself).
            continue

        for cur, nxt in zip(slices, slices[1:]):
            bt = BallTree(nxt[colsButZ], metric=metric)
            _, nearest = bt.query(cur[colsButZ])
            # First (closest) neighbour of every row of the current slice.
            data = nxt.iloc[nearest[:, 0]]

            for col in cols:
                cur[col + '_pair'] = data[col].values
            if 'event_id' in data.columns:
                cur['same_pair'] = data.event_id.values == cur.event_id.values
            out.append(cur)

        # The last slice of this group has no successor. Taking it from
        # `slices` (instead of a variable left over from the pairing loop)
        # keeps single-slice groups from re-emitting the previous group's
        # rows or raising NameError.
        out.append(slices[-1])

    out = pd.concat(out)
    for col in cols:
        pair_col = col + '_pair'
        if pair_col not in out.columns:
            # No group had two slices, so no pair was ever created.
            out[pair_col] = np.nan
        out['d' + col] = out[col].values - out[pair_col].values
    return out

In [None]:
# Build the slice-to-slice pair features for the whole training set and preview them.
trainPairs = gen_linear(train)
trainPairs.head()

In [None]:
# Only rows that received a same_pair label can be used for training.
X_train = trainPairs.loc[trainPairs['same_pair'].notnull()]
y_train = X_train.same_pair.astype(int)

# Remove identifiers and label columns so that only model features remain.
X_train = X_train.drop(columns=['event_id', 'signal', 'data_ind', 'same_pair'])
X_train.head()

In [None]:
# Wrap the feature matrix and labels in a LightGBM dataset.
lgb_train = lgb.Dataset(data=X_train, label=y_train)

In [None]:
# LightGBM settings passed to both the cross-validation and the training call.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    boost_from_average='false',
    metric='auc',
    num_leaves=31,
    learning_rate=0.05,
    max_depth=15,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=4,
)

In [None]:
# Short 5-fold cross-validation run to check the settings.
nRounds = 20
lgb.cv(params, lgb_train, num_boost_round=nRounds, nfold=5)

In [None]:
# Fit the final booster on the full training set with more rounds.
nRounds = 100
bst = lgb.train(params, lgb_train, num_boost_round=nRounds)

In [None]:
# Stack the two test files and renumber the rows 0..n-1; this index is what the
# submission cells later use as the row id.
test = pd.concat(
    [pd.read_hdf(path) for path in ('./data/4/test_close0.h5', './data/4/test_close10.h5')],
    ignore_index=True,
)
print(test.shape)
test.head()

In [None]:
# Apply the same slice-pairing feature construction to the test rows.
testPairs = gen_linear(test)

In [None]:
# Select exactly the columns the model was trained on, in the same order, so the
# feature layout cannot drift from training if the test files carry extra or
# reordered columns. A missing feature now fails here with a clear KeyError.
X_test = testPairs[X_train.columns].reset_index(drop=True)
X_test.head()

In [None]:
# Model output for every row of X_test, in row order.
y_pred = bst.predict(X_test)

In [None]:
# Group the scores by original test-row id and summarise each group.
outRaw = pd.DataFrame({'id': testPairs.index, 'prob': y_pred}).groupby('id')
agg = outRaw['prob'].aggregate(['mean', 'max', 'min'])

In [None]:
# Submission table: one row per id, with the mean score exposed as `signal`.
result = agg['mean'].rename('signal').reset_index()
result.head()

In [None]:
# How many rows score above 0.5.
result['signal'].gt(.5).sum()

In [None]:
# Write the id/signal table to disk, leaving out the DataFrame index.
result.to_csv('./data/4/submit.csv', index=False)