In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the training hits, using the global_id column as the row index,
# and preview the first rows.
hits_train = pd.read_csv("data/train.csv", index_col='global_id')
hits_train.head()

In [None]:
# Load the test hits the same way as the training hits (global_id as the row
# index) and preview the first rows.
hits_test = pd.read_csv("data/test.csv", index_col='global_id')
hits_test.head()

# Naive machine learning

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Wire geometry in polar coordinates: wire_rho is the radius, wire_phi the angle.
wires = pd.read_csv('data/wires.csv')
wire_rho = wires["wire_rho"]
wire_phi = wires["wire_phi"]

# neibours[w] == (previous wire, next wire) for wire w, where "previous"/"next"
# refer to the order by wire_phi among the wires that share w's wire_rho.
neibours = [None] * len(wire_rho)

for layer in set(wire_rho):
    # (phi, wire) pairs of all wires at this radius, ordered by phi
    # (ties broken by wire number).
    ordered = sorted((wire_phi[w], w)
                     for w in range(len(wire_rho))
                     if wire_rho[w] == layer)
    ring = [wire for phi, wire in ordered]

    ring_size = len(ring)
    for pos, wire in enumerate(ring):
        # ring[pos - 1] wraps to the last wire for pos == 0; the modulo wraps
        # the last wire's successor back to the first.
        neibours[wire] = (ring[pos - 1], ring[(pos + 1) % ring_size])

In [None]:
def get_data(hits):
    """Build the per-hit feature matrix fed to the classifiers.

    Parameters
    ----------
    hits : pandas.DataFrame
        Must provide the columns ``event_id``, ``wire_id``, ``energy_deposit``
        and ``relative_time``.

    Returns
    -------
    numpy.ndarray of shape (len(hits), 7)
        One row per row of ``hits``, in the same order. Rows whose energy
        deposit is not positive are left as all zeros. Otherwise the columns
        are:

        0. log of the hit's energy deposit
        1. the hit's relative time
        2. ``wire_rho`` of the hit's wire
        3. log of (energy deposit on the previous wire + 1e-20)
        4. relative time of the hit minus that of the previous wire
        5. log of (energy deposit on the next wire + 1e-20)
        6. relative time of the hit minus that of the next wire

    Notes
    -----
    "Previous" and "next" wire come from the module-level ``neibours`` table;
    the radius comes from the module-level ``wire_rho``.

    A neighbouring wire that has no row in ``hits`` for the same event is
    treated as a hit with zero energy deposit and zero relative time.
    (Previously the lookup produced index -1 and silently read the *last* row
    of ``hits`` instead.)
    """
    event_id = hits['event_id'].values
    wire_id = hits['wire_id'].values
    energy_deposit = hits['energy_deposit'].values
    relative_time = hits['relative_time'].values

    num = len(event_id)
    result = np.zeros((num, 7))

    # event_row[i] is a dense row number (0 .. events_number - 1) for the
    # event that hit i belongs to.
    event_list, event_row = np.unique(event_id, return_inverse=True)
    events_number = len(event_list)

    # One column per wire. Never smaller than the previously hard-coded 4482,
    # but grows with the wire table so neighbour lookups cannot run off the end.
    wires_number = max(4482, len(neibours))

    # index[event, wire] == (row number of the hit on that wire) + 1, or 0 if
    # the wire has no hit in that event.
    index = np.zeros((events_number, wires_number), dtype="int")
    for i in range(num):
        index[event_row[i], int(wire_id[i])] = i + 1

    print("Index created")

    def neighbour_values(hit_i):
        # hit_i == -1 means "no hit on that wire in this event"; do not let
        # the negative index wrap around to the last row of the table.
        if hit_i < 0:
            return 0.0, 0.0
        return energy_deposit[hit_i], relative_time[hit_i]

    for i in range(num):
        if not energy_deposit[i] > 0.0:
            continue

        current_e = event_row[i]
        current_w = int(wire_id[i])

        prv_wire, nxt_wire = neibours[current_w]
        prv_energy, prv_time = neighbour_values(index[current_e, prv_wire] - 1)
        nxt_energy, nxt_time = neighbour_values(index[current_e, nxt_wire] - 1)

        result[i, 0] = np.log(energy_deposit[i])
        result[i, 1] = relative_time[i]
        result[i, 2] = wire_rho[current_w]
        # The 1e-20 offset keeps the log finite for a zero neighbour deposit.
        result[i, 3] = np.log(prv_energy + 1e-20)
        result[i, 4] = relative_time[i] - prv_time
        result[i, 5] = np.log(nxt_energy + 1e-20)
        result[i, 6] = relative_time[i] - nxt_time

    print("Done")

    return result

In [None]:
# Train only on hits that carry a positive energy deposit.
has_deposit = hits_train.energy_deposit > 0
hits_train_filtered = hits_train.loc[has_deposit]

# Feature matrix for the filtered hits; shown as the cell output.
train_data = get_data(hits_train_filtered)
train_data

In [None]:
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# it only on installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Binary target: 1 where label == 1, 0 otherwise. The builtin int replaces
# the np.int alias (same type), which was removed in NumPy 1.24.
cv_entropy = cross_val_score(DecisionTreeClassifier(criterion='entropy'),
                train_data, (hits_train_filtered.label == 1).values.astype(int),
                scoring='roc_auc')
print(cv_entropy.mean(), cv_entropy.std())

Cross-validation (CV) might take some time.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# it only on installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Same binary target as for the decision tree: 1 where label == 1, else 0.
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24.
# NOTE: despite the variable name, these are the gradient-boosting scores.
cv_gini = cross_val_score(GradientBoostingClassifier(),
                train_data, (hits_train_filtered.label == 1).values.astype(int),
               scoring='roc_auc')
print(cv_gini.mean(), cv_gini.std())

In [None]:
# Fit the final model on the full filtered training set; the target is True
# for rows whose label equals 1.
is_signal = hits_train_filtered.label == 1
classifier = GradientBoostingClassifier()
classifier.fit(train_data, is_signal)

In [None]:
# Score only the test hits with a positive energy deposit, mirroring the
# filter applied to the training set.
candidates = hits_test.loc[hits_test.energy_deposit > 0]
candidate_features = get_data(candidates)

# Column 1 of predict_proba is the probability of the second class.
signal_probability = classifier.predict_proba(candidate_features)[:, 1]
ml_prediction = pd.DataFrame({"prediction": signal_probability},
                             index=candidates.index)

In [None]:
# Save the predictions; the index column is written under the header global_id.
ml_prediction.to_csv("naive_ml_prediction.csv", index_label='global_id')

Moral: sometimes you can outdo simple machine learning by thinking. Corollary: the best result is achieved by combining the approaches.

In [None]:
# Display one training event (event_id 54) on the wire layout.
the_event = hits_train[hits_train.event_id==54]
fig, ax = plt.subplots(figsize=(20,20))
# 'spectral' was removed from matplotlib (2.2); 'nipy_spectral' is the same
# colormap under its surviving name.
colormap = 'nipy_spectral'
wires = pd.read_csv('data/wires.csv')

# Polar (rho, phi) wire positions converted to cartesian (x, y), one row per wire.
wires_cartesian = np.vstack((wires['wire_rho'] * np.cos(wires['wire_phi']),
                                  wires['wire_rho'] * np.sin(wires['wire_phi']))).T

# NOTE(review): this assumes the_event has exactly one row per wire, in the
# same order as wires.csv - confirm against the data.
ax.scatter(wires_cartesian[:, 0], wires_cartesian[:, 1], c=2-the_event.label, edgecolors='none',
           s=100, cmap=colormap)
# We want to know what color corresponds to which label
labels_x = (-20, 0, 20)
# Legend dots use the same 2 - label colour coding as the wires above.
ax.scatter(labels_x, (0, 0, 0), c=(2, 1, 0), cmap=colormap, edgecolors='none', s=300)
for label, coordinate in zip(("0, inactive", "1, signal", "2, noise"), labels_x):
    ax.annotate(label, xy=(coordinate-4, 3))


### Energy deposits

In [None]:
fig, ax = plt.subplots(figsize=(20,20))
# np.log(0) is -inf and raises a divide-by-zero RuntimeWarning, so blank out
# non-positive deposits as NaN before taking the log. Presumably scatter
# leaves out points with a non-finite colour value either way, so the picture
# should not change - TODO confirm on the installed matplotlib.
deposit = the_event.energy_deposit
log_deposit = np.log(deposit.where(deposit > 0))
ax.scatter(wires_cartesian[:, 0], wires_cartesian[:, 1], c=log_deposit, edgecolors='none',
           s=100, cmap='bwr')