In [1]:
import findspark
findspark.init('/afs/cern.ch/work/m/migliori/public/spark-2.4/')

In [2]:
!. /afs/cern.ch/work/m/migliori/public/hadoop_confs/setup_hadalytic.sh

In [3]:
application_name = 'hyperparameter-search'
master = "yarn"
num_executors = 25
executor_memory = '6G'
driver_memory = '64G'
num_cores = 4

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .appName(application_name)\
        .config("spark.pyspark.python",
                "/afs/cern.ch/work/m/migliori/public/anaconda2/bin/python")\
        .config("spark.master", master)\
        .config("spark.executor.cores", `num_cores`)\
        .config("spark.executor.instances", `num_executors`)\
        .config("spark.executor.memory", executor_memory)\
        .config("spark.driver.memory", driver_memory)\
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.driver.maxResultSize", "32G") \
        .config("spark.task.cpus", `num_cores`)\
        .getOrCreate()

In [5]:
spark

## Load train and test datasets

In [6]:
from __future__ import print_function

In [7]:
%%time
trainDF = spark.read.format('parquet')\
        .load('hdfs://hadalytic/project/ML/data/train100k.parquet')\
        .select(['HLF_input', 'encoded_label'])
        
testDF = spark.read.format('parquet')\
        .load('hdfs://hadalytic/project/ML/data/test20k.parquet')\
        .select(['HLF_input', 'encoded_label'])
        
test_events = testDF.count()
train_events = trainDF.count()
print('There are', train_events, 'training events')
print('There are', test_events, 'test events')

There are 99656 training events
There are 20173 test events
CPU times: user 9.09 ms, sys: 7.8 ms, total: 16.9 ms
Wall time: 1min 20s


Now we can convert the Spark dataframes to Pandas

In [8]:
import numpy as np

In [9]:
trainDF = trainDF.toPandas()
testDF = testDF.toPandas()

In [10]:
trainDF.head()

Unnamed: 0,HLF_input,encoded_label
0,"[0.0, 0.004093188025030354, 0.6636617564181393...","(0.0, 0.0, 1.0)"
1,"[0.006637775825149609, 0.04995229218498003, 0....","(1.0, 0.0, 0.0)"
2,"[0.007647997639186321, 0.02028482486460716, 0....","(0.0, 0.0, 1.0)"
3,"[0.05113739451279049, 0.040669991536800086, 0....","(0.0, 1.0, 0.0)"
4,"[0.0, 0.02776366095244539, 0.16691218668213223...","(1.0, 0.0, 0.0)"


Convert `Dense` and `Sparse` vectors to list

In [11]:
trainDF[trainDF.columns] = trainDF[trainDF.columns].applymap(lambda x: list(x))
testDF[testDF.columns] = testDF[testDF.columns].applymap(lambda x: list(x))

In [12]:
X = np.array(trainDF['HLF_input'].tolist())
y = np.array(trainDF['encoded_label'].tolist())

In [13]:
X_test = np.array(testDF['HLF_input'].tolist())
y_test = np.array(testDF['encoded_label'].tolist())

## Create the keras model

In [14]:
from keras.optimizers import adam
from keras.models import Sequential
from keras.layers.core import Dense, Activation

def create_model(nh_1, nh_2, nh_3):
    ## Create model
    model = Sequential()
    model.add(Dense(nh_1, input_shape=(14,), activation='relu'))
    model.add(Dense(nh_2, activation='relu'))
    model.add(Dense(nh_3, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    
    ## Compile model
    optimizer = 'adam'
    loss = 'categorical_crossentropy'
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    
    return model

Using TensorFlow backend.


## Baseline model

In [15]:
baseline = create_model(50,20,10)

%time history = baseline.fit(X, y, batch_size=100, epochs=50, validation_data=(X_test,y_test),verbose=0)

CPU times: user 2min 17s, sys: 32.1 s, total: 2min 50s
Wall time: 59.9 s


In [16]:
import matplotlib.pyplot as plt 
%matplotlib notebook

In [23]:
%matplotlib notebook
plt.figure()
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper right')
plt.show()

<IPython.core.display.Javascript object>

In [24]:
%matplotlib notebook
plt.figure()
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.ylabel('Accuracy')
plt.xlabel('epoch')
plt.legend(loc='lower right')
plt.show()

<IPython.core.display.Javascript object>

In [19]:
y_pred = baseline.predict(X_test)

In [20]:
from sklearn.metrics import roc_curve, auc

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [21]:
%matplotlib notebook

plt.figure()
plt.plot(fpr[1], tpr[1], color='blue', 
         lw=2, label='HLF classifier (AUC) = %0.4f' % roc_auc[1])
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

## Create the keras classifier

In [22]:
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model, verbose=0)

## Define the grid search parameters

In [28]:
batch_size = [100,200]
epochs = [30,50]

## Number of hidden units per layer
nh_1 = [50,100]
nh_2 = [20,50,100]
nh_3 = [10,20]

In [29]:
param_grid = {'batch_size':batch_size,
              'epochs':epochs,
              'nh_1':nh_1, 'nh_2':nh_2, 'nh_3':nh_3}

## Grid Search with Spark

In [30]:
from spark_sklearn.grid_search import GridSearchCV

sc = spark.sparkContext

grid = GridSearchCV(sc, estimator=model, param_grid=param_grid, cv=3, verbose=5)

In [31]:
%time grid_results = grid.fit(X,y)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
CPU times: user 2min 45s, sys: 33.1 s, total: 3min 18s
Wall time: 4min 34s


In [33]:
grid_results.best_estimator_.get_params()

{'batch_size': 100,
 'build_fn': <function __main__.create_model>,
 'epochs': 50,
 'nh_1': 100,
 'nh_2': 100,
 'nh_3': 20,
 'verbose': 0}

In [38]:
y_pred = grid_results.best_estimator_.predict_proba(X_test)

In [40]:
from sklearn.metrics import roc_curve, auc

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [41]:
%matplotlib notebook

plt.figure()
plt.plot(fpr[1], tpr[1], color='blue', 
         lw=2, label='HLF classifier (AUC) = %0.4f' % roc_auc[1])
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

## Random search

In [61]:
np.random.seed(42)

batch_size = np.random.randint(low=100, high=300, size=3)
epochs = np.random.randint(low=30, high=80, size=3)

## Number of hidden units per layer
nh_1 = np.random.randint(low=50, high=150, size=3)
nh_2 = np.random.randint(low=30, high=100, size=3)
nh_3 = np.random.randint(low=10, high=30, size=2)

In [45]:
random_param_grid = {'batch_size':batch_size,
                     'epochs':epochs,
                     'nh_1':nh_1, 'nh_2':nh_2, 'nh_3':nh_3}

In [46]:
random_grid = GridSearchCV(sc, estimator=model, param_grid=random_param_grid, cv=3, verbose=5)

In [50]:
random_grid.param_grid

{'batch_size': array([152, 284, 129]),
 'epochs': array([41, 38, 65]),
 'nh_1': array([143, 121, 141]),
 'nh_2': array([66, 47, 35]),
 'nh_3': array([22, 22])}

In [47]:
%time random_grid_results = random_grid.fit(X,y)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
CPU times: user 2min 22s, sys: 30 s, total: 2min 52s
Wall time: 13min 37s


In [48]:
random_grid_results.best_estimator_.get_params()

{'batch_size': 152,
 'build_fn': <function __main__.create_model>,
 'epochs': 65,
 'nh_1': 141,
 'nh_2': 35,
 'nh_3': 22,
 'verbose': 0}

In [62]:
y_pred = random_grid_results.best_estimator_.predict_proba(X_test)

In [63]:
from sklearn.metrics import roc_curve, auc

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [64]:
%matplotlib notebook

plt.figure()
plt.plot(fpr[1], tpr[1], color='blue', 
         lw=2, label='HLF classifier (AUC) = %0.4f' % roc_auc[1])
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
plt.grid()
plt.show()

<IPython.core.display.Javascript object>