In [1]:
# Point findspark at the Spark 2.4 installation on AFS. This cell has to run
# before the `pyspark` import further down so that import can be resolved
# (presumably findspark adds that installation's Python bindings to sys.path).
import findspark
findspark.init('/afs/cern.ch/work/m/migliori/public/spark-2.4/')

In [2]:
# Source the Hadoop client configuration script for the cluster.
# NOTE(review): IPython runs `!` commands in a temporary subshell, so any
# environment variables exported by this script are presumably NOT visible to
# the notebook kernel afterwards -- confirm the script is still effective here.
!. /afs/cern.ch/work/m/migliori/public/hadoop_confs/setup_hadalytic.sh

In [3]:
# Spark application settings, consumed by the SparkSession builder below.
# Name under which the application is registered.
application_name = 'HLF-Classifier'
# Value for `spark.master`.
master = "yarn"
# Value for `spark.executor.instances`; also reused later as the number of
# dist-keras workers.
num_executors = 21
# Values for `spark.executor.memory` and `spark.driver.memory`.
executor_memory = '6G'
driver_memory = '64G'
# Used for both `spark.executor.cores` and `spark.task.cpus`.
num_cores = 4

In [4]:
from pyspark.sql import SparkSession
import os 

# Build (or reuse) the Spark session for this notebook.
#
# Numeric settings are converted with str() rather than the backtick repr
# syntax (`x`), which only exists in Python 2 and is a SyntaxError on Python 3.
# For integers both spellings produce the same string.
#
# `spark.task.cpus` is set to the same value as `spark.executor.cores`, so a
# single task occupies all the cores of an executor.
spark = (
    SparkSession.builder
    .appName(application_name)
    .config("spark.pyspark.python",
            "/afs/cern.ch/work/m/migliori/public/anaconda2/bin/python")
    .config("spark.master", master)
    .config("spark.executor.cores", str(num_cores))
    .config("spark.executor.instances", str(num_executors))
    .config("spark.executor.memory", executor_memory)
    .config("spark.driver.memory", driver_memory)
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.maxResultSize", "32G")
    .config("spark.task.cpus", str(num_cores))
    .getOrCreate()
)

In [5]:
spark

## Load train and test dataframes

In [6]:
from __future__ import print_function

In [7]:
%%time
trainDF = spark.read.format('parquet') \
        .load('hdfs://p01001532067275.cern.ch/project/ML/data/trainUndersampled_v2.parquet') \
        .select(['HLF_input', 'encoded_label'])
        
testDF = spark.read.format('parquet') \
        .load('hdfs://p01001532067275.cern.ch/project/ML/data/testUndersampled_v2.parquet') \
        .select(['HLF_input', 'encoded_label'])    
        
test_events = testDF.count()
train_events = trainDF.count()
print('There are', train_events, 'training events')
print('There are', test_events, 'test events')

There are 3423028 training events
There are 856078 test events
CPU times: user 27.2 ms, sys: 13.8 ms, total: 41 ms
Wall time: 5min 3s


In [8]:
trainDF.printSchema()

root
 |-- HLF_input: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)



## Build Keras model

In [9]:
from keras.optimizers import adam
from keras.models import Sequential
from keras.layers.core import Dense, Activation

Using TensorFlow backend.


In [10]:
# Fully connected classifier: 14 input features -> 50 -> 20 -> 10 hidden
# units (ReLU) -> 3 output scores (softmax).
hlf_layers = [
    Dense(50, input_shape=(14,), activation='relu'),
    Dense(20, activation='relu'),
    Dense(10, activation='relu'),
    Dense(3, activation='softmax'),
]
model = Sequential(hlf_layers)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                750       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                1020      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 33        
Total params: 2,013
Trainable params: 2,013
Non-trainable params: 0
_________________________________________________________________


## Distributed-Keras trainer

In [11]:
from distkeras.trainers import * 
from distkeras.predictors import *
from distkeras.utils import *
from distkeras.workers import *

In [12]:
# Optimizer and loss, passed to both dist-keras trainers below
# (as `worker_optimizer` and `loss`).
optimizer = 'adam'
loss = 'categorical_crossentropy'

# Passed to the trainers as `num_epoch` and `batch_size`.
num_epochs = 50 
batch_size = 200

## Use as many dist-keras workers as there are Spark executors
num_workers = num_executors

In [13]:
# AEASGD trainer working on the Keras model defined above.
trainer = AEASGD(
    keras_model=model,
    worker_optimizer=optimizer,
    loss=loss,
    num_workers=num_workers,
    batch_size=batch_size,
    num_epoch=num_epochs,
    # DataFrame columns holding the inputs and the targets
    features_col="HLF_input",
    label_col="encoded_label",
    # AEASGD-specific settings
    communication_window=25,
    rho=5.0,
    learning_rate=0.1,
    master_port=5500,
)
trainer.set_parallelism_factor(1)

In [14]:
%time trained_model = trainer.train(trainDF)

CPU times: user 1min 56s, sys: 1min 30s, total: 3min 27s
Wall time: 17min 1s


In [15]:
# ADAG trainer on the same Keras model definition; it uses its own master
# port (5600), distinct from the AEASGD trainer's.
trainer_adag = ADAG(
    keras_model=model,
    worker_optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"],
    num_workers=num_workers,
    batch_size=batch_size,
    num_epoch=num_epochs,
    # DataFrame columns holding the inputs and the targets
    features_col="HLF_input",
    label_col="encoded_label",
    # ADAG-specific settings
    communication_window=3,
    master_port=5600,
)
trainer_adag.set_parallelism_factor(1)

In [16]:
%time trained_model_adag = trainer_adag.train(trainDF)

CPU times: user 6min 33s, sys: 3min 2s, total: 9min 36s
Wall time: 18min 46s


## Plot worker iterations and losses

In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib notebook

In [18]:
# Collect, for every worker of every trainer, how many history records
# (i.e. training iterations) it produced.
trainers_list = [trainer, trainer_adag]
trainers_name = ['AEASGD', 'ADAG']

worker_id, worker_iterations, worker_trainer = [], [], []

for t_name, t in zip(trainers_name, trainers_list):
    n_workers = t.get_num_workers()
    worker_id.extend(range(n_workers))
    worker_iterations.extend(
        [len(t.get_executor_history(w)) for w in range(n_workers)])
    worker_trainer.extend([t_name] * n_workers)

# One row per (trainer, worker) pair.
trainer_dataframe = pd.DataFrame({'worker_id': worker_id,
                                  'worker_iterations': worker_iterations,
                                  'trainer': worker_trainer})
trainer_dataframe.head()

Unnamed: 0,trainer,worker_id,worker_iterations
0,AEASGD,0,40850
1,AEASGD,1,41150
2,AEASGD,2,40800
3,AEASGD,3,41000
4,AEASGD,4,40700


In [19]:
%matplotlib notebook 
# Iterations completed by each worker, one bar colour per trainer.
fig, ax = plt.subplots()
sns.barplot(x='worker_id', y='worker_iterations', hue='trainer',
            data=trainer_dataframe, palette="Set2", ax=ax)
ax.set_xlabel('worker id')
ax.set_ylabel('Iterations')
ax.legend(loc='lower center')
plt.show()

<IPython.core.display.Javascript object>

In [21]:
def compute_trainer_metrics(trainer, window=100):
    """Summarise the per-iteration training history of a distributed trainer.

    For every iteration index ``i`` the first entry of ``record['history']``
    (plotted as the loss elsewhere in this notebook) is collected from each
    worker that has an ``i``-th record, and its mean and standard deviation
    across workers are computed. These per-iteration statistics are then
    averaged over consecutive windows: a summary point is emitted whenever
    ``i % window == 0`` and at the very last iteration.

    Parameters
    ----------
    trainer : object
        Must provide ``get_num_workers()`` and ``get_executor_history(i)``;
        the latter returns a sequence of dicts with the keys ``'iteration'``
        and ``'history'``.
    window : int, optional
        Number of iterations between two summary points (default 100, the
        value that used to be hard-coded).

    Returns
    -------
    iteration : numpy.ndarray
        Iteration index at which each summary point was emitted.
    loss : numpy.ndarray
        Window average of the across-worker mean.
    error : numpy.ndarray
        Window average of the across-worker standard deviation.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell (or a star import) having put `np` into the global namespace.
    import numpy as np

    if window < 1:
        raise ValueError("window must be a positive integer")

    histories = [trainer.get_executor_history(worker)
                 for worker in range(trainer.get_num_workers())]

    # The upper bound is the largest 'iteration' value found in the last
    # record of any worker. Workers with an empty history are ignored here
    # instead of raising IndexError; the trailing [0] covers "no data at all".
    max_iterations = max(
        [h[-1]['iteration'] for h in histories if len(h) > 0] + [0])

    iteration = []
    avg_loss = []
    std_loss = []

    # Per-iteration statistics accumulated since the last summary point.
    window_mean = []
    window_std = []

    for i in range(max_iterations):
        loss_i = [h[i]['history'][0] for h in histories if i < len(h)]

        # No worker reached this index: skip it rather than feeding an empty
        # list to np.mean (which yields NaN and a RuntimeWarning).
        if loss_i:
            window_mean.append(np.mean(loss_i))
            window_std.append(np.std(loss_i))

        if (i % window == 0) or (i == max_iterations - 1):
            if window_mean:
                iteration.append(i)
                avg_loss.append(np.mean(window_mean))
                std_loss.append(np.mean(window_std))
            window_mean = []
            window_std = []

    iteration = np.asarray(iteration)
    loss = np.asarray(avg_loss)
    error = np.asarray(std_loss)

    return iteration, loss, error

In [22]:
x, y, err = compute_trainer_metrics(trainer)

In [23]:
x_adag, y_adag, err_adag = compute_trainer_metrics(trainer_adag)

In [24]:
# Averaged loss per trainer, with a band of +/- the averaged
# across-worker standard deviation.
fig, ax = plt.subplots()

curves = [('AEASGD', x, y, err),
          ('ADAG', x_adag, y_adag, err_adag)]

# Draw both lines first, then both bands (same drawing order as before).
for curve_name, curve_x, curve_y, curve_err in curves:
    ax.plot(curve_x, curve_y, lw=2, label=curve_name)
for curve_name, curve_x, curve_y, curve_err in curves:
    ax.fill_between(curve_x, curve_y - curve_err, curve_y + curve_err,
                    alpha=0.3)

ax.set_xlabel('Iterations')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

## Save the model

In [25]:
trained_model_adag.save('HLF_model.h5')

## Make predictions

In [26]:
# Score the test set with the ADAG-trained model, reading the inputs from the
# `HLF_input` column. The resulting DataFrame is queried for a `prediction`
# column in the next cell.
predictor = ModelPredictor(keras_model=trained_model_adag, features_col='HLF_input')
pred = predictor.predict(testDF)

In [27]:
pred.select(['encoded_label', 'prediction']).show(5)

+-------------+--------------------+
|encoded_label|          prediction|
+-------------+--------------------+
|(3,[0],[1.0])|[0.73483234643936...|
|(3,[0],[1.0])|[0.95402497053146...|
|(3,[0],[1.0])|[0.07840288430452...|
|(3,[0],[1.0])|[0.49988028407096...|
|(3,[0],[1.0])|[0.87204778194427...|
+-------------+--------------------+
only showing top 5 rows



## Compute the AUC

In [28]:
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.sql.functions import udf
    
def _vector_to_list(vector):
    """Convert a Spark ML vector into a plain Python list of floats."""
    return vector.toArray().tolist()


# UDF turning vector columns into array<double> columns.
vector_udf = udf(_vector_to_list, ArrayType(DoubleType()))

# NOTE: `pred` is rebound here, so this cell is not idempotent -- running it a
# second time would apply the UDF to columns that are already arrays.
label_as_array = vector_udf('encoded_label').alias('encoded_label')
prediction_as_array = vector_udf('prediction').alias('prediction')
pred = pred.select([label_as_array, prediction_as_array])

In [29]:
%%time
pred_pd = pred.select(['prediction', 'encoded_label']).toPandas()

CPU times: user 6.06 s, sys: 220 ms, total: 6.28 s
Wall time: 1min 8s


In [30]:
pred_pd.head()

Unnamed: 0,prediction,encoded_label
0,"[0.734832346439, 0.00241237436421, 0.262755215...","[1.0, 0.0, 0.0]"
1,"[0.954024970531, 0.00100908835884, 0.044965907...","[1.0, 0.0, 0.0]"
2,"[0.0784028843045, 0.00784215331078, 0.91375499...","[1.0, 0.0, 0.0]"
3,"[0.499880284071, 0.00707794679329, 0.493041783...","[1.0, 0.0, 0.0]"
4,"[0.872047781944, 0.000754710577894, 0.12719747...","[1.0, 0.0, 0.0]"


In [31]:
import numpy as np
y_true = np.array(pred_pd['encoded_label'].tolist())
y_pred = np.array(pred_pd['prediction'].tolist())

In [32]:
from sklearn.metrics import roc_curve, auc
# One-vs-rest ROC curve and AUC for every class, keyed by class index.
fpr = {}
tpr = {}
roc_auc = {}

# Take the number of classes from the label matrix instead of hard-coding 3,
# so the cell keeps working if the number of output classes changes.
num_classes = y_true.shape[1]

for class_idx in range(num_classes):
    # The thresholds returned by roc_curve are not needed here.
    fpr[class_idx], tpr[class_idx], _thresholds = roc_curve(
        y_true[:, class_idx], y_pred[:, class_idx])
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

In [35]:
import matplotlib.pyplot as plt
%matplotlib notebook

# ROC curve for class index 1 (titled "tt selector"), with the diagonal of a
# random classifier as reference.
fig, ax = plt.subplots()
ax.plot(fpr[1], tpr[1], color='blue', lw=2,
        label='HLF classifier (AUC) = %0.4f' % roc_auc[1])
ax.plot([0, 1], [0, 1], color='orange', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Background Contamination (FPR)')
ax.set_ylabel('Signal Efficiency (TPR)')
ax.set_title('$tt$ selector')
ax.legend(loc="lower right")
ax.grid()
plt.show()

<IPython.core.display.Javascript object>

In [34]:
import matplotlib.pyplot as plt
%matplotlib notebook

# ROC curve for class index 2 (titled "W selector"), with the diagonal of a
# random classifier as reference.
fig, ax = plt.subplots()
ax.plot(fpr[2], tpr[2], color='blue', lw=2,
        label='HLF classifier (AUC) = %0.4f' % roc_auc[2])
ax.plot([0, 1], [0, 1], color='orange', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Background Contamination (FPR)')
ax.set_ylabel('Signal Efficiency (TPR)')
ax.set_title('$W$ selector')
ax.legend(loc="lower right")
ax.grid()
plt.show()

<IPython.core.display.Javascript object>