## Prediction from trained model

In [1]:
# Evaluate `spark` so the notebook displays it -- presumably the SparkSession
# supplied by the kernel; TODO confirm (it is not created anywhere in this notebook).
spark

## Load the test dataframe

In [2]:
from __future__ import print_function

# Read the parquet test sample, keep only the feature and label columns, and
# expose the feature column under the name 'features'.
test_reader = spark.read.format('parquet')
raw_testDF = test_reader.load('/project/ML/data/test20k.parquet')
testDF = (
    raw_testDF
    .select(['HLF_input', 'encoded_label'])
    .withColumnRenamed('HLF_input', 'features')
)

# Number of rows available for evaluation.
test_events = testDF.count()
print('There are', test_events, 'test events')

There are 20000 test events


In [3]:
# Print the column names and types of the test DataFrame.
testDF.printSchema()

root
 |-- features: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)



## Load the model

In [4]:
from bigdl.nn.layer import Model
from bigdl.util.common import init_engine
import numpy as np 

# Start the BigDL engine (imported from bigdl.util.common above); this runs
# before the model is loaded in the following cell.
init_engine()

In [22]:
# Directory that holds the saved BigDL model files.
path = '/afs/cern.ch/work/m/migliori/public/BDLtest/pythonScripts/models/'

# The model is stored as two files: the definition (.bigdl) and the weights (.bin).
definition_file = path + 'hlfHadalytic_hlf_10exe_6cores.bigdl'
weights_file = path + 'hlfHadalytic_hlf_10exe_6cores.bin'

modelHLF = Model.loadModel(modelPath=definition_file, weightPath=weights_file)

In [24]:
from bigdl.dlframes.dl_classifier import DLModel

# Shape of one input sample passed to DLModel -- presumably the 14 high-level
# features per event; TODO confirm against the training notebook.
hlf_feature_size = [14]

# Wrap the loaded model so it can be applied to a DataFrame with .transform().
predictorHLF = DLModel(model=modelHLF, featureSize=hlf_feature_size)

creating: createDLModel


## Prediction

In [25]:
# Apply the HLF model to the test set; the result carries an extra
# 'prediction' column. (Fixed the misspelled name `predictiorHLF`, which raised
# NameError -- the predictor defined above is `predictorHLF`.)
predHLF = predictorHLF.transform(testDF)

In [26]:
# Print the schema of the prediction DataFrame. The frame produced by
# transform() is `predHLF`; `predDF` is not defined anywhere in this notebook.
predHLF.printSchema()

root
 |-- features: vector (nullable = true)
 |-- encoded_label: vector (nullable = true)
 |-- prediction: array (nullable = false)
 |    |-- element: double (containsNull = false)



In [27]:
# Preview the first five rows, including the new 'prediction' column.
predHLF.show(5)

+--------------------+-------------+--------------------+
|            features|encoded_label|          prediction|
+--------------------+-------------+--------------------+
|[0.06865329588884...|(3,[1],[1.0])|[2.81666027149185...|
|[0.01563167840852...|(3,[0],[1.0])|[0.99877804517745...|
|[0.00626596377193...|(3,[2],[1.0])|[0.18347467482089...|
|[0.0,0.0375606796...|(3,[0],[1.0])|[0.18502448499202...|
|[0.00697752897271...|(3,[2],[1.0])|[0.13649576902389...|
+--------------------+-------------+--------------------+
only showing top 5 rows



## Compute AUC

In [28]:
%%time
y_true = np.asarray(predDF.rdd.map(lambda row: \
                                   row.encoded_label).collect())
y_pred_hlf = np.asarray(predHLF.rdd.map(lambda row: \
                                   row.prediction).collect())

CPU times: user 1.35 s, sys: 9.31 ms, total: 1.36 s
Wall time: 2.77 s


In [11]:
from sklearn.metrics import roc_curve, auc
# Per-class ROC curve and AUC, keyed by class (column) index.
fpr = dict()
tpr = dict()
roc_auc = dict()

# Each label column is scored against the matching prediction column.
# Uses `y_pred_hlf` (the array collected above); `y_pred` was never defined.
for i in range(y_true.shape[1]):
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred_hlf[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [21]:
from sklearn.metrics import roc_curve, auc

def computeAUC(y_true, y_pred):
    """Compute a ROC curve and its AUC for every class column.

    Parameters
    ----------
    y_true : 2-D array, indexed as ``y_true[:, i]``
        Ground-truth indicator for each class, one column per class.
    y_pred : 2-D array, indexed as ``y_pred[:, i]``
        Predicted score for each class, same column order as ``y_true``.

    Returns
    -------
    (fpr, tpr, roc_auc) : tuple of dict
        Three dicts keyed by class index ``i``: the false-positive rates and
        true-positive rates returned by ``roc_curve`` for column ``i``, and the
        area under that curve returned by ``auc``.
    """
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    # Use the number of label columns instead of a hard-coded 3, so the helper
    # also works for a different number of classes.
    n_classes = y_true.shape[1]
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    return fpr, tpr, roc_auc

In [None]:
# ROC/AUC for the HLF model. computeAUC requires both arrays; calling it with
# no arguments raised TypeError.
fpr_hlf, tpr_hlf, auc_hlf = computeAUC(y_true, y_pred_hlf)

In [20]:
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set(style="darkgrid")
%matplotlib notebook

plt.figure()
plt.plot(fpr[0], tpr[0],
         lw=2, label='GRU classifier (AUC) = %0.4f' % roc_auc[0])
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
plt.grid(b=True, which='major')
plt.grid(b=True, which='minor')
plt.show()

<IPython.core.display.Javascript object>