In [1]:
# Build the Spark configuration for this session.
# NOTE(review): SparkConf, SparkContext, SQLContext and the initial `sc` are never imported or
# created in this notebook; presumably the PySpark kernel start-up provides them -- confirm.
# NOTE(review): the Spark configuration docs say spark.driver.memory cannot be changed through
# SparkConf once the driver JVM is running (client mode) -- confirm this setting takes effect.
conf = SparkConf().setAll([
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '8g'),
    ('spark.shuffle.service.enabled', True),
    ('spark.dynamicAllocation.enabled', True),
    # Disabled alternatives kept for reference:
    # ('spark.executor.instances', 50),
    # ('spark.dynamicAllocation.maxExecutors', 90),
    ('spark.dynamicAllocation.executorIdleTimeout', 600),
    ('spark.executor.cores', 1),
    ('spark.default.parallelism', 90),
    ('spark.executor.memoryOverhead', '4g'),
    ('spark.driver.memoryOverhead', '4g'),
    ('spark.scheduler.mode', 'FAIR'),
    ('spark.kryoserializer.buffer.max', '512m'),
    ('spark.app.name', 'LightCurve Demo - JupyterHub Elephas implementation'),
])

# Shut down the context that already exists in this kernel ...
sc.stop()

# ... and bring up a fresh one (plus its SQLContext) using the settings above.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import math
import copy
import random

import time

from pyspark.ml.feature import StringIndexer, StandardScaler,VectorAssembler,OneHotEncoder
from pyspark.ml import Pipeline

from pyspark.sql.functions import udf, col, array, lit
from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.sql.functions import rand
from pyspark.sql.types import ArrayType, FloatType,IntegerType, DataType, DoubleType

from pyspark.mllib.evaluation import MulticlassMetrics

from keras import optimizers
from keras.models import Sequential, Model, load_model # model and load_model from Plasticc
from keras.layers import Dense, Dropout, Activation, Layer, Lambda
from keras import backend as K

from elephas.ml_model import ElephasEstimator


from keras.layers import *
from keras.optimizers import Adam, Nadam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
from keras.utils import np_utils, generic_utils

from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix

import tensorflow as tf

Using TensorFlow backend.




In [3]:
import sys

In [4]:
# Create a SQLContext on the current SparkContext and switch the session to the `plasticc`
# database, so the unqualified table name in the query below resolves against it.
sqlContext = SQLContext(sc)
sqlContext.sql("use plasticc")

DataFrame[]

In [5]:
# Short aliases for Spark SQL element types. iType and fType are used as ArrayType element
# types in the UDF declarations; dType is not referenced in the cells shown here.
iType=IntegerType()
dType=DoubleType()
fType=FloatType()

In [6]:
# Run-level settings. In the cells shown here only batch_size (estimator batch size),
# optimizer (model.compile) and max_epochs (printed only) are actually read.
augment_count = 25
# Previous, smaller batch sizes:
#batch_size = 128
#batch_size2 = 512
batch_size = 1000
batch_size2 = 5000
optimizer = 'nadam'
num_models = 1
use_specz = False
valid_size = 0.1
max_epochs = 50  # NOTE: the estimator cell currently trains for 2 epochs, not max_epochs


In [7]:
# The 15 class labels, their display names, and the per-class weights that
# multi_weighted_logloss reads (in sorted-key order).
classes = np.array([6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99], dtype='int32')
class_names = ['class_6','class_15','class_16','class_42','class_52','class_53','class_62','class_64','class_65','class_67','class_88','class_90','class_92','class_95','class_99']
class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1, 99: 1}

# LSST passbands (nm)  u    g    r    i    z    y      
passbands = np.array([357, 477, 621, 754, 871, 1004], dtype='float32')

In [8]:
limit = 1000000  # not referenced in the cells shown here
sequence_len = 256  # default length every per-object series is padded/truncated to (pad_array)
num_classes = len(classes)  # 15; used by target_categorical and the summary print

#### Get the custom loss functions

In [9]:
def get_wtable(df):
    """Return the relative frequency of each distinct `target` value in `df`.

    The whole `target` column is collected to the driver. Slot i of the result
    holds the share of rows carrying the i-th smallest distinct target value
    (np.unique returns values in sorted order). The result has one slot per
    entry of the module-level `classes`; slots beyond the number of distinct
    targets keep their initial value of 1.

    NOTE(review): slots are filled by position, not by class label, so they
    line up with `classes` only if no earlier class is missing from the data
    -- confirm for the training set in use.
    """
    targets = np.array(df.select('target').collect(), dtype='int32')
    distinct_counts = np.unique(targets, return_counts=True)[1]

    wtable = np.ones(len(classes))
    for slot, count in enumerate(distinct_counts):
        wtable[slot] = count / targets.shape[0]

    return wtable

def mywloss(y_true,y_pred):
    """Class-frequency-weighted log loss used as the Keras training loss.

    Predictions are clipped to [1e-15, 1 - 1e-15], the batch mean of
    y_true * log(y_pred) is taken per class (axis 0), each class term is
    divided by the matching entry of the module-level `wtable`, and the
    negated mean over classes is returned.

    `wtable` is read from module scope, so it must be assigned before the
    loss is first evaluated.
    """
    clipped = tf.clip_by_value(y_pred, 1e-15, 1-1e-15)
    per_class_mean = tf.reduce_mean(y_true * tf.log(clipped), axis=0)
    weighted = per_class_mean / wtable
    return -(tf.reduce_mean(weighted))
    
    
def multi_weighted_logloss(y_ohe, y_p, wtable):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    Parameters:
        y_ohe:  one-hot encoded true labels, shape (n_samples, n_classes).
        y_p:    predicted class probabilities, same shape as y_ohe.
        wtable: per-class normaliser (one entry per class) that each class's
                summed log-probability is divided by. It is NOT modified.

    Returns:
        The class-weighted log loss divided by the number of rows in y_ohe.
        Class weights come from the module-level `class_weight`, taken in
        sorted-key order.
    """
    # Clip predictions away from 0 and 1 so the logarithm stays finite.
    y_p_log = np.log(np.clip(a=y_p, a_min=1e-15, a_max=1-1e-15))
    # Per class: sum of the log-probabilities assigned to the true class.
    y_log_ones = np.sum(y_ohe * y_p_log, axis=0)

    # Work on a private float copy. The original aliased the caller's array and
    # then overwrote its last entry in place, silently changing the shared wtable.
    nb_pos = np.array(wtable, dtype=float)

    # The last class may have no training examples; avoid dividing by zero for it.
    if nb_pos[-1] == 0:
        nb_pos[-1] = 1

    # Weight average and divide by the per-class normaliser
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos
    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss / y_ohe.shape[0]

#### References

https://www.tensorflow.org/api_docs/python/tf/keras/backend/permute_dimensions

In [10]:
def get_model(train_df, input_dim, size=80):
    """Build the (uncompiled) Keras model that consumes one flat feature vector per object.

    The flat input is split by column position into three parts:
      * columns 0-9      -> "meta" (10 values),
      * columns 10-265   -> "band" (256 values, fed to an Embedding),
      * columns 266-end  -> "hist", reshaped to (8, 256) and permuted to (256, 8).

    Parameters:
        train_df:  not used by the current implementation (only referenced in a
                   commented-out line).
        input_dim: shape tuple of the feature vector; input_dim[0] is the vector length.
                   The Reshape((8, 256)) below requires 266 + 8*256 = 2314 columns.
        size:      number of GRU units per direction.

    Returns:
        A keras Model with one input ("raw") and a 15-way softmax output.
    """
    def get_meta(x):
        # First 10 columns of the flat vector.
        x=x[:,0:10]
        return x
    
    def get_band(x):
        # Next 256 columns: one band value per sequence position.
        x=x[:,10:266]
        return x
    
    def get_hist(x):
        # x=x[:,266:input_dim]  -- this is before we passed in the shape from input dim - to avoid two select calls
        x=x[:,266:input_dim[0]]
        # Remaining columns are 8 series of length 256 laid end to end ...
        x=Reshape((8,256))(x)
        # ... swapped to (batch, 256, 8) so axis 1 is the sequence position.
        x=K.permute_dimensions(x, (0,2,1))
        return x
    
    #raw_input  = Input(shape=train_df.select("features").first()[0].shape, name='raw')
    raw_input  = Input(shape=input_dim, name='raw')
    
    hist_input = Lambda(get_hist,  name="hist")(raw_input)
    meta_input = Lambda(get_meta,  name="meta")(raw_input)
    band_input = Lambda(get_band,  name="band")(raw_input)
    
    # Embed each band value (vocabulary size 8) into 8 dimensions and append it to the
    # 8 per-position series values.
    band_emb = Embedding(8, 8)(band_input)
    hist = concatenate([hist_input, band_emb])
    hist = TimeDistributed(Dense(40, activation='relu'))(hist)
    
    rnn = Bidirectional(GRU(size, return_sequences=True))(hist)
    rnn = SpatialDropout1D(0.5)(rnn)
    
    # Collapse the sequence axis by taking the max over positions.
    gmp = GlobalMaxPool1D()(rnn)
    gmp = Dropout(0.5)(gmp)
    
    # Join the pooled sequence summary with the 10 meta values.
    x = concatenate([meta_input, gmp])
    x = Dense(128, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    
    # NOTE(review): 15 is hard-coded; it equals len(classes) in this notebook.
    output = Dense(15, activation='softmax')(x)
    model = Model(inputs=[raw_input], outputs=output)
    
    return model
    

#### Create the dataframe

In [11]:
# Load the raw training rows: the object id, its label, the `meta` column, and the
# per-object series columns that are padded and vectorised further down.
trainingVectorsDF=sqlContext.sql("""
select object_id, target,meta, band, mjd, flux, flux_err, detected, fwd_int, bwd_int, 
source_wavelength, received_wavelength
from elephas_training_set""")

In [12]:
# Module-level weights table; mywloss reads this global when the loss is evaluated.
wtable=get_wtable(trainingVectorsDF)

#### UDF Functions for vector creation - definitions

In [13]:
def pad_array(x, sequence_len=sequence_len):
    """Return the last `sequence_len` entries of `x`, zero-padded on the left.

    `x` is first prefixed with `sequence_len` zeros and then cut down to its
    final `sequence_len` elements, so shorter inputs gain leading zeros and
    longer inputs lose their oldest entries. The result is a NumPy array.
    """
    padded = np.pad(x, (sequence_len, 0), 'constant', constant_values=(0))
    start = len(padded) - sequence_len
    return padded[start:len(padded)]

def fwd_intervals(x):
    """Differences between consecutive elements of `x`, with a 0 placed in front.

    The result is a NumPy array with the same length as `x`.
    """
    return np.ediff1d(x, to_begin = [0])

def bwd_intervals(x):
    """Differences between consecutive elements of `x`, with a 0 appended at the end.

    The result is a NumPy array with the same length as `x`.
    """
    return np.ediff1d(x, to_end = [0])

#### Issues returning numpy arrays to pyspark UDFs.
You'll get pickle errors because the function returns NumPy types, which are not compatible with the DataFrame API. You have to convert the function's return value back to a list, and then cast it to Spark-compatible data types.

https://stackoverflow.com/questions/44965762/is-it-possible-to-store-a-numpy-array-in-a-spark-dataframe-column

Example - 


get_padded_float_vectors = udf(
    lambda arr: pad_array(arr).tolist(), 
    ArrayType(fType)
)

#### UDF Declarations for vector manipulation
Note that we have included the UDF toDenseUdf - this is necessary because the Spark ML class VectorAssembler will cast the assembled vector to a sparse vector if there are a large number of zeros in the vector. This is not usually a problem, but for this problem we do need to pass a static dense vector into the Keras model in order to properly extract the features.

In [14]:
# The helper functions return NumPy arrays, which Spark cannot store in an ArrayType column
# (see the note on pickle errors above). Each array-returning UDF therefore converts its
# result to a plain Python list. Lists are still accepted by Vectors.dense and np.ediff1d,
# so the nested calls used when building the feature columns keep working.

# One-hot list of length num_classes with a 1 at position (value - 1), i.e. it treats the
# input as a 1-based class index. Not applied in the cells shown here.
target_categorical = udf(
    lambda arr:
        [int(i+1 == arr) for i in range(num_classes)], 
        ArrayType(iType)       
)

# Left-pad / truncate a float series to sequence_len entries.
get_padded_float_vectors = udf(
    lambda arr: pad_array(arr).tolist(), 
    ArrayType(fType)
)

# Left-pad / truncate an integer series to sequence_len entries.
get_padded_int_vectors = udf(
    lambda arr: pad_array(arr).tolist(), 
    ArrayType(iType)
)

# Rebuild an ML vector as a DenseVector (VectorAssembler may emit a sparse one).
toDenseUdf = udf(
    lambda arr: Vectors.dense(arr.toArray()), 
    VectorUDT()
)

# Consecutive differences with a leading 0.
fwd_udf = udf(
    lambda arr: fwd_intervals(arr).tolist(), 
    ArrayType(fType)
)

# Consecutive differences with a trailing 0.
bwd_udf = udf(
    lambda arr: bwd_intervals(arr).tolist(), 
    ArrayType(fType)
)

# Wrap an array column as a dense ML vector.
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())

### Set up the training set dataframe with the feature vectors

This is how we convert the target to a categorical; it looks like ElephasEstimator doesn't want it

                             to_vector(target_categorical("target")).alias("targetV"),


In [15]:
# Turn every array column into a dense ML vector. All series columns are padded/truncated by
# pad_array first (default length sequence_len); fwd_int and bwd_int are additionally
# differenced after padding. `detected` is renamed to `detect`; object_id and target pass
# through unchanged.
# NOTE: this reassigns trainingVectorsDF, so the cell cannot be re-run without re-running
# the query cell first.
trainingVectorsDF = trainingVectorsDF.select(\
                       "object_id","target", #target_categorical("target").alias("target"),
                             to_vector("meta").alias("meta"),                           
                             to_vector(get_padded_int_vectors("band")).alias("band"),
                             to_vector(get_padded_float_vectors("mjd")).alias("mjd"),
                             to_vector(get_padded_float_vectors("flux")).alias("flux"),
                             to_vector(get_padded_float_vectors("flux_err")).alias("flux_err"),
                             to_vector(get_padded_int_vectors("detected")).alias("detect"),
                             to_vector(fwd_udf(get_padded_float_vectors("fwd_int"))).alias("fwd_int"),
                             to_vector(bwd_udf(get_padded_float_vectors("bwd_int"))).alias("bwd_int"),
                             to_vector(get_padded_float_vectors("source_wavelength")).alias("source_wavelength"),
                             to_vector(get_padded_float_vectors("received_wavelength")).alias("received_wavelength")
                            )

In [16]:
trainingVectorsDF.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- meta: vector (nullable = true)
 |-- band: vector (nullable = true)
 |-- mjd: vector (nullable = true)
 |-- flux: vector (nullable = true)
 |-- flux_err: vector (nullable = true)
 |-- detect: vector (nullable = true)
 |-- fwd_int: vector (nullable = true)
 |-- bwd_int: vector (nullable = true)
 |-- source_wavelength: vector (nullable = true)
 |-- received_wavelength: vector (nullable = true)



## Now we create the feature vector

In [17]:
# Columns that are identifiers/labels rather than model inputs.
ignore = ['object_id', 'target']

# Concatenate every remaining vector column, in DataFrame column order, into one `features`
# vector. get_model slices this vector by position, so the column order matters.
assembler = VectorAssembler(
    inputCols=[x for x in trainingVectorsDF.columns if x not in ignore],
    outputCol='features')

# NOTE: reassigns trainingVectorsDF; re-running this cell alone would operate on a frame
# that already has a `features` column.
trainingVectorsDF=assembler.transform(trainingVectorsDF)

In [18]:
trainingVectorsDF.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- meta: vector (nullable = true)
 |-- band: vector (nullable = true)
 |-- mjd: vector (nullable = true)
 |-- flux: vector (nullable = true)
 |-- flux_err: vector (nullable = true)
 |-- detect: vector (nullable = true)
 |-- fwd_int: vector (nullable = true)
 |-- bwd_int: vector (nullable = true)
 |-- source_wavelength: vector (nullable = true)
 |-- received_wavelength: vector (nullable = true)
 |-- features: vector (nullable = true)



### The training, test and validation splits

Note that this is where we apply the toDenseUDF function to ensure that the features vector is a dense vector

weights = [.7, .3]
seed = 42 # seed=0L  validation_df, 
train_df, test_df = trainingVectorsDF.select( toDenseUdf("scaled_features").alias("features"), "target").randomSplit(weights, seed)

In [19]:
# 60/30/10 random split. randomSplit returns the parts in the order of `weights`, so the
# unpacking below is train / test / validation. Only the densified `features` vector and
# `target` are kept. test_df is not used in the cells shown here.
weights = [.6, .3, .1]
seed = 42  # fixed seed for the random split
train_df, test_df, validation_df = trainingVectorsDF.select( toDenseUdf("features").alias("features"), "target").randomSplit(weights, seed)

In [20]:
train_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- target: integer (nullable = true)



In [21]:
# Spread the training rows over 800 partitions before handing them to the estimator.
train_df=train_df.repartition(800)

In [22]:
# Same value as num_classes (both are len(classes)); the estimator cell reads nb_classes.
nb_classes = len(classes)
#input_dim = len(train_df.select("features").first()[0])
# Shape tuple of one feature vector (fetches the first row); get_model indexes it as
# input_dim[0] and also passes it to Input(shape=...).
input_dim = train_df.select("features").first()[0].shape

print(f"We have {num_classes} classes and {input_dim[0]} features")


We have 15 classes and 2314 features


## Preprocessing: Defining Transformers

Up until now, we basically just read in raw data. Luckily, ```Spark ML``` has quite a few preprocessing features available, so the only thing we will ever have to do is define transformations of data frames.

To proceed, we will first transform category strings to double values. This is done by a so-called ```StringIndexer```. Note that we carry out the actual transformation here already, but that is just for demonstration purposes. All we really need is to define ```string_indexer``` to put it into a pipeline later on.

#### get and compile the model 

In [23]:
# Build the Keras graph for the measured feature-vector shape.
model = get_model(train_df, input_dim)
print("hi there")  # progress marker left over from debugging

W1031 02:48:32.892550 140343809632000 deprecation_wrapper.py:119] From /home/hduser/.virtualenvs/Elephas/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1031 02:48:32.941244 140343809632000 deprecation_wrapper.py:119] From /home/hduser/.virtualenvs/Elephas/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1031 02:48:32.994191 140343809632000 deprecation_wrapper.py:119] From /home/hduser/.virtualenvs/Elephas/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1031 02:48:33.590679 140343809632000 deprecation_wrapper.py:119] From /home/hduser/.virtualenvs/Elephas/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is d

hi there


In [24]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
raw (InputLayer)                (None, 2314)         0                                            
__________________________________________________________________________________________________
band (Lambda)                   (None, 256)          0           raw[0][0]                        
__________________________________________________________________________________________________
hist (Lambda)                   (None, 256, 8)       0           raw[0][0]                        
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 256, 8)       64          band[0][0]                       
__________________________________________________________________________________________________
concatenat

In [25]:
# Compile locally with the optimizer name from the settings cell and the custom weighted loss.
model.compile(optimizer=optimizer, loss=mywloss, metrics=['accuracy'])
print("hi there")  # progress marker left over from debugging

W1031 02:48:33.791811 140343809632000 deprecation_wrapper.py:119] From /home/hduser/.virtualenvs/Elephas/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



hi there


## Distributed Elephas model

To lift the above Keras ```model``` to Spark, we define an ```Estimator``` on top of it. An ```Estimator``` is Spark's incarnation of a model that still has to be trained. It essentially comes with only a single (required) method, namely ```fit```. Once we call ```fit``` on a data frame, we get back a ```Model```, which is a trained model with a ```transform``` method to predict labels.

We do this by initializing an ```ElephasEstimator``` and setting a few properties. As by now our input data frame will have many columns, we have to tell the model where to find features and labels by column name. Then we provide serialized versions of our Keras model. We cannot plug Keras models into the ```Estimator``` directly, as Spark will have to serialize them anyway for communication with workers, so it's better to provide the serialization ourselves. In fact, while pyspark knows how to serialize ```model```, it is extremely inefficient and can break if models become too large. Spark ML is especially picky (and rightly so) about parameters and more or less prohibits you from providing non-atomic types and arrays of the latter. Most of the remaining parameters are optional and rather self-explanatory. Plus, you will know many of them if you have ever run a Keras model before. We just include them here to show the full set of training configuration.

In [26]:
# NOTE(review): despite its name, `adam` holds a Nadam optimizer here. Both `adam` and
# `opt_conf` are reassigned in the estimator cell below before they are used, so this cell
# looks redundant -- confirm before removing.
adam=optimizers.nadam(lr=0.01)
opt_conf = optimizers.serialize(adam)


W1031 02:48:33.891923 140343809632000 deprecation_wrapper.py:119] From /home/hduser/.virtualenvs/Elephas/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.



In [27]:
print(max_epochs)

50


In [28]:
# Optimizer shipped to the workers: plain Adam with lr=0.01, serialized to a config dict.
# This overrides the `adam` / `opt_conf` values assigned in the earlier optimizer cell.
adam = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(adam)

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol("features")             # These two come directly from pyspark,
estimator.setLabelCol("target")                 # hence the camel case. Sorry :)
estimator.set_keras_model_config(model.to_yaml())       # Provide serialized Keras model
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(80)  # 80 workers requested for this run
estimator.set_epochs(2) # 2 epochs for this run; max_epochs is not used here
estimator.set_batch_size(batch_size) # batch_size from the settings cell (was 128)
estimator.set_verbosity(2) # was 1
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode("synchronous") # synchronous training mode
estimator.set_loss(mywloss) # custom weighted loss; was "categorical_crossentropy"
estimator.set_metrics(['accuracy']) # was ['acc']

ElephasEstimator_40db93cba63f4db85f82

## And now we set up the pipeline.

Looks very similar to SparkFlow, n'est ce pas? This could be an interesting comparison!

Defining pipelines is really as easy as listing pipeline stages. We can provide any configuration of Transformers and Estimators really, but here we simply take the three components defined earlier. Note that string_indexer and scaler are interchangeable, while estimator somewhat obviously has to come last in the pipeline.

In [29]:
# Single-stage pipeline: the Elephas estimator is the only stage.
pipeline = Pipeline(stages=[estimator])

## And train the model

Note that at this stage, the only method we can call is ''fit''

In [30]:
import time  # also imported in the imports cell; repeated so this cell can run on its own

# Time the distributed fit with wall-clock seconds.
start=time.time()
fitted_pipeline = pipeline.fit(train_df) # Fit model to data
elapsedTime=time.time()-start
print(f"Model trained in {elapsedTime} seconds")

  config = yaml.load(yaml_string)


>>> Fit model
>>> Synchronous training complete.
Model trained in 547.3875677585602 seconds


#### Save the model

#### Now we run the transform so we can get the predictions

However, this will require modifications to the class file for ElephasEstimator and ElephasTransformer located here

 ~/.virtualenvs/Elephas/lib/python3.6/site-packages/elephas/ml_model.py
 
because the \_transform method as written runs the predict process using the model.predict_classes method. A keras Sequential() model class has this method, but the Model() class does not - you have to use the 'predict' method.

What this entails is that you have to rewrite that method and overload it to incorporate the predict method if you're training on a keras Model class instead of a Sequential model.

The overloaded method is described here - 

https://github.com/maxpumperla/elephas/issues/111

and we will include a copy of the modified class statement in the GitHub repository



In [31]:
# Spread the validation rows over 400 partitions before running predictions.
validation_df=validation_df.repartition(400)

### NOTE!

Enable the display code in the ml_model.py definition on the driver machine to get the details, but it won't work if you run the job on batch because display is an ipython class

In [32]:
# Call the fitted Elephas stage's private _transform directly.
# NOTE(review): `useModel=True` is presumably the keyword added by the locally modified
# ml_model.py described above; confirm that modified file is installed before running.
pred = fitted_pipeline.stages[-1]._transform(validation_df, useModel=True)

'Hi there - here are the predictions in array form'

[array([0.0000000e+00, 4.8938852e-29, 2.4161126e-38, 8.9783408e-18,
        7.5355738e-36, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        2.4967731e-26, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00], dtype=float32),
 array([0.00000000e+00, 4.30397502e-27, 1.42300264e-35, 1.09210975e-16,
        2.37756259e-33, 0.00000000e+00, 7.72808525e-37, 0.00000000e+00,
        1.35205217e-24, 0.00000000e+00, 2.23152579e-36, 1.00000000e+00,
        1.89933272e-37, 0.00000000e+00, 0.00000000e+00], dtype=float32)]

[(Row(features=DenseVector([0.0, 0.0, 0.0, 0.0, 1.0, 0.4054, 0.2102, 0.018, 1.0, 0.7537, 2.0, 3.0, 2.0, 1.0, 0.0, 5.0, 0.0, 0.0, 3.0, 1.0, 5.0, 1.0, 3.0, 4.0, 3.0, 4.0, 5.0, 3.0, 3.0, 3.0, 4.0, 0.0, 2.0, 5.0, 5.0, 3.0, 4.0, 1.0, 1.0, 5.0, 0.0, 1.0, 5.0, 4.0, 3.0, 0.0, 5.0, 3.0, 2.0, 1.0, 5.0, 1.0, 5.0, 5.0, 2.0, 3.0, 5.0, 0.0, 4.0, 1.0, 3.0, 0.0, 5.0, 0.0, 5.0, 3.0, 4.0, 5.0, 3.0, 0.0, 4.0, 1.0, 2.0, 4.0, 5.0, 4.0, 4.0, 5.0, 4.0, 0.0, 5.0, 4.0, 2.0, 3.0, 4.0, 3.0, 3.0, 3.0, 4.0, 3.0, 1.0, 2.0, 1.0, 4.0, 5.0, 2.0, 0.0, 3.0, 0.0, 3.0, 4.0, 4.0, 1.0, 2.0, 3.0, 5.0, 5.0, 0.0, 5.0, 3.0, 0.0, 0.0, 3.0, 0.0, 3.0, 5.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 3.0, 5.0, 5.0, 0.0, 0.0, 4.0, 1.0, 2.0, 4.0, 2.0, 2.0, 2.0, 5.0, 3.0, 1.0, 1.0, 5.0, 2.0, 4.0, 3.0, 4.0, 1.0, 0.0, 5.0, 2.0, 1.0, 1.0, 1.0, 3.0, 0.0, 5.0, 0.0, 1.0, 3.0, 3.0, 1.0, 0.0, 0.0, 4.0, 0.0, 4.0, 2.0, 4.0, 1.0, 1.0, 4.0, 1.0, 4.0, 3.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 0.0, 1.0, 2.0, 1.0, 5.0, 2.0, 4.0, 4.0, 2.0, 3

In [34]:
pred.printSchema()

root
 |-- features: vector (nullable = true)
 |-- target: double (nullable = true)
 |-- prediction: double (nullable = true)



In [35]:
pnl=pred.select("target","prediction")
pnl.show(100)

+------+----------+
|target|prediction|
+------+----------+
|  11.0|      13.0|
|   2.0|      13.0|
|   3.0|      13.0|
|   8.0|      13.0|
|  11.0|      13.0|
|   8.0|      13.0|
|  11.0|      13.0|
|  11.0|      13.0|
|  11.0|      13.0|
|   0.0|      13.0|
|   2.0|      13.0|
|  11.0|      13.0|
|   3.0|      13.0|
|  11.0|      13.0|
|   9.0|      13.0|
|   0.0|      13.0|
|   3.0|      13.0|
|   8.0|      13.0|
|   3.0|      13.0|
|   3.0|      13.0|
|   3.0|      13.0|
|   3.0|      13.0|
|  11.0|      13.0|
|   8.0|      13.0|
|   8.0|      13.0|
|  11.0|      13.0|
|   2.0|      13.0|
|  13.0|      13.0|
|   3.0|      13.0|
|  11.0|      13.0|
|   6.0|      13.0|
|   9.0|      13.0|
|  11.0|      13.0|
|   2.0|      13.0|
|  11.0|      13.0|
|   8.0|      13.0|
|  11.0|      13.0|
|   3.0|      13.0|
|  12.0|      13.0|
|   8.0|      13.0|
|   3.0|      13.0|
|   3.0|      13.0|
|   8.0|      13.0|
|   2.0|      13.0|
|  10.0|      13.0|
|   6.0|      13.0|
|  11.0|      13.0|


In [36]:
# MulticlassMetrics takes an RDD of (prediction, label) pairs, in that order. `pnl` is a
# DataFrame, so go through `.rdd` to map each row to a tuple. The original built
# (label, prediction) pairs, which leaves overall accuracy unchanged but would transpose
# the confusion matrix and swap per-class precision and recall.
prediction_and_label = pnl.rdd.map(lambda row: (row.prediction, row.target))
metrics = MulticlassMetrics(prediction_and_label)
# Overall accuracy. This is the value the deprecated no-argument precision() call returned.
print(metrics.accuracy)

0.023822855688470347


In [37]:
x1=np.array([0.1118802 , 0.07325512, 0.01614694, 0.02659923, 0.07440449,
        0.08224725, 0.06605115, 0.02206622, 0.0542385 , 0.04232709,
        0.05887228, 0.0560729 , 0.12828934, 0.10508711, 0.08246218], dtype='float32')
x2=np.array([0.11386207, 0.09637289, 0.01490068, 0.02821412, 0.06656378,
        0.080683  , 0.07982644, 0.01722281, 0.04762489, 0.03941712,
        0.07437494, 0.06128084, 0.08523843, 0.09439979, 0.10001823], dtype='float32')

In [38]:
np.argmax(x1)

12

In [39]:
np.argmax(x2)

0

In [40]:
20393/3600

5.664722222222222