In [0]:
# Read the LIBSVM-formatted dataset into a DataFrame.
# `spark` is presumably the session object provided by the notebook runtime.
data = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/ml/simpleNN.txt')
)

In [0]:
# Print the first rows of the dataset without truncating long column values.
data.show(truncate = False)

In [0]:
# Print a single row, untruncated, to inspect how the feature vector is stored.
data.show(n=1, truncate = False)

# Dense and Sparse Vectors
##### dense_vec = [1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0......0, 1, 1, 0  ........0, 0, 0, 0, 0, 0, 0, 0]    ---> (800 KB of memory)
##### sparsevec = [100000, [0, 12, 10005, 10006], [1, 1, 1, 1]]   ---> (9 units: 4 for the indices, 4 for the values, plus 1 for the length)


- sparsevec[0] = 1
- sparsevec[1] = 0
- sparsevec[2] = 0
- sparsevec[12] = 1
- sparsevec[10005] = 1

## Example
data_1 = (4,[0,2,3],[-0.944444,-0.898305,-0.916667])

- feature 1 = data_1[0] = -0.944444
- feature 2 = data_1[1] = 0 (index 1 is not listed among the indices, so its value is implicitly 0)
- feature 3 = data_1[2] = -0.898305
- feature 4 = data_1[3] = -0.916667

In [0]:
# Render the DataFrame with the notebook's rich table display.
display(data)

In [0]:
# List the distinct values of the label column (i.e. the classes present).
data.select('label').distinct().show()

In [0]:
# Count rows per label to check how balanced the classes are.
data.groupBy('label').count().show()

In [0]:
# Split the rows roughly 70% / 30% into training and test sets.
# The seed makes the split repeatable: without it randomSplit draws a new
# random partition on every run, so every downstream metric would change.
train, test = data.randomSplit([0.7, 0.3], seed=45)

In [0]:
# Mark the training split for caching; it is reused by several fit() calls below.
train.cache()

In [0]:
# Mark the test split for caching; it is reused by several transform() calls below.
test.cache()

# Let's train a Neural Network

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [0]:
# Network topology, one entry per layer:
#   - input layer:   4 neurons (one per feature)
#   - hidden layers: two layers of 10 neurons each
#   - output layer:  3 neurons (one per class: 0, 1 and 2)
layers = [4, 10, 10, 3]


In [0]:
# Configure the multilayer-perceptron estimator (nothing is trained yet):
# it reads 'features' / 'label', uses the topology in `layers`, runs at most
# 50 iterations and uses a fixed seed.
NNclassifier = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    layers=layers,
    maxIter=50,
    seed=45,
)

In [0]:
# explainParams() returns one newline-separated string; print it so each
# parameter appears on its own line instead of as an escaped '\n' repr.
print(NNclassifier.explainParams())

In [0]:
# Show the documentation and current/default value of the 'solver' parameter.
NNclassifier.explainParam('solver')

In [0]:
# Show the documentation and current value of the 'maxIter' parameter.
NNclassifier.explainParam('maxIter')

In [0]:
# Train the network on the training split.
model = NNclassifier.fit(train)  # fit() on the estimator returns a fitted model, which is a transformer

In [0]:
# Apply the fitted model to the held-out test split to obtain predictions.
preds = model.transform(test)

In [0]:
# Preview the prediction DataFrame (long values are truncated by default).
preds.show()

In [0]:
# Show one prediction row in full, without truncating the vector columns.
preds.show(n=1, truncate = False)

In [0]:
# Fetch the first prediction row to the driver as a list containing one Row.
preds.take(1)

In [0]:
# Keep only the two columns the evaluator needs: predicted class and true class.
predictionsAndLabels = preds.select('prediction', 'label')

In [0]:
# Preview predicted vs. true labels side by side.
predictionsAndLabels.show()

In [0]:
# Re-list the estimator's parameters. explainParams() returns one
# newline-separated string, so print it to get one parameter per line
# rather than an escaped '\n' repr.
print(NNclassifier.explainParams())

# Evaluating classifiers: Confusion Matrix

![Confusion Matrix](images/confusionMatrix.png)

## Classification Metrics
- __Accuracy__: measures the fraction of data points classified correctly. Accuracy is not a good metric for skewed classes (for instance, when 90% of the labels in the training set are class A and only 10% are class B).

$$Accuracy=\frac{TP+TN}{TP+TN+FP+FN} $$

Where:

- TP: number of true positives
- FP: number of false positives
- TN: number of true negatives
- FN: number of false negatives

- __Precision__: Out of all the examples the classifier labeled as positive, what fraction were correct?

$$Precision=\frac{TP}{TP+FP} $$

- __Recall__: Out of all the positive examples, what fraction did the classifier pick up?

$$Recall=\frac{TP}{TP+FN} $$

- __F1 score__: The harmonic mean of precision and recall. Its best value is 1 (when both precision and recall are 1) and its worst value is 0.

$$F1=2 \times \frac{Precision \times Recall}{Precision + Recall} $$

## More on:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Evaluator that scores a prediction DataFrame with the F1 metric,
# reading the predicted class from the 'prediction' column.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    metricName='f1',
)

In [0]:
# Show the documentation and current value of the evaluator's 'metricName' parameter.
evaluator.explainParam('metricName')

In [0]:
# explainParams() returns one newline-separated string; print it so each
# parameter appears on its own line instead of as an escaped '\n' repr.
print(evaluator.explainParams())

In [0]:
# Compute the configured metric for the neural network's test-set predictions.
evaluationNN = evaluator.evaluate(predictionsAndLabels)

In [0]:
# Report the neural network's F1 score on the test set.
# (Fixes the misspelled "NeauralNetwork" label in the printed message.)
print('Neural Network F1 = %g' % evaluationNN)

In [0]:
# Layer sizes of the fitted network. On Spark 3.x `layers` is a Param
# descriptor on the model rather than the list of sizes, so read the value
# through getLayers() (the same accessor this notebook uses on bestModel).
model.getLayers()


In [0]:
# Layer sizes configured on the (unfitted) estimator.
NNclassifier.getLayers()

In [0]:
# The learned weights of the fitted network, exposed as a single vector.
model.weights

In [0]:
# Number of entries in the weight vector, i.e. the number of trained parameters.
len(model.weights)


## Number of parameters of our NN
- layers = [4, 10, 10, 3]
- Number of parameters (weights + biases) between the input layer and hidden layer 1 = 10 * (4 + 1) = 50
- Number of parameters (weights + biases) between hidden layer 1 and hidden layer 2 = 10 * (10 + 1) = 110
- Number of parameters (weights + biases) between hidden layer 2 and the output layer = 3 * (10 + 1) = 33

Number of params = 50 + 110 + 33 = 193

# Model Tuning

Spark provides 2 efficient classes that facilitate model tuning: **ParamGridBuilder and CrossValidator**:

More on: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.tuning

## Pipeline
- A pipeline acts as an estimator. 
- A Pipeline consists of a sequence of stages, each of which is either an Estimator or a Transformer. 
- When Pipeline.fit is called, the stages are executed in order. 
- If a stage is an Estimator its Estimator.fit method will be called on the input dataset to fit a model. Then the model, which is a transformer, will be used to transform the dataset as the input to the next stage. 
- If a stage is a Transformer, its Transformer.transform method will be called to produce the dataset for the next stage.

## ParamGridBuilder

Builder for a param grid used in grid search-based model selection

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
#help(Pipeline)

In [0]:
#help(CrossValidator)

In [0]:
# Single-stage pipeline wrapping the neural-network estimator.
pipeLine = Pipeline(stages=[NNclassifier])
# Echo the pipeline object as the cell output.
pipeLine

In [0]:
# Hyper-parameter grid for the search:
#   4 layer layouts x 2 step sizes x 2 iteration caps = 16 parameter combinations.
# Every layout keeps 4 inputs and 3 outputs; [4, 3] has no hidden layer at all.
# NOTE(review): stepSize presumably only takes effect with solver='gd', while the
# classifier above is left on its default solver - confirm that this axis matters.
gridBuilder = (
    ParamGridBuilder()
    .addGrid(NNclassifier.layers, [[4, 3, 3, 3], [4, 2, 2, 3], [4, 3], [4, 10, 3]])
    .addGrid(NNclassifier.stepSize, [0.03, 0.1])
    .addGrid(NNclassifier.maxIter, [100, 200])
    .build()
)


In [0]:
# Show the documentation and current/default value of the 'stepSize' parameter.
NNclassifier.explainParam('stepSize')

In [0]:
# Cross-validated grid search: fits the pipeline for every parameter map in
# the grid and scores each candidate with the F1 evaluator.
# The seed pins the fold assignment so the search is repeatable between
# runs; numFolds is spelled out for readability (3 is the documented default).
cv = CrossValidator(
    estimator=pipeLine,
    estimatorParamMaps=gridBuilder,
    evaluator=evaluator,
    numFolds=3,
    seed=45,
)

In [0]:
# Run the cross-validated grid search on the training split.
# This fits one model per parameter map per fold, so it is the slowest cell here.
cvm = cv.fit(train)

In [0]:
# Score the test split with the cross-validator model (it delegates to its best model).
predictions = cvm.transform(test)

In [0]:
# F1 score of the tuned model on the test split.
evaluator.evaluate(predictions)

# Let's see which model was the best!

In [0]:
# The best model is a fitted pipeline; its only stage (index 0) is the fitted classifier.
bestModel = cvm.bestModel.stages[0]

In [0]:
# Underlying array of the best model's weight vector.
bestModel.weights.array

In [0]:
# Layer layout that won the grid search.
bestModel.getLayers()

In [0]:
# Average cross-validation metric for each parameter map, in the same order as the grid.
cvm.avgMetrics


# Now, try other classifiers:

To learn about the different classifiers implemented in Spark:

https://spark.apache.org/docs/3.0.0/api/python/pyspark.ml.html#

Next, we will train and evaluate three of them:
   1. LogisticRegression
   1. DecisionTreeClassifier
   1. RandomForestClassifier

In [0]:
from pyspark.ml.classification import (LogisticRegression, 
                                       DecisionTreeClassifier, 
                                       RandomForestClassifier
                                      )

In [0]:
# Three further estimators to compare against the neural network.
# Logistic regression and the decision tree keep their library defaults;
# the random forest overrides tree depth and bin count.
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(
    maxDepth=10,
    maxBins=10,
)


In [0]:
# explainParams() returns one newline-separated string; print it so each
# parameter appears on its own line instead of as an escaped '\n' repr.
print(lr.explainParams())

In [0]:
# Show the documentation and current/default value of the 'regParam' parameter.
lr.explainParam("regParam")

In [0]:
# Fit the logistic-regression estimator on the training split.
lrm = lr.fit(train)

In [0]:
# Fit the decision-tree estimator on the training split.
dtm = dt.fit(train)

In [0]:
# Fit the random-forest estimator on the training split.
rfm = rf.fit(train)

In [0]:
# Score the test split with each fitted model, in the order
# logistic regression, decision tree, random forest.
lrm_results, dtm_results, rfm_results = (
    fitted.transform(test) for fitted in (lrm, dtm, rfm)
)


In [0]:
# Evaluate each model's test-set predictions with the shared evaluator,
# in the order logistic regression, decision tree, random forest.
evaluation_lrm, evaluation_dtm, evaluation_rfm = [
    evaluator.evaluate(scored)
    for scored in (lrm_results, dtm_results, rfm_results)
]

In [0]:
# Report the three scores, one line per model.
print('evaluation of logistic regression model = %g' % evaluation_lrm)
print('evaluation of decision tree model = %g' % evaluation_dtm)
print('evaluation of random forest model = %g' % evaluation_rfm)

In [0]:
# Second random-forest configuration: same maxDepth, maxBins raised from 10 to 50.
# This rebinds `rf`, replacing the estimator object created earlier.
rf = RandomForestClassifier(maxDepth=10, maxBins=50)

In [0]:
# Fit the reconfigured random forest on the training split.
rfm_2 = rf.fit(train)

In [0]:
# Score the test split with the second random-forest model.
results = rfm_2.transform(test)

In [0]:
# Evaluate the second random forest's test-set predictions with the shared evaluator.
evaluator.evaluate(results)