In [1]:
import pyspark
sc = pyspark.SparkContext()

In [9]:
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql.functions import when
from kneed import KneeLocator
from yellowbrick.cluster import KElbowVisualizer
from pyspark.ml.feature import Normalizer, MinMaxScaler, VectorAssembler, StandardScaler
from pyspark.sql import Row, Column
from pyspark.sql.types import FloatType, DoubleType
import plotly.graph_objects as go
from pyspark.mllib.linalg import DenseVector

from pyspark.ml.classification import MultilayerPerceptronClassifier

from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.tree import LabeledPoint

from sklearn.metrics import confusion_matrix, roc_curve, auc
import numpy as np
import itertools

from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, mean, stddev

from pyspark.ml.feature import StringIndexer, VectorIndexer, StandardScaler, VectorAssembler


from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, ClusteringEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
spark = SparkSession.builder \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [60]:
clusters_path = "hdfs://kddrtserver13.isti.cnr.it:9000/user/hpsa15/kmeans_cluster_"
optimals_path = "hdfs://kddrtserver13.isti.cnr.it:9000/user/hpsa15/multilayerperceptron_optimals.csv"

- Numeri dei clusters

In [12]:
prediction_v = [0, 1, 2, 3, 4, 5]

- Lettura dei clusters

In [13]:
clusters_df = {}
for i in prediction_v:
    clusters_df[i] = spark.read.options(inferSchema = True, header = True)\
                .csv(clusters_path+str(i)+".csv")

In [16]:
features_numeric = ["age", "duration", "campaign", "pdays", "previous",
                  "emp_var_rate", "cons_price_idx", "cons_conf_idx", "euribor3m",
                  "nr_employed"]

In [3]:
maxIters = [100, 150, 200, 250]
blockSize = [32, 64, 128]
layers = [[10,6, 4, 2], [10, 8, 6, 4, 2], [10, 8, 2]]

In [4]:
tuned_param = {}
s = 0
for i in maxIters:
    for j in blockSize:
        for k in layers:
            tuned_param["sharp_"+str(s)] = {"maxIters": i, "blockSize": j, "layers": k}
            s = s + 1

In [32]:
tuned_param

{'sharp_0': {'maxIters': 100, 'blockSize': 32, 'layers': [10, 6, 4, 2]},
 'sharp_1': {'maxIters': 100, 'blockSize': 32, 'layers': [10, 8, 6, 4, 2]},
 'sharp_2': {'maxIters': 100, 'blockSize': 32, 'layers': [10, 8, 2]},
 'sharp_3': {'maxIters': 100, 'blockSize': 64, 'layers': [10, 6, 4, 2]},
 'sharp_4': {'maxIters': 100, 'blockSize': 64, 'layers': [10, 8, 6, 4, 2]},
 'sharp_5': {'maxIters': 100, 'blockSize': 64, 'layers': [10, 8, 2]},
 'sharp_6': {'maxIters': 100, 'blockSize': 128, 'layers': [10, 6, 4, 2]},
 'sharp_7': {'maxIters': 100, 'blockSize': 128, 'layers': [10, 8, 6, 4, 2]},
 'sharp_8': {'maxIters': 100, 'blockSize': 128, 'layers': [10, 8, 2]},
 'sharp_9': {'maxIters': 150, 'blockSize': 32, 'layers': [10, 6, 4, 2]},
 'sharp_10': {'maxIters': 150, 'blockSize': 32, 'layers': [10, 8, 6, 4, 2]},
 'sharp_11': {'maxIters': 150, 'blockSize': 32, 'layers': [10, 8, 2]},
 'sharp_12': {'maxIters': 150, 'blockSize': 64, 'layers': [10, 6, 4, 2]},
 'sharp_13': {'maxIters': 150, 'blockSize': 6

In [46]:
def grid(df, prediction_v, tuned_param):
    optimals = {}
    evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName='areaUnderROC')
    for i in prediction_v:
        assembler = VectorAssembler().setInputCols(features_numeric).setOutputCol("features")
        tmp_df = assembler.transform(df[i])
        tmp_df = tmp_df.withColumnRenamed("y","label")
        # Training e test
        train1, test = tmp_df.randomSplit([0.7, 0.3], seed = 2)
        # Training e validation
        train, val = train1.randomSplit([0.7, 0.3], seed = 2)
        f1 = 0
        s_tmp = ""
        print("------------ Grid per cluster " + str(i) + " ------------")
        for j in tuned_param:            
            mlp = MultilayerPerceptronClassifier(maxIter = tuned_param[j]["maxIters"], layers = tuned_param[j]["layers"], blockSize = tuned_param[j]["blockSize"], seed=1002)
            print("Effettuato " + str(j))
            model = mlp.fit(train)

            # Make predicitons
            predictionAndTarget = model.transform(val).select("label", "prediction")
            
            f1_new = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
            # Massimizziamo F1
            if(f1_new > f1):
                s_tmp = j
        print(" > Parametri migliori ")
        print(s_tmp)
        # Metriche Reali
        optimals[i] = {"maxIter": tuned_param[s_tmp]["maxIters"], "layers" : tuned_param[s_tmp]["layers"], "blockSize": tuned_param[s_tmp]["blockSize"]}
        print(optimals[i])
        mlp = MultilayerPerceptronClassifier(maxIter = tuned_param[s_tmp]["maxIters"], layers = tuned_param[s_tmp]["layers"], blockSize = tuned_param[s_tmp]["blockSize"], seed=1002)
        model = mlp.fit(train1)
        predictionAndTarget = model.transform(test).select("label", "prediction")
        print(" > Valutazioni")
        acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
        f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
        weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
        weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
        auc = evaluator.evaluate(predictionAndTarget)
        print(" - Accuracy = " + str(acc))
        print(" - F1 score = " + str(f1))
        print(" - Weighted Precision = " + str(weightedPrecision))
        print(" - Weighted Recall = " + str(weightedRecall))
        print(" - AUC = " + str(auc))
    return optimals

In [47]:
optimals = grid(clusters_df, prediction_v, tuned_param)

------------ Grid per cluster 0 ------------
Effettuato sharp_0
Effettuato sharp_1
Effettuato sharp_2
Effettuato sharp_3
Effettuato sharp_4
Effettuato sharp_5
Effettuato sharp_6
Effettuato sharp_7
Effettuato sharp_8
Effettuato sharp_9
Effettuato sharp_10
Effettuato sharp_11
Effettuato sharp_12
Effettuato sharp_13
Effettuato sharp_14
Effettuato sharp_15
Effettuato sharp_16
Effettuato sharp_17
Effettuato sharp_18
Effettuato sharp_19
Effettuato sharp_20
Effettuato sharp_21
Effettuato sharp_22
Effettuato sharp_23
Effettuato sharp_24
Effettuato sharp_25
Effettuato sharp_26
Effettuato sharp_27
Effettuato sharp_28
Effettuato sharp_29
Effettuato sharp_30
Effettuato sharp_31
Effettuato sharp_32
Effettuato sharp_33
Effettuato sharp_34
Effettuato sharp_35
 > Parametri migliori 
sharp_35
{'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128}
 > Valutazioni
 - Accuracy = 0.764065335753176
 - F1 score = 0.6618755274734303
 - Weighted Precision = 0.5837958372996137
 - Weighted Recall = 0.7640653357

In [48]:
optimals

{0: {'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128},
 1: {'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128},
 2: {'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128},
 3: {'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128},
 4: {'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128},
 5: {'maxIter': 250, 'layers': [10, 8, 2], 'blockSize': 128}}

In [57]:
optimals[0]["layers"][1]

8

- Salvataggio valori ottimi (backup)

In [58]:
"""
# Eseguire solo se è stata eseguita la crossvalidation
def opt_towrite(optimals):
    ll = []
    for i in range(0, 6):
        x ={}
        x["clusters"] = i
        x["maxIter"] = optimals[i]["maxIter"]
        ss = str(optimals[i]["layers"][0])
        for j in range(1, len(optimals[i]["layers"])):
            ss = ss+"_"+str(optimals[i]["layers"][j])
        x["layers"] = ss
        x["blockSize"] = optimals[i]["blockSize"]
        ll.append(x)
    df_optimals = spark.createDataFrame(ll)
    df_optimals.write.format("csv").save("hdfs://kddrtserver13.isti.cnr.it:9000/user/hpsa15/multilayerperceptron_optimals.csv", header = True)

opt_towrite(optimals)
"""

In [61]:
"""
optimals_df = spark.read.options(inferSchema = True, header = True)\
                .csv(optimals_path)    
optimals_df.show()
"""

+---------+--------+------+-------+
|blockSize|clusters|layers|maxIter|
+---------+--------+------+-------+
|      128|       1|10_8_2|    250|
|      128|       2|10_8_2|    250|
|      128|       4|10_8_2|    250|
|      128|       5|10_8_2|    250|
|      128|       0|10_8_2|    250|
|      128|       3|10_8_2|    250|
+---------+--------+------+-------+

