# Machine Learning con la librería MLlib  de Spark

En el presente script construiremos una red neuronal para clasificación múltiple

Todos los procesos serán coleccionados en un Pipeline explícito 

## Load the libraries

In [1]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import * #<-- importa todos los tipos de dato como: StringType, FloatType DoubleType, DateType, etc.
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, MinMaxScaler

## Initialize Spark Session

In [2]:
# Create the Spark session for this notebook (an already-running session is reused).
session_builder = SparkSession.builder.appName('deep_learning')
spark = session_builder.getOrCreate()

In [3]:
# Echo the session object so the notebook displays it.
spark

## Carga y limpieza de datos

In [4]:
# Load the dataset from CSV: the first row is the header and column types are inferred.
csv_options = {'header': True, 'inferSchema': True}
data = spark.read.csv('./data/dl_data.csv', **csv_options)

In [5]:
# Preview the first 5 rows of the raw dataset.
data.show(5)

+-------------------+---------------------+-----------------+-------------------------------------+-------------------------------+-----------------------+------------------------------+-------------------------------+-------------------------+-------------------------+-------------------+------------------+-------------------+-----------------+------------------+------------------+-------------------------+
|Visit_Number_Bucket|Page_Views_Normalized|Orders_Normalized|Internal_Search_Successful_Normalized|Internal_Search_Null_Normalized|Email_Signup_Normalized|Total_Seconds_Spent_Normalized|Store_Locator_Search_Normalized|Mapped_Last_Touch_Channel|Mapped_Mobile_Device_Type|Mapped_Browser_Type|Mapped_Entry_Pages|Mapped_Site_Section|Mapped_Promo_Code|Maped_Product_Name|Mapped_Search_Term|Mapped_Product_Collection|
+-------------------+---------------------+-----------------+-------------------------------------+-------------------------------+-----------------------+---------------------

Renombramos las columnas ya que los nombres son muy largos. Esto no es necesario, sólo es para poder visualizar los DataFrames en la pantalla

In [6]:
# Give every column a short positional alias (x_1, x_2, ...) so wide tables fit on screen.
for position, original_name in enumerate(data.columns, start=1):
    data = data.withColumnRenamed(original_name, 'x_' + str(position))

data.show(5)

+-------------+-----------+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+-------------+------------+-------------------+
|          x_1|        x_2|x_3|x_4|x_5|x_6|        x_7|x_8|     x_9|   x_10|    x_11|       x_12|         x_13|       x_14|         x_15|        x_16|               x_17|
+-------------+-----------+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+-------------+------------+-------------------+
|less than  11|0.026315789|  1|0.0|0.0|0.0|  4.5819E-4|0.0|Channel1|Device1|Browser1|Entry_Page1|Site_Section1|Promo_Code1|Product_Name1|Search_Term1|Product_Collection1|
|less than  11|        0.0|  1|0.0|0.0|0.0|        0.0|0.0|Channel1|Device1|Browser2|Entry_Page2|Site_Section1|Promo_Code1|Product_Name1|Search_Term1|Product_Collection2|
|less than  11|        0.0|  1|0.0|0.0|0.0|0.002863688|0.0|Channel1|Device1|Browser1|Entry_Page2|Site_Section1|Promo_Code1|Product_Name2|Search_T

In [7]:
# DataFrame dimensions as (row count, column count):

data.count() , len(data.columns)

(73487, 17)

### Seleccionamos variables para trabajar:

In [8]:
# Columns that will NOT be used:
unused_cols = [] #<-- leave empty to use every column

In [9]:
# Columns that will be used: everything not listed in `unused_cols`, in the original order.
selected_cols = list(filter(lambda name: name not in unused_cols, data.columns))
selected_cols

['x_1',
 'x_2',
 'x_3',
 'x_4',
 'x_5',
 'x_6',
 'x_7',
 'x_8',
 'x_9',
 'x_10',
 'x_11',
 'x_12',
 'x_13',
 'x_14',
 'x_15',
 'x_16',
 'x_17']

In [10]:
# Restrict the DataFrame to the selected columns and preview the result.
data = data.select( selected_cols )
data.show(5)

+-------------+-----------+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+-------------+------------+-------------------+
|          x_1|        x_2|x_3|x_4|x_5|x_6|        x_7|x_8|     x_9|   x_10|    x_11|       x_12|         x_13|       x_14|         x_15|        x_16|               x_17|
+-------------+-----------+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+-------------+------------+-------------------+
|less than  11|0.026315789|  1|0.0|0.0|0.0|  4.5819E-4|0.0|Channel1|Device1|Browser1|Entry_Page1|Site_Section1|Promo_Code1|Product_Name1|Search_Term1|Product_Collection1|
|less than  11|        0.0|  1|0.0|0.0|0.0|        0.0|0.0|Channel1|Device1|Browser2|Entry_Page2|Site_Section1|Promo_Code1|Product_Name1|Search_Term1|Product_Collection2|
|less than  11|        0.0|  1|0.0|0.0|0.0|0.002863688|0.0|Channel1|Device1|Browser1|Entry_Page2|Site_Section1|Promo_Code1|Product_Name2|Search_T

In [11]:
# Print the column types that Spark inferred from the CSV.
data.printSchema()

root
 |-- x_1: string (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: integer (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: string (nullable = true)
 |-- x_10: string (nullable = true)
 |-- x_11: string (nullable = true)
 |-- x_12: string (nullable = true)
 |-- x_13: string (nullable = true)
 |-- x_14: string (nullable = true)
 |-- x_15: string (nullable = true)
 |-- x_16: string (nullable = true)
 |-- x_17: string (nullable = true)



### Ajustamos tipos de datos:

In [12]:
# Names of the categorical columns, i.e. those whose Spark dtype is a string.
categ_cols = [name for name, dtype in data.dtypes if dtype.startswith('string')]
categ_cols

['x_1', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17']

In [13]:
# Normalise the schema: categorical columns are cast to String and every
# other column is cast to Double.
#
# All casts are applied in one select() instead of one withColumn() call per
# column, because each withColumn() adds another projection to the query plan.
# The comprehension variable is named `col_name` so that nothing at module
# level shadows `pyspark.sql.functions.col`, which later cells import.
data = data.select([
    data[col_name]
    .cast(StringType() if col_name in categ_cols else DoubleType())
    .alias(col_name)  # keep the original column name and position
    for col_name in data.columns
])

In [14]:
# Print the schema again to confirm the casts applied above.
data.printSchema()

root
 |-- x_1: string (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: double (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: string (nullable = true)
 |-- x_10: string (nullable = true)
 |-- x_11: string (nullable = true)
 |-- x_12: string (nullable = true)
 |-- x_13: string (nullable = true)
 |-- x_14: string (nullable = true)
 |-- x_15: string (nullable = true)
 |-- x_16: string (nullable = true)
 |-- x_17: string (nullable = true)



### Identificamos columnas numéricas y categóricas

In [15]:
# Partition the column names by dtype: string columns are categorical, the rest numerical.
data_types = data.dtypes
numerical_columns = [name for name, dtype in data_types if dtype != 'string']
categoric_columns = [name for name, dtype in data_types if dtype.startswith('string')]

In [16]:
# Report which columns were classified as numerical and which as categorical.
print('Columnas numéricas: \n', numerical_columns)
print('\nColumnas categóric: \n', categoric_columns)

Columnas numéricas: 
 ['x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8']

Columnas categóric: 
 ['x_1', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17']


### Trabajamos con campos nulos:

In [17]:
from pyspark.sql.functions import isnull, when, count, col

# Count the null entries of every column and collect the one-row result into pandas.
null_counters = [count(when(isnull(c), True)).alias(c) for c in data.columns]
df_nulls = data.select(null_counters).toPandas()
df_nulls

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,x_12,x_13,x_14,x_15,x_16,x_17
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Validate that no column contains nulls; abort with an explicit error otherwise.
for c in df_nulls.columns:
    # df_nulls has a single row (index 0) holding the null count of column `c`.
    n_nulls = df_nulls[c][0]
    if n_nulls > 0:
        print('Error!!! La columna',c,'tiene',n_nulls,'campos nulos' )
        # The original called `stop()`, which is not defined anywhere, so execution
        # only halted through an accidental NameError. Raise a descriptive error instead.
        raise ValueError(
            'Column {} has {} null entries; clean the data before continuing.'.format(c, n_nulls)
        )
    print('No se encontraron campos nulos en la columna:',c)

### Obtenemos algunos gráficos

In [18]:
#from pandas_profiling import ProfileReport

#df_pandas = data.toPandas()

#pfr = ProfileReport(df_pandas)
#pfr.to_notebook_iframe()
#pfr

## Extraemos las clases de datos que hay en cada columna

In [19]:
# Count how many distinct values each column holds.

from pyspark.sql.functions import countDistinct, col

distinct_counters = [countDistinct(col(c)).alias(c) for c in data.columns]
data.select(distinct_counters).toPandas()

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,x_12,x_13,x_14,x_15,x_16,x_17
0,8,27,2,9,5,5,1631,5,12,4,9,30,31,28,31,31,31


## Identificamos la columna de labels y las columnas de features:

In [20]:
# Name of the column used as the classification label (an earlier alternative was "x_3").
label_col = "x_10"

# Distinct values (classes) present in the label column, collected into pandas.
distinct_labels = data.select(label_col).distinct()
classes = distinct_labels.toPandas()
classes

Unnamed: 0,x_10
0,Device3
1,Device1
2,Device4
3,Device2


In [21]:
# Number of output classes = number of rows in the distinct-labels frame.
n_class_out = classes.shape[0]
n_class_out

4

In [22]:
# Feature columns: every selected column except the label.
feature_cols = [name for name in selected_cols if name != label_col]
feature_cols

['x_1',
 'x_2',
 'x_3',
 'x_4',
 'x_5',
 'x_6',
 'x_7',
 'x_8',
 'x_9',
 'x_11',
 'x_12',
 'x_13',
 'x_14',
 'x_15',
 'x_16',
 'x_17']

### Split the dataset into Train, Validation and Test

In [23]:
# 70% / 20% / 10% random split with a fixed seed.
split_weights = [0.7, 0.2, 0.1]
train, validation, test = data.randomSplit(split_weights, 1234)

In [24]:
# Preview the first 5 rows of the training split.
train.show(5)

+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+
|     x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|     x_9|   x_10|    x_11|        x_12|          x_13|        x_14|         x_15|        x_16|                x_17|
+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel1|Device1|Browser2| Entry_Page2| Site_Section1| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel4|Device2|Browser2| Entry_Page2| Site_Section7| Promo_Code1|Product_Name1|Search_Term2| Product_Collection1|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel5|Device1|Browser1|Entry_Page15|Site_Section18| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel5|Device1|Browser1| Entry_Page2

# Inicia creación de Pipeline

Antes de llegar a este paso debemos tener el dataset limpio y listo para trabajar.

Estaremos usando la función `StringIndexer` que asigna un valor entero a cada categoría de datos, iniciando forzosamente desde 0.

0 se asigna a la categoría más frecuente, 1 a la siguiente categoría más frecuente y así sucesivamente.


### Stage 1: Creación de columna de labels codificados

In [25]:
# Stage 1: encode the label column into the numeric "labels" column.
Stage_1 = StringIndexer(
    inputCol=label_col,
    outputCol="labels",
)
Stage_1

StringIndexer_94b0fa41eb8e

In [26]:
# Preview this stage's output: fit the label indexer on the training split and apply it.
Scaler_1 = Stage_1.fit(train)
data_transform = Scaler_1.transform(train)
data_transform.show(5)

+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+
|     x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|     x_9|   x_10|    x_11|        x_12|          x_13|        x_14|         x_15|        x_16|                x_17|labels|
+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel1|Device1|Browser2| Entry_Page2| Site_Section1| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4|   0.0|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel4|Device2|Browser2| Entry_Page2| Site_Section7| Promo_Code1|Product_Name1|Search_Term2| Product_Collection1|   1.0|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel5|Device1|Browser1|Entry_Page15|Site_Section18| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4|   0.0|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|

### Stage 2: Transformación de features categóricos a numéricos

In [27]:
# Categorical columns that are part of the features (the label column is excluded).
categoric_cols_features = [name for name in categoric_columns if name != label_col]
categoric_cols_features

['x_1', 'x_9', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17']

In [28]:
# Names of the numeric columns that will hold the indexed categorical features.
categoric_cols_features_num = ['{}_num'.format(name) for name in categoric_cols_features]
categoric_cols_features_num

['x_1_num',
 'x_9_num',
 'x_11_num',
 'x_12_num',
 'x_13_num',
 'x_14_num',
 'x_15_num',
 'x_16_num',
 'x_17_num']

In [29]:
# Stage 2: one StringIndexer per categorical feature, writing to its "<name>_num" column.
# handleInvalid='keep' is set on each indexer.
Stage_2 = [
    StringIndexer(inputCol=source_col, outputCol=indexed_col, handleInvalid='keep')
    for source_col, indexed_col in zip(categoric_cols_features, categoric_cols_features_num)
]

Stage_2

[StringIndexer_61291796020f,
 StringIndexer_8ba84babd04e,
 StringIndexer_5aec717833af,
 StringIndexer_66e9efe1bb6b,
 StringIndexer_78b55cabe1e2,
 StringIndexer_fc9049dcdace,
 StringIndexer_36f743a4d6bc,
 StringIndexer_68b146bc0cc2,
 StringIndexer_4eecafe362b6]

In [30]:
# Preview this stage's output: fit and apply each indexer in turn, accumulating the new columns.
for indexer in Stage_2:
    data_transform = indexer.fit(data_transform).transform(data_transform)

data_transform.show(5)

+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+
|     x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|     x_9|   x_10|    x_11|        x_12|          x_13|        x_14|         x_15|        x_16|                x_17|labels|x_1_num|x_9_num|x_11_num|x_12_num|x_13_num|x_14_num|x_15_num|x_16_num|x_17_num|
+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel1|Device1|Browser2| Entry_Page2| Site_Section1| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4|   0.0|    4.0|    7.0|     1.0|     1.0|     0.0|     0.0|     0.0|     0.0|     0.0|
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Ch

### Stage 3: Creación de columna de vectores de features

In [31]:
# Numerical columns that are part of the features (the label column is excluded).
numeric_cols_features = [name for name in numerical_columns if name != label_col]
numeric_cols_features

['x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8']

In [32]:
# All numeric inputs of the feature vector: original numerical columns first, then the indexed ones.
required_features = [*numeric_cols_features, *categoric_cols_features_num]
required_features

['x_2',
 'x_3',
 'x_4',
 'x_5',
 'x_6',
 'x_7',
 'x_8',
 'x_1_num',
 'x_9_num',
 'x_11_num',
 'x_12_num',
 'x_13_num',
 'x_14_num',
 'x_15_num',
 'x_16_num',
 'x_17_num']

In [33]:
# Stage 3: assemble the required feature columns into the single vector column 'features'.
Stage_3 = VectorAssembler(inputCols=required_features, outputCol='features')
Stage_3

VectorAssembler_e8a5692e495c

In [34]:
# Preview this stage's output (VectorAssembler is applied directly, with no fit step here).
data_transform = Stage_3.transform(data_transform)
data_transform.show(5)

+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------------------+
|     x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|     x_9|   x_10|    x_11|        x_12|          x_13|        x_14|         x_15|        x_16|                x_17|labels|x_1_num|x_9_num|x_11_num|x_12_num|x_13_num|x_14_num|x_15_num|x_16_num|x_17_num|            features|
+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------------------+
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel1|Device1|Browser2| Entry_Page2| Site_Section1| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4|   0.0|    4.0|    7.0|     1.0|     1.0|     0.0|     0.0|     

### Stage 4: Creación de columna de vectores de features reescalados (entre 0 y 1)

In [35]:
# Stage 4: rescale the feature vector so that its values lie between 0 and 1.
Stage_4 = MinMaxScaler(
    inputCol="features",
    outputCol="scaled_features",
    min=0.0,
    max=1.0,
)
Stage_4

MinMaxScaler_764b7c2a2ce3

In [36]:
# Preview this stage's output: fit the scaler, apply it, and compare raw vs. scaled vectors.
Scaler_4 = Stage_4.fit(data_transform)
data_transform = Scaler_4.transform(data_transform)
data_transform.show(5)
data_transform.select(['features','scaled_features']).show(5,truncate=False)


+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------------------+--------------------+
|     x_1|x_2|x_3|x_4|x_5|x_6|x_7|x_8|     x_9|   x_10|    x_11|        x_12|          x_13|        x_14|         x_15|        x_16|                x_17|labels|x_1_num|x_9_num|x_11_num|x_12_num|x_13_num|x_14_num|x_15_num|x_16_num|x_17_num|            features|     scaled_features|
+--------+---+---+---+---+---+---+---+--------+-------+--------+------------+--------------+------------+-------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------------------+--------------------+
|above 70|0.0|0.0|0.0|0.0|0.0|0.0|0.0|Channel1|Device1|Browser2| Entry_Page2| Site_Section1| Promo_Code1|Product_Name1|Search_Term1| Product_Collection4| 

In [37]:
# Print the schema of the transformed frame, including the columns added by the stages.
data_transform.printSchema()

root
 |-- x_1: string (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: double (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: string (nullable = true)
 |-- x_10: string (nullable = true)
 |-- x_11: string (nullable = true)
 |-- x_12: string (nullable = true)
 |-- x_13: string (nullable = true)
 |-- x_14: string (nullable = true)
 |-- x_15: string (nullable = true)
 |-- x_16: string (nullable = true)
 |-- x_17: string (nullable = true)
 |-- labels: double (nullable = false)
 |-- x_1_num: double (nullable = false)
 |-- x_9_num: double (nullable = false)
 |-- x_11_num: double (nullable = false)
 |-- x_12_num: double (nullable = false)
 |-- x_13_num: double (nullable = false)
 |-- x_14_num: double (nullable = false)
 |-- x_15_num: double (nullable = false)
 |-- x_16_num: double (nullable = false)
 |-- x_17_num: double (null

### Stage 5: Construcción del modelo

In [38]:
# Size (dimension) of each feature vector = number of assembled input columns:
dim_xi = len( required_features )
dim_xi

16

In [39]:
# Number of neurons per layer: input sized to the feature vector, two layers of 10,
# and an output layer with one neuron per class.
layers = [ dim_xi, 10, 10, n_class_out]
layers

[16, 10, 10, 4]

In [40]:
# Batch size: 1% of the training rows, rounded to an integer (a fixed 128 was an earlier alternative).
train_rows = train.count()
batch_size = round( 0.01*train_rows )
batch_size

514

In [41]:
# Stage 5: declare the multilayer-perceptron classifier.
# NOTE(review): stepSize is presumably only honoured by the 'gd' solver, so it may
# have no effect with 'l-bfgs' -- confirm against the MLlib documentation.
mlp_params = dict(
    featuresCol='scaled_features',
    labelCol='labels',
    layers=layers,
    solver='l-bfgs',
    maxIter=200,
    blockSize=batch_size,
    stepSize=0.05,
    seed=1234,
)
Stage_5 = MultilayerPerceptronClassifier(**mlp_params)

Stage_5

MultilayerPerceptronClassifier_82b36949e200

## Ensamble del Pipeline

In [42]:
# Gather every stage, in execution order, into one flat list.
Stages_list = [Stage_1, *Stage_2, Stage_3, Stage_4, Stage_5]
Stages_list

[StringIndexer_94b0fa41eb8e,
 StringIndexer_61291796020f,
 StringIndexer_8ba84babd04e,
 StringIndexer_5aec717833af,
 StringIndexer_66e9efe1bb6b,
 StringIndexer_78b55cabe1e2,
 StringIndexer_fc9049dcdace,
 StringIndexer_36f743a4d6bc,
 StringIndexer_68b146bc0cc2,
 StringIndexer_4eecafe362b6,
 VectorAssembler_e8a5692e495c,
 MinMaxScaler_764b7c2a2ce3,
 MultilayerPerceptronClassifier_82b36949e200]

In [43]:
# Assemble the (still unfitted) pipeline from the ordered list of stages.
pipeline = Pipeline(stages = Stages_list )
pipeline

Pipeline_e28408d1d945

## Ejecución del Pipeline

In [44]:
# Run the pipeline: the stages are fitted on the training split, which trains the classifier.

model = pipeline.fit(train)

model # <-- holds the trained model (a fitted PipelineModel)

PipelineModel_d03dc96b0959

## Predecimos datos de entrenamiento, validación y pruebas:

In [45]:
# Columns displayed when inspecting predictions.
cols_to_show = [
    'labels',
    'scaled_features',
    'rawPrediction',
    'probability',
    'prediction',
]

In [46]:
# Score the training split with the fitted pipeline and preview the prediction columns.
train_prediction = model.transform(train)
train_prediction.select(*cols_to_show).show(5)

+------+--------------------+--------------------+--------------------+----------+
|labels|     scaled_features|       rawPrediction|         probability|prediction|
+------+--------------------+--------------------+--------------------+----------+
|   0.0|(16,[7,8,9,10],[0...|[3.15479636889047...|[0.74226546906052...|       0.0|
|   1.0|(16,[7,8,9,10,11,...|[2.91972917595144...|[0.62307831299302...|       0.0|
|   0.0|(16,[7,8,10,11],[...|[4.27732737506947...|[0.95719378288419...|       0.0|
|   0.0|(16,[7,8,10,11,13...|[4.20355946996423...|[0.95462410504392...|       0.0|
|   0.0|(16,[7,8,10,11,12...|[2.98096499218038...|[0.82130217411974...|       0.0|
+------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [47]:
# Score the validation split with the fitted pipeline and preview the prediction columns.
val_prediction = model.transform(validation)
val_prediction.select(*cols_to_show).show(5)

+------+--------------------+--------------------+--------------------+----------+
|labels|     scaled_features|       rawPrediction|         probability|prediction|
+------+--------------------+--------------------+--------------------+----------+
|   0.0|(16,[7,8,9,11],[0...|[3.15048674678314...|[0.74024018494556...|       0.0|
|   0.0|(16,[7,8,9,10],[0...|[3.15479636889047...|[0.74226546906052...|       0.0|
|   0.0|(16,[7,8,11,13,15...|[4.22499472162357...|[0.95565336949326...|       0.0|
|   0.0|(16,[5,7,8,10,11]...|[4.13483238158360...|[0.95118693599229...|       0.0|
|   1.0|(16,[5,7,8,9,10,1...|[2.98351183302904...|[0.66693581266729...|       0.0|
+------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [48]:
# Score the test split with the fitted pipeline and preview the prediction columns.
test_prediction = model.transform(test)
test_prediction.select(*cols_to_show).show(5)

+------+--------------------+--------------------+--------------------+----------+
|labels|     scaled_features|       rawPrediction|         probability|prediction|
+------+--------------------+--------------------+--------------------+----------+
|   1.0|(16,[5,7,8,9,10,1...|[2.91340546775139...|[0.61907113860652...|       0.0|
|   1.0|(16,[5,7,8,9,10],...|[3.00139745067683...|[0.68000620967572...|       0.0|
|   1.0|(16,[1,5,7,8,9,13...|[1.63564638325819...|[0.02352854119603...|       1.0|
|   1.0|(16,[1,5,7,8,9,12...|[2.88689268426310...|[0.60587761918991...|       0.0|
|   1.0|(16,[1,5,7,8,9,10...|[2.83654787744289...|[0.57100856125019...|       0.0|
+------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Evaluate the Predictions

In [49]:
# Keep only the true label and the predicted label of each scored split.
eval_cols = ('labels', 'prediction')
train_Labels_Prediction = train_prediction.select(*eval_cols)
validation_Labels_Prediction = val_prediction.select(*eval_cols)
test_Labels_Prediction = test_prediction.select(*eval_cols)

In [50]:
# Metrics to report:
metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']

# Evaluate each metric on the training, validation and test predictions, in that order.
for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(
        metricName=metric,
        labelCol='labels',
        predictionCol='prediction',
    )

    f_train, f_validation, f_test = [
        evaluator.evaluate(frame)
        for frame in (
            train_Labels_Prediction,
            validation_Labels_Prediction,
            test_Labels_Prediction,
        )
    ]

    print('\nValores de metrica << ' + metric ,'>>' )
    print('Train = ' , f_train)
    print('Validation = ' , f_validation)
    print('Test = ' , f_test)


Valores de metrica << weightedPrecision >>
Train =  0.7137631633863583
Validation =  0.723014400812884
Test =  0.7208873508623836

Valores de metrica << weightedRecall >>
Train =  0.755849460274239
Validation =  0.7665800629016819
Test =  0.7637657802847165

Valores de metrica << accuracy >>
Train =  0.755849460274239
Validation =  0.7665800629016819
Test =  0.7637657802847166


### Guardamos el modelo entrenado:

In [51]:
import os
from os import system  # kept so the names this cell used to define stay available

path = './Models_trained/pipeline_model_Book'

# Save the trained pipeline, replacing any model already stored at `path`.
# The ML writer's overwrite mode replaces the previous `rm -r <path>` shell call,
# which was non-portable, ignored deletion failures, and only checked the local
# filesystem for an existing model.
model.write().overwrite().save( path )

Se ejecutó:  rm -r ./Models_trained/pipeline_model_Book


### Cargamos el modelo entrenado y hacemos una predicción:

In [52]:
# Load the saved pipeline model from `path`:
#from pyspark.ml.classification import MultilayerPerceptronClassificationModel
from pyspark.ml import PipelineModel

modelo_NN = PipelineModel.load( path )

In [53]:
# Score the test split with the reloaded model and preview all resulting columns.
predictions_test = modelo_NN.transform(test)
predictions_test.show(5)

+--------+---+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+--------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------------------+--------------------+--------------------+--------------------+----------+
|     x_1|x_2|x_3|x_4|x_5|x_6|        x_7|x_8|     x_9|   x_10|    x_11|       x_12|         x_13|       x_14|          x_15|        x_16|                x_17|labels|x_1_num|x_9_num|x_11_num|x_12_num|x_13_num|x_14_num|x_15_num|x_16_num|x_17_num|            features|     scaled_features|       rawPrediction|         probability|prediction|
+--------+---+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+--------------+------------+--------------------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------------------+--------------------+--------------------+----------

#### Calculamos la exactitud (accuracy) manualmente:

In [54]:
# Manual accuracy: share of test rows whose predicted label equals the true label.
true_label = predictions_test['labels']
predicted_label = predictions_test['prediction']

wrong_test = predictions_test.filter(true_label != predicted_label).count()
right_test = predictions_test.filter(true_label == predicted_label).count()

total_test = right_test + wrong_test
acc_test = right_test/total_test

print( 'La precisión en las predicciones de los datos de prueba es del',round(acc_test * 100),'%' )

La precisión en las predicciones de los datos de prueba es del 76 %


### Consulta de predicciones:

#### Creamos mapeo de labels:

Crearemos el DataFrame de pandas `labels_map` que contiene la relación entre los labels del dataset original y los labels codificados (índices numéricos) que se usaron para entrenar el modelo.

In [55]:
# Reuse the label indexer that was fitted inside the trained pipeline (Stage_1 is the
# first stage of Stages_list, so its fitted counterpart is model.stages[0]).
# Re-fitting Stage_1 on the full dataset, as was done before, can produce a different
# label -> index assignment than the one the classifier was trained with, because
# StringIndexer orders labels by frequency and `train` is only a subset of `data`.
Scaler_labels = model.stages[0]
data_transform = Scaler_labels.transform(data)
data_transform.show(5)

+-------------+-----------+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+-------------+------------+-------------------+------+
|          x_1|        x_2|x_3|x_4|x_5|x_6|        x_7|x_8|     x_9|   x_10|    x_11|       x_12|         x_13|       x_14|         x_15|        x_16|               x_17|labels|
+-------------+-----------+---+---+---+---+-----------+---+--------+-------+--------+-----------+-------------+-----------+-------------+------------+-------------------+------+
|less than  11|0.026315789|1.0|0.0|0.0|0.0|  4.5819E-4|0.0|Channel1|Device1|Browser1|Entry_Page1|Site_Section1|Promo_Code1|Product_Name1|Search_Term1|Product_Collection1|   0.0|
|less than  11|        0.0|1.0|0.0|0.0|0.0|        0.0|0.0|Channel1|Device1|Browser2|Entry_Page2|Site_Section1|Promo_Code1|Product_Name1|Search_Term1|Product_Collection2|   0.0|
|less than  11|        0.0|1.0|0.0|0.0|0.0|0.002863688|0.0|Channel1|Device1|Browser1|Entry_Page2|Site_Section1

In [56]:
# Build the mapping between the original label values and their indexed counterparts.
label_pairs = data_transform.select(label_col, 'labels').groupBy(label_col, 'labels').count()
labels_map = label_pairs.toPandas().sort_values('labels')

# Label mapping, ordered by index:
labels_map[['labels', label_col]]

Unnamed: 0,labels,x_10
0,0.0,Device1
1,1.0,Device2
3,2.0,Device4
2,3.0,Device3


In [57]:
# Show the feature columns a new record must provide.
feature_cols

['x_1',
 'x_2',
 'x_3',
 'x_4',
 'x_5',
 'x_6',
 'x_7',
 'x_8',
 'x_9',
 'x_11',
 'x_12',
 'x_13',
 'x_14',
 'x_15',
 'x_16',
 'x_17']

### Cerramos sesión Spark

In [60]:
# Shut down the Spark session.
spark.stop()