# Intro ML SUP en BD

Creación de la sesión Spark:

In [1]:
#import SparkSession
import findspark
findspark.init('/opt/spark')
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf 
from pyspark.sql.types import DateType

In [2]:
# Crear el spark session object, llamarle "supervised_ml"
session=SparkSession.builder.appName('supervised_ml').getOrCreate()


## Regression 

Carga de datos, archivo *Linear_regression_dataset.csv*:

In [3]:
# Carga de datos
df=session.read.options().csv('Linear_regression_dataset.csv',header=True,inferSchema =True)

Se invocan las librerias correcpondientes a **LinearRegression**, asi como las de OneHotEncoder, StringIndexer, VectorAssembler:

In [4]:
# Importacion de libs y operaciones
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler


# from pyspark.ml.feature import OneHotEncoderEstimator, OneHotEncoderModel


Se visualizan algunos datos:

In [5]:
print((df.count(), len(df.columns))) 

(1232, 6)


Se muestran los primeros 10 datos:

In [6]:
# primeros 10 datos
df.show(10)



+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



## Feature Engineering

Creamos un solo vector con todos los features i.e 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', a este le llamaremos "features" y como salida colocamos a 'label':

In [7]:
# Vector Ensamblador

df_assembler = VectorAssembler(inputCols=['var_1',
 'var_2',
 'var_3',
 'var_4',
 'var_5'], outputCol="features")
df = df_assembler.transform(df)


In [8]:
# visulizacion de vector ensamblado compuesto por features y label
df = df.select(['features','label'])

Partimos a continuación el set de datos en 75% training y 25% testing:

In [9]:
# Particion del data set
train,test = df.randomSplit([0.75,0.25])

print(f"Size of train Dataset : {train.count()}" )
print(f"Size of test Dataset : {test.count()}" )

Size of train Dataset : 906
Size of test Dataset : 326


Creamos el Regresor Lineal: 

In [10]:
lr = LinearRegression()

Entrenamos el modelo de regresión lineal:

In [11]:
# Fit the model, le llamamos lr_model
lr_model=lr.fit(train)

Creamos el dataframe de prediciones (*predictions_df*) a partir del modelo de entrenamiento y el conjunto de datos test: 

In [12]:
predictions_df = lr_model.transform(test)

Visualizamos el contenido de *predictions_df*:

In [13]:
# visulizacion de predictions_df
predictions_df.show(10)


+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[470.0,509.0,76.0...|0.319| 0.3124470912652623|
|[486.0,610.0,61.0...|0.332| 0.3188700936081938|
|[498.0,672.0,61.0...|0.325|0.33246577054852616|
|[510.0,588.0,72.0...|0.317| 0.3239170187040888|
|[514.0,549.0,81.0...|0.339| 0.3296600659659841|
|[516.0,504.0,86.0...|0.327|0.32934075594181794|
|[524.0,665.0,65.0...|0.336|0.33536852477822154|
|[533.0,660.0,62.0...| 0.33| 0.3357851499360977|
|[536.0,531.0,83.0...|0.318| 0.3274819477544099|
|[537.0,660.0,63.0...|0.326| 0.3283774763638284|
+--------------------+-----+-------------------+
only showing top 10 rows



Ahora, evaluamos el modelo de Regresión Lineal, con los datos de TEST:

In [14]:
# evaluacion del modelo, le llamaremos model_predictions
model_predictions = lr_model.evaluate(test)
model_predictions

<pyspark.ml.regression.LinearRegressionSummary at 0x10ff5fac8>

Imprimimos el valor de R2:

In [15]:
# valor de R2
model_predictions.r2


0.8771081469828823

Imprimimos el valor del meanSquaredError:

In [16]:
# valor del meanSquaredError
model_predictions.meanSquaredError


0.0001425956425761919

## Regresión con Árboles de Decisión

Importamos la librería *DecisionTreeRegressor*: 

In [17]:
# import lib
from pyspark.ml.regression import DecisionTreeRegressor


Creamos el Regresor DT, le llamaremos *dec_tree*:

In [18]:
# dec_tree
dec_tree = DecisionTreeRegressor()


Entrenamos el modelo:

In [19]:
# Train model, le llamaremos dec_tree_model
dec_tree_model=dec_tree.fit(train)
dec_tree_model

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4a3c2b5b78a2) of depth 5 with 63 nodes

Cuánto es la profundidad máxima por defecto, de este algoritmo?

R/ 5

Desplegamos las *featureImportances*:

In [20]:
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9586, 1: 0.0203, 2: 0.0003, 3: 0.0039, 4: 0.0169})

Evaluamos el modelo con los datos de entrenamiento:

In [25]:
# Make predictions, le llamaremos model_predictions 
model_predictions = dec_tree_model.transform(train)

In [22]:
# visualizamos

model_predictions.show()

+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[463.0,527.0,67.0...|0.311|0.31923076923076926|
|[464.0,640.0,66.0...|0.301|0.31923076923076926|
|[468.0,746.0,52.0...|0.329| 0.3328333333333333|
|[473.0,499.0,73.0...|0.315|0.31923076923076926|
|[495.0,628.0,66.0...|0.315|0.31923076923076926|
|[495.0,752.0,50.0...|0.327| 0.3328333333333333|
|[498.0,615.0,67.0...|0.318|0.31923076923076926|
|[501.0,774.0,51.0...|0.315| 0.3328333333333333|
|[511.0,576.0,76.0...|0.329|0.31923076923076926|
|[513.0,698.0,61.0...|0.339| 0.3454999999999999|
|[519.0,595.0,73.0...|0.332|0.31923076923076926|
|[522.0,621.0,72.0...|0.317|0.31923076923076926|
|[527.0,569.0,75.0...|0.341|0.33999999999999986|
|[528.0,652.0,71.0...|0.319|0.31923076923076926|
|[531.0,491.0,89.0...| 0.32|0.31923076923076926|
|[531.0,734.0,55.0...| 0.34| 0.3454999999999999|
|[532.0,690.0,69.0...|0.351| 0.3454999999999999|
|[534.0,609.0,69.0..

Importamos el **RegressionEvaluator**

In [26]:
# import Evaluator
from pyspark.ml.evaluation import RegressionEvaluator



Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [29]:
# R2 value of the model on test data 
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE value of the model on test data 
dt_evaluatorRMSE = RegressionEvaluator(metricName='rmse')
dt_RMSE = dt_evaluatorRMSE.evaluate(model_predictions)
print(f'The RMSE value of DecisionTreeRegressor is {dt_RMSE}')

The r-square value of DecisionTreeRegressor is 0.8710842150545587
The RMSE value of DecisionTreeRegressor is 0.011833144819992923


## RandomForestRegressor

Importamos a *RandomForestRegressor*

In [30]:
# import lib
from pyspark.ml.regression import RandomForestRegressor

Creamos el Regresor RF:

In [31]:
# Regresor 
Rfr = RandomForestRegressor()


Entrenamos el modelo:

In [32]:
# Train model, le llamaremos rf_model
rf_model=Rfr.fit(train)


Desplegamos las *featureImportances*:

In [33]:
# importances 
rf_model.featureImportances



SparseVector(5, {0: 0.5837, 1: 0.0436, 2: 0.0153, 3: 0.269, 4: 0.0884})

Desplegamos el numero de arboles (Num of Trees)

In [34]:
# Numero de Trees
rf_model.numTrees



Param(parent='RandomForestRegressor_93d71a3fe7c1', name='numTrees', doc='Number of trees to train (>= 1)')

Evaluamos el modelo con los datos de entrenamiento, le llamaremos model_predictions:


In [35]:
# model_predictions
model_predictions = rf_model.transform(train)

Desplegamos los valores del *model_predictions*

In [36]:
model_predictions.show()

+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[463.0,527.0,67.0...|0.311|  0.323144625915751|
|[464.0,640.0,66.0...|0.301|0.32634411309523814|
|[468.0,746.0,52.0...|0.329| 0.3314525871212121|
|[473.0,499.0,73.0...|0.315|0.32395462591575097|
|[495.0,628.0,66.0...|0.315| 0.3238979592490843|
|[495.0,752.0,50.0...|0.327| 0.3325405871212121|
|[498.0,615.0,67.0...|0.318| 0.3238979592490843|
|[501.0,774.0,51.0...|0.315| 0.3325405871212121|
|[511.0,576.0,76.0...|0.329| 0.3274458759157509|
|[513.0,698.0,61.0...|0.339| 0.3401561907208477|
|[519.0,595.0,73.0...|0.332|0.33670635064712834|
|[522.0,621.0,72.0...|0.317| 0.3247079592490843|
|[527.0,569.0,75.0...|0.341| 0.3379772176106246|
|[528.0,652.0,71.0...|0.319|0.33080809732221805|
|[531.0,491.0,89.0...| 0.32| 0.3282796259157509|
|[531.0,734.0,55.0...| 0.34| 0.3406594249865819|
|[532.0,690.0,69.0...|0.351| 0.3488143272234122|
|[534.0,609.0,69.0..

Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [37]:
# R2 value of the model on test data 
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE value of the model on test data 
dt_evaluatorRMSE = RegressionEvaluator(metricName='rmse')
dt_RMSE = dt_evaluatorRMSE.evaluate(model_predictions)
print(f'The RMSE value of DecisionTreeRegressor is {dt_RMSE}')

The r-square value of DecisionTreeRegressor is 0.869893866063758
The RMSE value of DecisionTreeRegressor is 0.01188765019469272


## Gradient-Boosted Tree Regressor

Importamos a GBTRegressor


In [38]:
# import
from pyspark.ml.regression import GBTRegressor

Creamos el Regresor GBTR, le llamaremos gbt:


In [39]:
# regresor
gbt = GBTRegressor()


Entrenamos el modelo:

In [40]:
# Train model, le llamaremos gbt_model
gbt_model=gbt.fit(train)




Desplegamos las featureImportances:

In [41]:
#Importances
gbt_model.featureImportances



SparseVector(5, {0: 0.2557, 1: 0.148, 2: 0.1907, 3: 0.2282, 4: 0.1774})

Evaluamos el modelo con los datos de entrenamiento, le llamaremos model_predictions:

In [42]:
# model_predictions
model_predictions = gbt_model.transform(train)


Desplegamos los valores del *model_predictions*

In [43]:
# show 
model_predictions.show()



+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[463.0,527.0,67.0...|0.311|0.31484423875591977|
|[464.0,640.0,66.0...|0.301| 0.3098513166111605|
|[468.0,746.0,52.0...|0.329|0.33055495697662485|
|[473.0,499.0,73.0...|0.315| 0.3175020365100298|
|[495.0,628.0,66.0...|0.315| 0.3156029371750579|
|[495.0,752.0,50.0...|0.327|0.33055495697662485|
|[498.0,615.0,67.0...|0.318|0.31927624562126766|
|[501.0,774.0,51.0...|0.315| 0.3308927794818314|
|[511.0,576.0,76.0...|0.329|0.32460156134438206|
|[513.0,698.0,61.0...|0.339|0.34258611626571517|
|[519.0,595.0,73.0...|0.332| 0.3232994384240227|
|[522.0,621.0,72.0...|0.317| 0.3206269651511555|
|[527.0,569.0,75.0...|0.341|0.34327576184871855|
|[528.0,652.0,71.0...|0.319| 0.3207163035434285|
|[531.0,491.0,89.0...| 0.32|0.31984856541648965|
|[531.0,734.0,55.0...| 0.34|0.34322162364329145|
|[532.0,690.0,69.0...|0.351| 0.3450123117642031|
|[534.0,609.0,69.0..

Usando RegressionEvaluator calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [44]:
#Select (prediction, true label) and compute test error
# R2 value of the model on test data 
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE value of the model on test data 
dt_evaluatorRMSE = RegressionEvaluator(metricName='rmse')
dt_RMSE = dt_evaluatorRMSE.evaluate(model_predictions)
print(f'The RMSE value of DecisionTreeRegressor is {dt_RMSE}')


The r-square value of DecisionTreeRegressor is 0.9306579588216147
The RMSE value of DecisionTreeRegressor is 0.008678515466733636


 ## Exploracion de datos...

Usaremos el dataset https://archive.ics.uci.edu/ml/datasets/Bank+Marketing 

Indique a grandes razgos de que se trata este dataset:


Carga de datos, archivo bank_data.csv:


In [46]:
# Load csv Dataset 
df=session.read.csv('bank_data.csv',inferSchema=True,header=True)

Determine la cantidad de datos en el dataset:

In [47]:
#number of records
df.count()




41188

A que dato corresponde cada columna?

In [54]:
# columns values
df.limit(10).toPandas().head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target_class
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


Imprima el Schema:

In [49]:
#dataype of input data - Schema
df.printSchema()



root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- target_class: string (nullable = true)



En cuanto a la salida, como es la distrubución de clases?

In [59]:
# YES/NO Class Distribution
yes=df.filter(df['target_class']=="yes").count()
no=df.filter(df['target_class']=="no").count()
print("Yes:",yes," No:",no)




Yes: 4640  No: 36548


La distribución no se encuentra balanceada, debido a que existen una gran cantidad de rilas para la clase "No". 

Una tarea típica, resulta de convertir los valores binarios en 1 y 0, usando como referencia "label", convierta los no/yes en 0/1:

In [57]:
from pyspark.sql import functions as F
from pyspark.sql import *

In [67]:
# Ingrese acá la instrucción: 
from pyspark.sql import functions as F
df = df.withColumn('label',F.when((col("target_class") == "yes"),1).otherwise(0))



In [69]:
# New 1/0 Class Distribution

yes=df.filter(df['label']==1).count()
no=df.filter(df['label']==0).count()
print("Yes:",yes," No:",no)

Yes: 4640  No: 36548


A continuación se presenta un ejercicio de Deep Learning para su revisión...

# Deep Learning 

Importamos las librerias necesarias:

In [70]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer

Inicializamos la sesion SPARK:

In [71]:
spark = SparkSession.builder.appName('deep_learning').getOrCreate()

Leemos el dataset:

In [72]:
data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)

In [73]:
data.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- Orders_Normalized: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



Renombramos la columna TARGET:

In [74]:
data = data.withColumnRenamed('Orders_Normalized', 'label')

In [75]:
data.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



Partimos lo datos en Train, Validation y Test:

In [89]:
train, validation, test  = data.randomSplit([0.6, 0.3, 0.1], 1234)

Construimos el Pipeline

In [90]:
categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]
numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]

indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]

featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol="features")

layers = [len(featuresCreator.getInputCols()), 8, 4, 4]

classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)

pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])

Entrenamos...

In [91]:
model = pipeline.fit(train)

Validamos y Evaluamos

In [92]:
train_output_df = model.transform(train)
validation_output_df = model.transform(validation)
test_output_df = model.transform(test)

Llevamos a cabo, algunas predicciones:

In [93]:
train_predictionAndLabels = train_output_df.select("prediction", "label")
validation_predictionAndLabels = validation_output_df.select("prediction", "label")
test_predictionAndLabels = test_output_df.select("prediction", "label")

metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']

for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))
    print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))
    print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))

Train weightedPrecision = 0.9795794395360935
Validation weightedPrecision = 0.9789308008148192
Test weightedPrecision = 0.982346997695827
Train weightedRecall = 0.9793233509804367
Validation weightedRecall = 0.9787243673894346
Test weightedRecall = 0.9821259309410968
Train accuracy = 0.9793233509804367
Validation accuracy = 0.9787243673894346
Test accuracy = 0.9821259309410968


Puede mejorar el test accuracy del modelo variando alguno de los hyperparametros?

Resultado Mejorado: 

### Datos cambiados:

data.randomSplit([0.6, 0.3, 0.1], 1234)

Para esta caso se incremento el dato de validacion en un 30%

Ademas se agregaron el doble de neuronas por cada layer