# Tres métodos basados en árboles

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('arboles').getOrCreate()

Cargar los datos

In [0]:
ruta = 'dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/College.csv'

In [0]:
datos = spark.read.csv(ruta,inferSchema=True,header=True)

In [0]:
datos.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [0]:
datos.head()

Out[5]: Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

Formateo de datos para Spark

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
datos.columns

Out[7]: ['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [0]:
ensamblador = VectorAssembler(
    inputCols = ['Apps',
                 'Accept',
                 'Enroll',
                 'Top10perc',
                 'Top25perc',
                 'F_Undergrad',
                 'P_Undergrad',
                 'Outstate',
                 'Room_Board',
                 'Books',
                 'Personal',
                 'PhD',
                 'Terminal',
                 'S_F_Ratio',
                 'perc_alumni',
                 'Expend',
                 'Grad_Rate'],
    outputCol="caracteristicas"
)

In [0]:
salida = ensamblador.transform(datos)

Adaptar la columna 'Private' que tiene valores 'yes' o  'no'

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexador = StringIndexer(inputCol='Private',outputCol='IndicePrivado')

In [0]:
salida_adaptada = indexador.fit(salida).transform(salida)

In [0]:
datos_finales = salida_adaptada.select('caracteristicas','IndicePrivado')

In [0]:
datos_finales.show(3)

+--------------------+-------------+
|     caracteristicas|IndicePrivado|
+--------------------+-------------+
|[1660.0,1232.0,72...|          0.0|
|[2186.0,1924.0,51...|          0.0|
|[1428.0,1097.0,33...|          0.0|
+--------------------+-------------+
only showing top 3 rows



Dividir los datos en conjunto de entrenamiento y prueba

In [0]:
datos_entrenamiento, datos_prueba = datos_finales.randomSplit([0.7,0.3])

Crear los modelos

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

In [0]:
ad  = DecisionTreeClassifier(labelCol='IndicePrivado',featuresCol='caracteristicas')
ba  = RandomForestClassifier(labelCol='IndicePrivado',featuresCol='caracteristicas')
gbt = GBTClassifier(labelCol='IndicePrivado',featuresCol='caracteristicas')

Entrenar los modelos

In [0]:
modelo_ad  = ad.fit(datos_prueba)
modelo_ba  = ba.fit(datos_prueba)
modelo_gbt = gbt.fit(datos_prueba)

Comparar los modelos

In [0]:
predicciones_ad  = modelo_ad.transform(datos_prueba)
predicciones_ba  = modelo_ba.transform(datos_prueba)
predicciones_gbt = modelo_gbt.transform(datos_prueba)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
eval_exactitud = MulticlassClassificationEvaluator(labelCol='IndicePrivado',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')

In [0]:
ad_exa = eval_exactitud.evaluate(predicciones_ad)
ba_exa = eval_exactitud.evaluate(predicciones_ba)
gbt_exa = eval_exactitud.evaluate(predicciones_gbt)

In [0]:
print("RESULTADOS")
print('-'*80)
print("Árbol de decisión simple. Exactitud %2.2f%%"%(ad_exa*100))
print('-'*80)
print("Bosque Aleatorio. Exactitud %2.2f%%"%(ba_exa*100))
print('-'*80)
print("Árbol Potenciado con Gradiente. Exactitud %2.2f%%"%(gbt_exa*100))

RESULTADOS
--------------------------------------------------------------------------------
Árbol de decisión simple. Exactitud 97.77%
--------------------------------------------------------------------------------
Bosque Aleatorio. Exactitud 99.11%
--------------------------------------------------------------------------------
Árbol Potenciado con Gradiente. Exactitud 100.00%
