# Descomposición de Comida para Perros

In [0]:
from pyspark.sql import SparkSession
# Get the active Spark session, or create one named 'comida' if none exists yet.
spark = (
    SparkSession.builder
    .appName('comida')
    .getOrCreate()
)

In [0]:
# DBFS location of the dog-food CSV that the following cells load.
# NOTE(review): this is a user-specific upload path; confirm it exists in the
# target workspace before running the notebook.
ruta = 'dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/dog_food.csv'

In [0]:
# Load the CSV at `ruta`: the first row supplies the column names and Spark
# infers each column's type from the data.
datos = spark.read.csv(
    ruta,
    header=True,
    inferSchema=True,
)

In [0]:
# Show the inferred column names and types to verify the CSV was parsed as expected.
datos.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [0]:
# Peek at the first row to sanity-check the parsed values.
datos.head()

Out[5]: Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)

In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column.
datos.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names so the feature columns can be chosen for the assembler.
datos.columns

Out[8]: ['A', 'B', 'C', 'D', 'Spoiled']

In [0]:
# Columns A-D are the model inputs; 'Spoiled' is the label and is left out.
columnas_entrada = ['A', 'B', 'C', 'D']

# Combine the input columns into a single vector column named "caracteristicas".
ensamblador = VectorAssembler(inputCols=columnas_entrada, outputCol="caracteristicas")

In [0]:
# Apply the assembler: the result keeps the original columns and adds the
# `caracteristicas` vector column.
salida = ensamblador.transform(datos)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Decision tree that predicts 'Spoiled' from the assembled feature vector.
ad = DecisionTreeClassifier(
    featuresCol='caracteristicas',
    labelCol='Spoiled',
)

In [0]:
# Confirm that the transform added the `caracteristicas` vector column.
salida.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- caracteristicas: vector (nullable = true)



In [0]:
# Keep only what the classifier needs: the feature vector and the label.
columnas_modelo = ['caracteristicas', 'Spoiled']
datos_finales = salida.select(columnas_modelo)

In [0]:
# Inspect one row of the model input: a feature vector plus the Spoiled label.
datos_finales.head()

Out[16]: Row(caracteristicas=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)

In [0]:
# Fit the tree on the full dataset (no train/test split): the cells shown here
# only inspect feature importances and make no predictions.
modelo_ad = ad.fit(datos_finales)

In [0]:
# Importance of each feature, indexed in the assembler's inputCols order
# (0 = A, 1 = B, 2 = C, 3 = D).
modelo_ad.featureImportances

Out[18]: SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

La característica en el índice 2 (Compuesto Químico C) es, con una importancia de 0.9832, claramente la más importante, lo que indica que este compuesto es el que está causando la descomposición temprana del alimento.