# Tree Methods Consulting Project

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName('DogFood')
    .getOrCreate()
)
# Read the CSV, taking column names from the header row and letting Spark infer types.
food = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [2]:
# Print the inferred column names and types to check how the CSV was parsed.
food.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [3]:
from pyspark.ml.feature import VectorAssembler

In [4]:
# List the column names; 'Spoiled' is the label, the remaining columns are the features.
food.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [5]:
# Combine the four input columns into the single vector column used as model input.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [6]:
# Apply the assembler to the raw data, producing a frame that includes the 'features' column.
output = assembler.transform(food)

In [7]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

In [9]:
# Both classifiers predict the 'Spoiled' label from the assembled 'features' vector.
# A fixed seed makes the random forest's sampling repeatable, so the feature
# importances can be reproduced after a kernel restart. Outputs recorded from an
# earlier unseeded run may differ slightly from a fresh run.
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)
dtc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [13]:
# Keep only the two columns the classifiers are configured to read: features and label.
food_final = output.select('features', 'Spoiled')

In [14]:
# Fit both tree models on the same data so their feature importances can be compared.
rfc_model = rfc.fit(food_final)
dtc_model = dtc.fit(food_final)

In [16]:
# Show the random forest's per-feature importances. Vector indices follow the
# assembler's input column order: 0='A', 1='B', 2='C', 3='D'.
print('RFC:')
rfc_model.featureImportances

RFC:


SparseVector(4, {0: 0.0193, 1: 0.0203, 2: 0.937, 3: 0.0234})

In [17]:
# Show the single decision tree's per-feature importances. Vector indices follow the
# assembler's input column order: 0='A', 1='B', 2='C', 3='D'; an index absent from the
# sparse output has zero importance.
print('DTC:')
dtc_model.featureImportances

DTC:


SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})