In [1]:
from pyspark.sql import SparkSession
# Start a Spark session for this analysis, or reuse one that is already running.
session_builder = SparkSession.builder.appName('consult_project')
spark = session_builder.getOrCreate()

In [2]:
# Load the CSV, treating the first line as column names and letting Spark
# infer each column's type from the values.
data = spark.read.options(header=True, inferSchema=True).csv('dog_food.csv')

In [3]:
# Inspect the column names and the types Spark inferred for them.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [4]:
# Peek at the first row to sanity-check the parsed values.
data.head(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
# List the column names so the feature columns can be picked out from the label.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [7]:
# Pack the four input columns into a single vector column named 'features'.
# The list order fixes the vector layout: index 0 = A, 1 = B, 2 = C, 3 = D.
feature_columns = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)

In [8]:
# Apply the assembler: yields a new DataFrame with the original columns plus 'features'.
output = assembler.transform(data)

In [9]:
# Preview the first five rows, including the newly assembled 'features' column.
output.show(5)

+---+---+----+---+-------+------------------+
|  A|  B|   C|  D|Spoiled|          features|
+---+---+----+---+-------+------------------+
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0|[5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0|[6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0|[4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
+---+---+----+---+-------+------------------+
only showing top 5 rows



In [10]:
from pyspark.ml.classification import RandomForestClassifier

In [11]:
# Random forest that reads the assembled 'features' vector and predicts the
# 'Spoiled' label. Training involves random sampling, so an explicit seed is
# set to make the fitted model and its featureImportances repeatable across
# kernel restarts.
# NOTE: outputs recorded in this notebook came from an unseeded run; exact
# importance values may shift slightly the next time the notebook is executed.
rfc = RandomForestClassifier(featuresCol='features', labelCol='Spoiled', seed=42)

In [12]:
# Keep only what the classifier consumes: the feature vector and the label.
model_columns = ['features', 'Spoiled']
final_data = output.select(model_columns)

In [14]:
# Confirm that only the 'features' and 'Spoiled' columns remain.
final_data.show(5)

+------------------+-------+
|          features|Spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
|[5.0,6.0,12.0,7.0]|    1.0|
|[6.0,2.0,13.0,6.0]|    1.0|
|[4.0,2.0,12.0,1.0]|    1.0|
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 5 rows



In [15]:
# Fit the random forest on the full dataset; the goal here is to inspect
# feature importances rather than to evaluate predictions, so no split is made.
model = rfc.fit(final_data)

In [17]:
# Re-check one row to recall the layout of the feature vector ([A, B, C, D]).
final_data.head(1)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

In [18]:
# Importance score for each feature; vector indices follow the assembler's
# inputCols order (0 = A, 1 = B, 2 = C, 3 = D).
model.featureImportances

SparseVector(4, {0: 0.0179, 1: 0.0174, 2: 0.9367, 3: 0.028})

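Feature C (index 2 in the feature vector) has by far the highest importance, roughly 0.94 versus under 0.03 for each of A, B, and D, so C is the feature most strongly associated with spoilage.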