In [44]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dogfood').getOrCreate()

In [45]:
df = spark.read.csv('dog_food.csv', header=True, inferSchema=True)

In [46]:
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [47]:
df.head(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [48]:
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [49]:
from pyspark.ml.feature import VectorAssembler

In [50]:
from pyspark.sql.functions import col

In [51]:
df = df.select(col("Spoiled").alias("label"),col("A").alias("A"),col("B").alias("B"),col("C").alias("C"),col("D").alias("D"))

In [52]:
df.columns

['label', 'A', 'B', 'C', 'D']

In [53]:
assembler = VectorAssembler(
    inputCols=[
        'A','B','C','D'],
    outputCol='features')

In [54]:
output = assembler.transform(df)

final_data = output.select(['features','label'])

In [55]:
train,test = final_data.randomSplit([0.7, 0.3])

In [56]:
train.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                330|
|   mean|  0.296969696969697|
| stddev|0.45761695964387855|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [57]:
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                       DecisionTreeClassifier)

In [58]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees = 100)
gbt = GBTClassifier()

In [59]:
dtc_model = dtc.fit(train)
rfc_model = rfc.fit(train)
gbt_model = gbt.fit(train)

In [60]:
dtc_preds = dtc_model.transform(test)
rfc_preds = rfc_model.transform(test)
gbt_preds = gbt_model.transform(test)

In [61]:
dtc_preds.show()

+-------------------+-----+-------------+--------------------+----------+
|           features|label|rawPrediction|         probability|prediction|
+-------------------+-----+-------------+--------------------+----------+
| [1.0,1.0,12.0,4.0]|  1.0|   [0.0,77.0]|           [0.0,1.0]|       1.0|
|  [1.0,2.0,9.0,1.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,2.0,9.0,4.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,3.0,8.0,5.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,4.0,9.0,3.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,5.0,8.0,5.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
| [1.0,5.0,8.0,10.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,6.0,8.0,9.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,7.0,8.0,4.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,8.0,8.0,6.0]|  0.0|  [215.0,1.0]|[0.99537037037037...|       0.0|
|  [1.0,9.0,7.0,4.0]|  0.0|  [215.0,1.

In [62]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

In [63]:
print(f'DTC: {evaluator.evaluate(dtc_preds)}') 
print(f'RFC: {evaluator.evaluate(rfc_preds)}')
print(f'GBT: {evaluator.evaluate(gbt_preds)}')

DTC: 0.975
RFC: 0.9875
GBT: 0.975
