# Preprocess, split data for validation, train, validate and predict

## Data

In [0]:
from pyspark.sql import DataFrame

# Small hand-written sample: one row per beer, with a numeric "bitterness"
# value and a 0/1 "good" label.
beers: DataFrame = spark.createDataFrame(
    data=[
        ("Quinarelle", 70, 1),
        ("Kriek", 10, 0),
        ("Pale Ale", 80, 1),
        ("Kronembourg", 18, 0),
        ("Chouffe", 50, 1),
        ("Edelweiss", 15, 0),
        ("Heineken", 5, 0),
        ("Chimay", 60, 1),
        ("Triple Karmeliet", 50, 1),
        ("Leffe", 30, 1),
    ],
    schema=["name", "bitterness", "good"],
)

# Print the rows so the raw dataset can be inspected.
beers.show()

## Preprocessing

In [0]:
from pyspark.ml.feature import VectorAssembler

# Pack the "bitterness" column into a single vector column named "features".
feature_assembler: VectorAssembler = VectorAssembler(
    inputCols=["bitterness"],
    outputCol="features",
)
ready_beers: DataFrame = feature_assembler.transform(beers)

# Print the result, which now carries the extra "features" column.
ready_beers.show()

## Split data for validation

In [0]:
# Randomly split the prepared rows into two sets using equal weights; the seed
# is fixed so the same split can be requested again.
training_set, test_set = ready_beers.randomSplit(weights=[0.5, 0.5], seed=90)

In [0]:
# Print the rows that ended up in the training split.
training_set.show()

In [0]:
# Print the rows that ended up in the test split.
test_set.show()

## Training

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel

# Configure a decision-tree classifier that reads the "features" vector column
# and learns to predict the "good" label.
decision_tree_builder: DecisionTreeClassifier = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="good",
)

# Fit the classifier on the training split only.
model: DecisionTreeClassificationModel = decision_tree_builder.fit(dataset=training_set)

## Predictions

In [0]:
# Apply the fitted model to the test split.
predictions: DataFrame = model.transform(dataset=test_set)

# Print the test rows together with the columns the model appended.
predictions.show()

## Evaluation

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the predictions by comparing the model's "prediction" column (the
# classifier's default output column) against the true "good" label.
evaluator: MulticlassClassificationEvaluator = MulticlassClassificationEvaluator(
    labelCol="good", predictionCol="prediction", metricName="accuracy")

# evaluate() returns the requested metric (here: accuracy) as a float.
accuracy: float = evaluator.evaluate(predictions)

print(accuracy)