In [1]:
import findspark
findspark.init()
import pyspark

from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier,DecisionTreeClassifier,RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark=spark = SparkSession \
    .builder \
    .appName("US_accidents") \
    .getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000196936A2DF0>


In [2]:
sev1 = spark.read.options(header="True", InferSchema="True", nullValue="null" ).csv("dataset/severity/Severity=1/part-00000-79480ebc-ce62-4cda-b67d-03599fb35b85.c000.csv")
sev2 = spark.read.options(header="True", InferSchema="True", nullValue="null" ).csv("dataset/severity/Severity=2/part-00000-79480ebc-ce62-4cda-b67d-03599fb35b85.c000.csv")
sev3 = spark.read.options(header="True", InferSchema="True", nullValue="null" ).csv("dataset/severity/Severity=3/part-00000-79480ebc-ce62-4cda-b67d-03599fb35b85.c000.csv")
sev4 = spark.read.options(header="True", InferSchema="True", nullValue="null" ).csv("dataset/severity/Severity=4/part-00000-79480ebc-ce62-4cda-b67d-03599fb35b85.c000.csv")
sev1=sev1.withColumn("Severity", lit(1)).limit(2000)#max 25499
sev2=sev2.withColumn("Severity", lit(2)).limit(2000)
sev3=sev3.withColumn("Severity", lit(3)).limit(2000)
sev4=sev4.withColumn("Severity", lit(4)).limit(2000)
df=sev1.union(sev2).union(sev3).union(sev4)

In [3]:
newdf=df.groupby("Severity").count().show()

+--------+-----+
|Severity|count|
+--------+-----+
|       1| 2000|
|       2| 2000|
|       3| 2000|
|       4| 2000|
+--------+-----+



In [4]:
#hot one encoder
inputIndexer=["Street","Side","City","County","State","Zipcode","Wind_Direction",
              "Weather_Condition"]

outputIndexer=["Streetindex","Sideindex","Cityindex","Countyindex","Stateindex","Zipcodeindex",
               "Wind_Directionindex","Weather_Conditionindex"]

outputEncoder=["Streetfinal","Sidefinal","Cityfinal","Countyfinal","Statefinal","Zipcodefinal",
              "Wind_Directionfinal","Weather_Conditionfinal"]

numericCols = ["Start_Lat", "Start_Lng", "Distance(mi)", "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
               "Wind_Speed(mph)","Precipitation(in)","Amenity","Crossing","Give_Way","Junction","No_Exit","Railway","Roundabout",
               "Station","Stop","Traffic_Calming","Traffic_Signal","dayofweek","year","month","dayofmonth","hour","Duration"]

#create a string indexer
indexer = StringIndexer(inputCols=inputIndexer, outputCols=outputIndexer, handleInvalid="error")

#create onehotencoder
#encoder = OneHotEncoder(inputCols=outputIndexer, outputCols=outputEncoder)

#make a vectorassembler(for spark all the features must be in one column this joins all the columns in one)
vector_assembler=VectorAssembler(inputCols=numericCols+outputIndexer, outputCol="features")

#featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

In [5]:
#building the pipeline
from pyspark.ml import Pipeline 
pipe= Pipeline(stages=[indexer, vector_assembler])
#, encoder

In [6]:
#fit and transform the data
newdf = pipe.fit(df).transform(df)
newdf=newdf.select(["features", "Severity"])

In [7]:
#split the data into training and test sets
training, test = newdf.randomSplit([.7, .3])

In [8]:
#calculate the correlation between two features
df.stat.corr('Severity','Distance(mi)')

0.1971507171927432

In [9]:
test.show(1)

+--------------------+--------+
|            features|Severity|
+--------------------+--------+
|(34,[0,1,2,3,4,5,...|       1|
+--------------------+--------+
only showing top 1 row



In [10]:
#logistic regression:::
# Fit the model on the training data
logistic = LogisticRegression(labelCol="Severity", featuresCol="features", maxIter=10, regParam=0, elasticNetParam=0.8).fit(training)
# Create predictions for the testing data 
predictionlogistic = logistic.transform(test)

In [11]:
dt = DecisionTreeClassifier(labelCol="Severity", featuresCol="features", maxBins=10257).fit(training)

# Make predictions.
predictionsdt = dt.transform(test)

In [12]:
rf = RandomForestClassifier(labelCol="Severity", featuresCol="features", numTrees=10,maxDepth=8, maxBins=10257).fit(training)

# Make predictions.
predictionsrf = rf.transform(test)

In [13]:
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator#evaluate the models
#Evaluating the model
#forest:::

predictionsrf = predictionsrf.select("prediction", "Severity")
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol= "Severity")

TPRseverity1=evaluator.evaluate(predictionsrf, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 1})
TPRseverity2=evaluator.evaluate(predictionsrf, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 2})
TPRseverity3=evaluator.evaluate(predictionsrf, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 3})
TPRseverity4=evaluator.evaluate(predictionsrf, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 4})

FPRseverity1=evaluator.evaluate(predictionsrf, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 1})
FPRseverity2=evaluator.evaluate(predictionsrf, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 2})
FPRseverity3=evaluator.evaluate(predictionsrf, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 3})
FPRseverity4=evaluator.evaluate(predictionsrf, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 4})

fMeasureseverity1=evaluator.evaluate(predictionsrf, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 1})
fMeasureseverity2=evaluator.evaluate(predictionsrf, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 2})
fMeasureseverity3=evaluator.evaluate(predictionsrf, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 3})
fMeasureseverity4=evaluator.evaluate(predictionsrf, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 4})

accuracy=evaluator.evaluate(predictionsrf, {evaluator.metricName: "accuracy"})
f1=evaluator.evaluate(predictionsrf, {evaluator.metricName: "f1"})
weightedPrecision=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedPrecision"})
weightedRecall=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedRecall"})
weightedTruePositiveRate=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedTruePositiveRate"})
weightedFalsePositiveRate=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedFalsePositiveRate"})
weightedFMeasure=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedFMeasure"})#
hammingLoss=evaluator.evaluate(predictionsrf, {evaluator.metricName: "hammingLoss"})
print("########### Random Forest Classifier Evaluation ###########\n")
print("True positive rate for severity:1 = " + str(TPRseverity1))
print("True positive rate for severity:2 = " + str(TPRseverity2))
print("True positive rate for severity:3 = " + str(TPRseverity3))
print("True positive rate for severity:4 = " + str(TPRseverity4))

print("\nFalse positive rate for severity:1 = " + str(FPRseverity1))
print("False positive rate for severity:2 = " + str(FPRseverity2))
print("False positive rate for severity:3 = " + str(FPRseverity3))
print("False positive rate for severity:4 = " + str(FPRseverity4))

print("\nfMeasure for severity:1 = " + str(fMeasureseverity1))
print("fMeasure for severity:2 = " + str(fMeasureseverity2))
print("fMeasure for severity:3 = " + str(fMeasureseverity3))
print("fMeasure for severity:4 = " + str(fMeasureseverity4))


print("\nf1 Score: %s\nFalse Positive Rate %s\nTrue Positive Rate: %s\nPrecision: %s\nHamming Loss:%s"
      % (f1, weightedFalsePositiveRate, weightedTruePositiveRate, weightedPrecision, hammingLoss))

########### Random Forest Classifier Evaluation ###########

True positive rate for severity:1 = 0.9275590551181102
True positive rate for severity:2 = 0.7445742904841403
True positive rate for severity:3 = 0.6830065359477124
True positive rate for severity:4 = 0.5054545454545455

False positive rate for severity:1 = 0.0704145371947757
False positive rate for severity:2 = 0.06622148024485253
False positive rate for severity:3 = 0.12275784753363228
False positive rate for severity:4 = 0.10996749729144095

fMeasure for severity:1 = 0.8738872403560831
fMeasure for severity:2 = 0.7663230240549829
fMeasure for severity:3 = 0.6693354683746998
fMeasure for severity:4 = 0.5392822502424831

f1 Score: 0.7179400806818571
False Positive RateR: 0.09181545241495188
True Positive Rate: 0.7224540901502504
Precision: 0.716560329622679
Hamming Loss:0.2775459098497496


In [38]:
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator#evaluate the models
#Evaluating the model
#Logistic Regression:::

#predictionlogistic.groupBy("Severity", "prediction").count().show()
predictionAndLabelslr = predictionlogistic.select("prediction", "Severity")
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol= "Severity")

TPRseverity1=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 1})
TPRseverity2=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 2})
TPRseverity3=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 3})
TPRseverity4=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 4})

FPRseverity1=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 1})
FPRseverity2=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 2})
FPRseverity3=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 3})
FPRseverity4=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 4})

fMeasureseverity1=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 1})
fMeasureseverity2=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 2})
fMeasureseverity3=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 3})
fMeasureseverity4=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 4})

accuracy=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "accuracy"})
f1=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "f1"})
weightedPrecision=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedPrecision"})
weightedRecall=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedRecall"})
weightedTruePositiveRate=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedTruePositiveRate"})
weightedFalsePositiveRate=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedFalsePositiveRate"})
weightedFMeasure=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedFMeasure"})#
hammingLoss=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "hammingLoss"})
print("########### Logistoc Regression Evaluation ###########\n")

print("True positive rate for severity:1 = " + str(TPRseverity1))
print("True positive rate for severity:2 = " + str(TPRseverity2))
print("True positive rate for severity:3 = " + str(TPRseverity3))
print("True positive rate for severity:4 = " + str(TPRseverity4))

print("\nFalse positive rate for severity:1 = " + str(FPRseverity1))
print("False positive rate for severity:2 = " + str(FPRseverity2))
print("False positive rate for severity:3 = " + str(FPRseverity3))
print("False positive rate for severity:4 = " + str(FPRseverity4))

print("\nfMeasure for severity:1 = " + str(fMeasureseverity1))
print("fMeasure for severity:2 = " + str(fMeasureseverity2))
print("fMeasure for severity:3 = " + str(fMeasureseverity3))
print("fMeasure for severity:4 = " + str(fMeasureseverity4))

print("\nf1 Score: %s\nFalse Positive Rate %s\nTrue Positive Rate: %s\nPrecision: %s\nHamming Loss:%s"
      % (f1, weightedFalsePositiveRate, weightedTruePositiveRate, weightedPrecision, hammingLoss))

########### Logistoc Regression Evaluation ###########

True positive rate for severity:1 = 0.7716535433070866
True positive rate for severity:2 = 0.6393989983305509
True positive rate for severity:3 = 0.5980392156862745
True positive rate for severity:4 = 0.4509090909090909

False positive rate for severity:1 = 0.14026121521862578
False positive rate for severity:2 = 0.1196438508625487
False positive rate for severity:3 = 0.11434977578475336
False positive rate for severity:4 = 0.13163596966413868

fMeasure for severity:1 = 0.7142857142857143
fMeasure for severity:2 = 0.6399331662489558
fMeasure for severity:3 = 0.6192893401015229
fMeasure for severity:4 = 0.47646493756003844

f1 Score: 0.6168414803650694
False Positive RateR: 0.12650850769033364
True Positive Rate: 0.6206176961602672
Precision: 0.616274552795516
Hamming Loss:0.37938230383973287


In [39]:
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator#evaluate the models
#Evaluating the model
#Tree:::

#predictionlogistic.groupBy("Severity", "prediction").count().show()
predictionsdt = predictionsdt.select("prediction", "Severity")
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol= "Severity")

TPRseverity1=evaluator.evaluate(predictionsdt, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 1})
TPRseverity2=evaluator.evaluate(predictionsdt, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 2})
TPRseverity3=evaluator.evaluate(predictionsdt, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 3})
TPRseverity4=evaluator.evaluate(predictionsdt, {evaluator.metricName: "truePositiveRateByLabel", evaluator.metricLabel: 4})

FPRseverity1=evaluator.evaluate(predictionsdt, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 1})
FPRseverity2=evaluator.evaluate(predictionsdt, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 2})
FPRseverity3=evaluator.evaluate(predictionsdt, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 3})
FPRseverity4=evaluator.evaluate(predictionsdt, {evaluator.metricName: "falsePositiveRateByLabel", evaluator.metricLabel: 4})

fMeasureseverity1=evaluator.evaluate(predictionsdt, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 1})
fMeasureseverity2=evaluator.evaluate(predictionsdt, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 2})
fMeasureseverity3=evaluator.evaluate(predictionsdt, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 3})
fMeasureseverity4=evaluator.evaluate(predictionsdt, {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: 4})


accuracy=evaluator.evaluate(predictionsdt, {evaluator.metricName: "accuracy"})
f1=evaluator.evaluate(predictionsdt, {evaluator.metricName: "f1"})
weightedPrecision=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedPrecision"})
weightedRecall=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedRecall"})
weightedTruePositiveRate=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedTruePositiveRate"})
weightedFalsePositiveRate=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedFalsePositiveRate"})
weightedFMeasure=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedFMeasure"})#
hammingLoss=evaluator.evaluate(predictionsdt, {evaluator.metricName: "hammingLoss"})
print("########### Decision Tree Classifier Evaluation ###########\n")

print("True positive rate for severity:1 = " + str(TPRseverity1))
print("True positive rate for severity:2 = " + str(TPRseverity2))
print("True positive rate for severity:3 = " + str(TPRseverity3))
print("True positive rate for severity:4 = " + str(TPRseverity4))

print("\nFalse positive rate for severity:1 = " + str(FPRseverity1))
print("False positive rate for severity:2 = " + str(FPRseverity2))
print("False positive rate for severity:3 = " + str(FPRseverity3))
print("False positive rate for severity:4 = " + str(FPRseverity4))

print("\nfMeasure for severity:1 = " + str(fMeasureseverity1))
print("fMeasure for severity:2 = " + str(fMeasureseverity2))
print("fMeasure for severity:3 = " + str(fMeasureseverity3))
print("fMeasure for severity:4 = " + str(fMeasureseverity4))

print("\nf1 Score: %s\nFalse Positive Rate %s\nTrue Positive Rate: %s\nPrecision: %s\nHamming Loss:%s"
      % (f1, weightedFalsePositiveRate, weightedTruePositiveRate, weightedPrecision, hammingLoss))

########### Decision Tree Classifier Evaluation ###########

True positive rate for severity:1 = 0.8125984251968504
True positive rate for severity:2 = 0.7562604340567612
True positive rate for severity:3 = 0.5310457516339869
True positive rate for severity:4 = 0.5818181818181818

False positive rate for severity:1 = 0.03691084611016468
False positive rate for severity:2 = 0.08569838619922092
False positive rate for severity:3 = 0.11378923766816143
False positive rate for severity:4 = 0.19501625135427952

fMeasure for severity:1 = 0.8486842105263158
fMeasure for severity:2 = 0.7512437810945274
fMeasure for severity:3 = 0.5701754385964911
fMeasure for severity:4 = 0.5203252032520326

f1 Score: 0.67781123905238
False Positive Rate 0.10503742583933907
True Positive Rate: 0.6736227045075125
Precision: 0.6871938317249804
Hamming Loss:0.32637729549248745


In [16]:
f1lr=str(evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "f1"}))
f1dt=str(evaluator.evaluate(predictionsdt, {evaluator.metricName: "f1"}))
f1rf=str(evaluator.evaluate(predictionsrf, {evaluator.metricName: "f1"}))
print("\nf1 Score logistic regression: %s\nf1 Score Decision Tree: %s\nf1 Score Random Forest: %s"
      % (f1lr, f1dt, f1rf))


f1 Score logistic regression: 0.6168414803650694
f1 Score Decision Tree: 0.67781123905238
f1 Score Random Forest: 0.7179400806818571


In [18]:
tprlr=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedTruePositiveRate"})
fprlr=evaluator.evaluate(predictionAndLabelslr, {evaluator.metricName: "weightedFalsePositiveRate"})

tprdt=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedTruePositiveRate"})
fprdt=evaluator.evaluate(predictionsdt, {evaluator.metricName: "weightedFalsePositiveRate"})

tprrf=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedTruePositiveRate"})
fprrf=evaluator.evaluate(predictionsrf, {evaluator.metricName: "weightedFalsePositiveRate"})

print("\nTrue Positive Rate Logistic Regression: %s\nFalse Positive Rate Logistic Regression: %s\nTrue Positive Rate Decision Tree: %s\nFalse Positive Rate Decision Tree: %s\nTrue Positive Rate Random Forest: %s\nFalse Positive Rate Random Forest: %s"
      % (tprlr, fprlr, tprdt, fprdt, tprrf, fprrf))


True Positive Rate Logistic Regression: 0.6206176961602672
False Positive Rate Logistic Regression: 0.12650850769033364
True Positive Rate Decision Tree: 0.6736227045075125
False Positive Rate Decision Tree: 0.10503742583933907
True Positive Rate Random Forest: 0.7224540901502504
False Positive Rate Random Forest: 0.09181545241495188


In [19]:
#cross validation
#make the grid

# Import the tuning submodule
import pyspark.ml.tuning as tune

lr = LogisticRegression(maxIter=10, labelCol="Severity", featuresCol="features")

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(lr.regParam, [0.1, 0.01])
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

In [20]:
#cross validation
#make the validator

# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr,
               estimatorParamMaps=grid,
               evaluator=evaluator
               )

In [21]:
# Fit cross validation models
models = cv.fit(training)

# Extract the best model
best_lr = models.bestModel

In [22]:
#get the hyperparams from cross validation and
# Call lr.fit()
best_lr = lr.fit(training)

# Print best_lr
print(best_lr)

LogisticRegressionModel: uid=LogisticRegression_cd86e929c60f, numClasses=5, numFeatures=34


In [23]:
#evaluate the model
# Use the model to predict the test set
test_results = best_lr.transform(test)

# Evaluate the predictions
print(evaluator.evaluate(test_results, {evaluator.metricName: "f1"}))

0.6168414803650694


In [31]:
#cross validation
#make the grid

# Import the tuning submodule
import pyspark.ml.tuning as tune

dttuned = DecisionTreeClassifier(labelCol="Severity", featuresCol="features", maxBins=10257)

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(dttuned.maxDepth, [5, 10, 15, 20])

# Build the grid
grid = grid.build()

In [32]:
#cross validation
#make the validator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol= "Severity")
# Create the CrossValidator
cv = tune.CrossValidator(estimator=dttuned,
               estimatorParamMaps=grid,
               evaluator=evaluator
               )

In [None]:
# Fit cross validation models
model = cv.fit(training)

In [None]:
#evaluate the model
# Use the model to predict the test set
test_results = model.transform(test)

# Evaluate the predictions
print(evaluator.evaluate(test_results, {evaluator.metricName: "f1"}))