In [0]:
import pyspark.sql.functions as func
from pyspark.sql.functions import when

# loading the required libraries 

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
#from sklearn.datasets import load_iris
import pandas as pd 

In [0]:
schema = """
`0` STRING,
`SPEED` DOUBLE,
`DATA_AS_OF` STRING,
`LINK_ID` STRING,
`NewDateTime` TIMESTAMP,
`hour` INTEGER,
`minute` INTEGER,
`weekday` STRING,
`year` INTEGER,
`month` INTEGER,
`day` INTEGER,
`day_of_year` INTEGER """

In [0]:
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/shapikul@gmu.edu/df1_sample-1.csv", schema=schema,header=True)

In [0]:
df1.printSchema()

root
 |-- 0: string (nullable = true)
 |-- SPEED: double (nullable = true)
 |-- DATA_AS_OF: string (nullable = true)
 |-- LINK_ID: string (nullable = true)
 |-- NewDateTime: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- weekday: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_year: integer (nullable = true)
 |-- congestion: integer (nullable = false)
 |-- nweekday: integer (nullable = true)



In [0]:
#func.when( \ func.col("variable1") == "value1" & func.col("variable2") == "value2", \ func.col("variable3")))

df1 = df1.withColumn("congestion", when(df1.SPEED <= 20, 1) \
               .otherwise(0))
df1 = df1.withColumn("nweekday", when (df1.weekday == "Sunday", 0) \
                    .when (df1.weekday == "Monday", 1) \
                    .when (df1.weekday == "Tuesday", 2) \
                    .when (df1.weekday == "Wednesday", 3) \
                    .when (df1.weekday == "Thursday", 4) \
                    .when (df1.weekday == "Friday", 5) \
                    .when (df1.weekday == "Saturday", 6) \
                    )
df1 = df1.withColumn("nLINK_ID", df1["LINK_ID"].cast("Integer"))

In [0]:
df1.display()

0,SPEED,DATA_AS_OF,LINK_ID,NewDateTime,hour,minute,weekday,year,month,day,day_of_year,congestion,nweekday,nLINK_ID
57336931,160.0,0.0,4616266,2019-08-14T17:58:05.000+0000,17,58,Wednesday,2019,8,14,226,0,3,4616266
36819189,155.0,17.39,4616232,2018-02-07T11:57:10.000+0000,11,57,Wednesday,2018,2,7,38,0,3,4616232
9653218,169.0,62.13,4616355,2020-06-18T05:43:04.000+0000,5,43,Thursday,2020,6,18,170,0,4,4616355
19702575,167.0,54.68,4616312,2021-05-07T01:23:03.000+0000,1,23,Friday,2021,5,7,127,0,5,4616312
52597983,448.0,41.63,4620343,2019-03-18T22:23:03.000+0000,22,23,Monday,2019,3,18,77,0,1,4620343
30498884,153.0,22.36,4616235,2017-08-23T19:58:20.000+0000,19,58,Wednesday,2017,8,23,235,0,3,4616235
47428172,354.0,49.7,4616228,2018-11-03T19:48:45.000+0000,19,48,Saturday,2018,11,3,307,0,6,4616228
18717587,430.0,0.0,4616213,2021-04-04T15:24:12.000+0000,15,24,Sunday,2021,4,4,94,0,0,4616213
25592712,439.0,39.76,4616200,2021-10-20T13:53:04.000+0000,13,53,Wednesday,2021,10,20,293,0,3,4616200
39421852,385.0,16.77,4616208,2018-04-13T19:18:04.000+0000,19,18,Friday,2018,4,13,103,0,5,4616208


In [0]:
df1.count()

Out[71]: 100000

In [0]:
features = ("nLINK_ID", "hour", "nweekday", "day_of_year", "year")

va = VectorAssembler(inputCols = features, outputCol='features')

va_df = va.transform(df1)
va_df = va_df.select(['features', 'congestion'])
va_df.show(3)

+--------------------+----------+
|            features|congestion|
+--------------------+----------+
|[4616266.0,17.0,3...|         0|
|[4616232.0,11.0,3...|         0|
|[4616355.0,5.0,4....|         0|
+--------------------+----------+
only showing top 3 rows



In [0]:
# split data into the train and test parts.
(train, test) = va_df.randomSplit([0.8, 0.2])

In [0]:
#can predict test data by using trasnform() method. 
 
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="congestion")
dtc = dtc.fit(train)

pred = dtc.transform(test)
pred.show(3)

+--------------------+----------+-------------+-----------+----------+
|            features|congestion|rawPrediction|probability|prediction|
+--------------------+----------+-------------+-----------+----------+
|[4329472.0,0.0,5....|         0|[58046.0,0.0]|  [1.0,0.0]|       0.0|
|[4329472.0,1.0,0....|         0|[58046.0,0.0]|  [1.0,0.0]|       0.0|
|[4329472.0,1.0,2....|         0|[58046.0,0.0]|  [1.0,0.0]|       0.0|
+--------------------+----------+-------------+-----------+----------+
only showing top 3 rows



In [0]:
evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(pred)
 
print("Prediction Accuracy: ", acc)
 
y_pred=pred.select("prediction").collect()
y_orig=pred.select("congestion").collect()

cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
[0;32m<command-3373766230095446>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mevaluator[0m[0;34m=[0m[0mMulticlassClassificationEvaluator[0m[0;34m([0m[0mpredictionCol[0m[0;34m=[0m[0;34m"prediction"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0macc[0m [0;34m=[0m [0mevaluator[0m[0;34m.[0m[0mevaluate[0m[0;34m([0m[0mpred[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m [0;34m[0m[0m
[1;32m      4[0m [0mprint[0m[0;34m([0m[0;34m"Prediction Accuracy: "[0m[0;34m,[0m [0macc[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0;34m[0m[0m

[0;32m/databricks/python/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py[0m in [0;36msafe_patch_function[0;34m(*args, **kwargs)[0m
[1;32m    546[0m                