In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os

# Export the JDK and Spark install locations through the process environment.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.0-bin-hadoop3",
    }
)

In [None]:
import findspark

# Run init() before importing pyspark so the import below can resolve.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise start one with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Mount Google Drive at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Load the CSV, using its first row as the header and letting Spark infer column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/processed_parking.csv")
)

In [None]:
# Fit one StringIndexer per column, skipping the three columns assembled as-is and
# the target column. handleInvalid='keep' keeps rows with unseen values.
index = [
    StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid='keep',
    ).fit(df)
    for column in set(df.columns) - {'Month', 'Day', 'Time_Hour', 'Violation_County'}
]

# The target column is indexed into "label".
target_index = StringIndexer(
    inputCol="Violation_County",
    outputCol="label",
    handleInvalid='keep',
).fit(df)

# Assemble the three raw columns and the indexed columns into "features".
# The order of inputCols fixes the layout of the feature vector.
assembler = VectorAssembler(
    inputCols=[
        'Month',
        'Day',
        'Time_Hour',
        'Violation_In_Front_Of_Or_Opposite_index',
        'Street_Code1_index',
        'Issuer_Command_index',
        'Violation_Location_index',
        'Vehicle_Body_Type_index',
        'Meridiem_index',
        'Registration_State_index',
        'Plate_Type_index',
        'Issuer_Precinct_index',
        'Street_Code2_index',
        'Issuing_Agency_index',
        'Violation_Code_index',
        'Vehicle_Make_index',
        'Street_Code3_index',
    ],
    outputCol='features',
)

In [None]:
# Chain the feature indexers, the label indexer and the assembler, then add the
# indexed columns, "label" and "features" to the DataFrame.
# NOTE(review): df is rebound to its transformed version, so re-running this cell
# on the transformed df is expected to fail on already-existing output columns.
pipeline = Pipeline(stages=[*index, target_index, assembler])
df = pipeline.fit(df).transform(df)

In [None]:
# 80/20 train/test split. The fixed seed makes the split (and every metric computed
# from it) repeatable across runs.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Fit a single decision tree on the training split.
# maxBins=6700: presumably sized to the largest indexed categorical feature (TODO confirm).
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxBins=6700,
)
model_dt = dt.fit(train)

In [None]:
# Persist the fitted tree, replacing any earlier save at this path.
(
    model_dt.write()
    .overwrite()
    .save('/content/drive/MyDrive/DFmodel')
)

# Score the held-out split.
pred_dt = model_dt.transform(test)

In [None]:
# MulticlassClassificationEvaluator defaults to the F1 metric. Request accuracy
# explicitly so the value matches the variable name and the "Accuracy" report.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy_dt = evaluator.evaluate(pred_dt)

In [None]:
# Print the decision-tree score and its complement (1 - score).
for template, value in (
    ("Accuracy for Decision Tree = %s", accuracy_dt),
    ("Test Error for Decision Tree = %s", 1.0 - accuracy_dt),
):
    print(template % value)

In [None]:
# Fit a random forest on the training split. The forest is randomized, so the seed
# is fixed to make the fitted model repeatable.
# maxBins=6700: presumably sized to the largest indexed categorical feature (TODO confirm).
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    maxBins=6700,
    seed=42,
)
model_rf = rf.fit(train)

In [None]:
# Optional: bundle preprocessing and the random forest into one pipeline and persist it.
# To use this cell, comment out the first pipeline cell and run all, or create a
# new train/test split from the initial df.
# Fixes: the final stage referenced the undefined name `rt` (the estimator is `rf`),
# and the chained assignment no longer rebinds `pipeline`.
pipeline_2 = Pipeline(stages=index+[target_index,assembler,rf])
pipeline_model = pipeline_2.fit(train)
# overwrite() lets the cell be re-run when the path already exists.
pipeline_model.write().overwrite().save('/content/drive/MyDrive/pipeline_model')

In [None]:
# Persist the fitted forest. overwrite() replaces an earlier save, whereas a bare
# save() raises when the path already exists, which breaks re-runs.
model_rf.write().overwrite().save('/content/drive/MyDrive/Rfmodel')

# Score the held-out split.
pred_rf = model_rf.transform(test)

In [None]:
# MulticlassClassificationEvaluator defaults to the F1 metric, so accuracy is requested
# explicitly. The result is stored in accuracy_rf: the original assigned it to
# accuracy_dt, overwriting the decision-tree score and leaving accuracy_rf undefined.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy_rf = evaluator.evaluate(pred_rf)

In [None]:
# Print the random-forest score and its complement (1 - score).
for template, value in (
    ("Accuracy for Random Forest Tree = %s", accuracy_rf),
    ("Test Error for Random Forest Tree = %s", 1.0 - accuracy_rf),
):
    print(template % value)