In [None]:
import pyspark
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import IntegerType,StringType,DoubleType
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql.functions import split,udf,col,regexp_replace

In [None]:
# Spark settings for a local session that uses every available core.
# The two numeric values carry no unit suffix.
# NOTE(review): Spark applies each key's own default time unit to bare
# numbers -- confirm these are the intended durations.
spark_settings = [
    ('spark.executor.heartbeatInterval', 10000),
    ('spark.network.timeout', 10000),
    ('spark.core.connection.ack.wait.timeout', '3600'),
]
conf = pyspark.SparkConf().setMaster('local[*]').setAll(spark_settings)

# Reuse the running session if one exists, otherwise start 'parking_model'.
spark = (
    SparkSession.builder
    .appName('parking_model')
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Grab a handle to the SparkContext (getOrCreate() hands back the context that
# is already running, or starts one if none exists).
# NOTE(review): `sc` is not referenced by any cell visible in this notebook,
# and this import sits outside the import cell -- consider consolidating.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [None]:
# Load the pre-processed parking data; the first row supplies the column
# names and column types are inferred from the data.
df = spark.read.csv(
    path='./processed_parking.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Columns that are NOT string-indexed as features: the three numeric inputs
# that go into the feature vector unchanged, and the prediction target.
excluded_cols = {'Month', 'Day', 'Time_Hour', 'Violation_County'}

# One fitted StringIndexer per remaining column, writing "<column>_index".
# Iterate df.columns in its own order instead of over a set difference: set
# iteration order for strings changes between Python processes, which made the
# stage order -- and the column order of every downstream DataFrame and CSV
# export -- differ from one kernel to the next.
# handleInvalid='keep' assigns unseen/invalid values to an extra index instead
# of raising.
# NOTE(review): the indexers are fit on the full dataset, i.e. before the
# train/test split performed later in the notebook.
index = [
    StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid='keep').fit(df)
    for column in df.columns
    if column not in excluded_cols
]

# The target column is indexed into "label", the name the classifiers read.
target_index = StringIndexer(
    inputCol="Violation_County",
    outputCol="label",
    handleInvalid='keep',
).fit(df)

# Assemble the numeric columns and the indexed categorical columns into a
# single "features" vector. The order below defines the feature positions.
assembler = VectorAssembler(
    inputCols=[
        'Month',
        'Day',
        'Time_Hour',
        'Violation_In_Front_Of_Or_Opposite_index',
        'Street_Code1_index',
        'Issuer_Command_index',
        'Violation_Location_index',
        'Vehicle_Body_Type_index',
        'Meridiem_index',
        'Registration_State_index',
        'Plate_Type_index',
        'Issuer_Precinct_index',
        'Street_Code2_index',
        'Issuing_Agency_index',
        'Violation_Code_index',
        'Vehicle_Make_index',
        'Street_Code3_index',
    ],
    outputCol='features',
)

In [None]:
# Chain the feature indexers, the label indexer and the vector assembler into
# one pipeline, fit it, and apply it to the data.
# NOTE: `df` is rebound to the transformed frame, so re-running this cell
# without re-running the load cell operates on already-transformed data.
pipeline = Pipeline(stages=[*index, target_index, assembler])
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

In [None]:
# 80/20 train/test split. The seed is pinned so the same rows land in each
# split on every run; without it the split (and every metric computed from it)
# changes from run to run.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Export a small slice (at most 10000 rows) of the test split to CSV for
# testing purposes (kafka, docker). The raw categorical columns, the label and
# the assembled feature vector are removed before writing.
columns_to_drop = [
    'Registration_State',
    'Plate_Type',
    'Violation_Code',
    'Vehicle_Body_Type',
    'Vehicle_Make',
    'Issuing_Agency',
    'Street_Code1',
    'Street_Code2',
    'Street_Code3',
    'Violation_Location',
    'Issuer_Precinct',
    'Issuer_Command',
    'Violation_County',
    'Violation_In_Front_Of_Or_Opposite',
    'Meridiem',
    'label',
    'features',
]
sample_test = test.limit(10000).drop(*columns_to_drop)
sample_test.toPandas().to_csv('../sample_test.csv')

In [None]:
# Decision tree trained on the assembled "features" vector against "label".
# NOTE(review): maxBins=6700 is presumably sized to cover the largest indexed
# categorical feature -- confirm against the data.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxBins=6700,
)
model_dt = dt.fit(train)

In [None]:
# Score the held-out split with the fitted decision tree. This previously
# referenced `test2`, a name that is never defined in the visible notebook and
# raises NameError on a fresh kernel; `test` is the split produced by
# randomSplit and is what the Random Forest cell scores as well.
pred_dt = model_dt.transform(test)

In [None]:
# Evaluate the decision-tree predictions. metricName must be set explicitly:
# the evaluator's default metric is F1, which would be reported below under
# the "Accuracy" label and make "1 - value" something other than the error rate.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy_dt = evaluator.evaluate(pred_dt)
print("Accuracy for Decision Tree = %s" % (accuracy_dt))
print("Test Error for Decision Tree = %s" % (1.0 - accuracy_dt))

In [None]:
# Random forest trained on the same features/label as the decision tree.
# The seed is pinned so the forest is reproducible across runs.
# NOTE(review): maxBins=6700 is presumably sized to cover the largest indexed
# categorical feature -- confirm against the data.
rf = RandomForestClassifier(
    maxBins=6700,
    labelCol="label",
    featuresCol="features",
    seed=42,
)
model_rf = rf.fit(train)

In [None]:
# Score the held-out split with the fitted random forest.
pred_rf = model_rf.transform(dataset=test)

In [None]:
# Evaluate the random-forest predictions. metricName is set explicitly because
# the evaluator's default metric is F1, not accuracy, while the result is
# stored and reported as an accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy_rf = evaluator.evaluate(pred_rf)

In [None]:
# Report the random-forest score and its complement. The second print was
# missing its closing parenthesis, which made the cell a SyntaxError.
print("Accuracy for Random Forest Tree = %s" % (accuracy_rf))
print("Test Error for Random Forest Tree = %s" % (1.0 - accuracy_rf))