In [1]:
import pyspark
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import IntegerType,StringType,DoubleType
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql.functions import split,udf,col,regexp_replace

In [2]:
conf = pyspark.SparkConf().setMaster('local[*]') \
        .set('spark.executor.heartbeatInterval', 10000) \
        .set('spark.network.timeout', 10000) \
        .set('spark.core.connection.ack.wait.timeout', '3600')
spark = SparkSession \
        .builder \
        .appName('parking_model') \
        .config(conf=conf) \
        .getOrCreate()

In [3]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [4]:
df = spark.read.csv('./processed_parking.csv',header=True,inferSchema=True)

In [5]:
index = [StringIndexer(inputCol=column, outputCol=column+"_index",handleInvalid='keep').fit(df) for column in list(set(df.columns)-set(['Month','Day','Time_Hour','Violation_County']))]
target_index = StringIndexer(inputCol="Violation_County", outputCol="label",handleInvalid='keep').fit(df)
assembler = VectorAssembler(inputCols=['Month','Day','Time_Hour','Violation_In_Front_Of_Or_Opposite_index','Street_Code1_index','Issuer_Command_index','Violation_Location_index','Vehicle_Body_Type_index','Meridiem_index','Registration_State_index','Plate_Type_index','Issuer_Precinct_index','Street_Code2_index','Issuing_Agency_index','Violation_Code_index','Vehicle_Make_index','Street_Code3_index'],outputCol='features')

In [6]:
pipeline = Pipeline(stages=index+[target_index,assembler])
df = pipeline.fit(df).transform(df)


In [7]:
train, test = df.randomSplit([0.8,0.2])

In [13]:
test3 = test.drop(*['Registration_State',
 'Plate_Type',
 'Violation_Code',
 'Vehicle_Body_Type',
 'Vehicle_Make',
 'Issuing_Agency',
 'Street_Code1',
 'Street_Code2',
 'Street_Code3',
 'Violation_Location',
 'Issuer_Precinct',
 'Issuer_Command',
 'Violation_County',
 'Violation_In_Front_Of_Or_Opposite',
 'Meridiem','features'])

In [14]:
test3 = test3.limit(10000)


In [15]:
test3.columns

['Month',
 'Day',
 'Time_Hour',
 'Violation_In_Front_Of_Or_Opposite_index',
 'Vehicle_Make_index',
 'Vehicle_Body_Type_index',
 'Registration_State_index',
 'Plate_Type_index',
 'Street_Code1_index',
 'Violation_Code_index',
 'Issuer_Precinct_index',
 'Issuing_Agency_index',
 'Issuer_Command_index',
 'Street_Code3_index',
 'Meridiem_index',
 'Street_Code2_index',
 'Violation_Location_index',
 'label']

In [16]:
test3.toPandas().to_csv('./test_4.csv')

In [17]:
test3.columns

['Month',
 'Day',
 'Time_Hour',
 'Violation_In_Front_Of_Or_Opposite_index',
 'Vehicle_Make_index',
 'Vehicle_Body_Type_index',
 'Registration_State_index',
 'Plate_Type_index',
 'Street_Code1_index',
 'Violation_Code_index',
 'Issuer_Precinct_index',
 'Issuing_Agency_index',
 'Issuer_Command_index',
 'Street_Code3_index',
 'Meridiem_index',
 'Street_Code2_index',
 'Violation_Location_index',
 'label']

In [18]:
test3.printSchema()

root
 |-- Month: double (nullable = true)
 |-- Day: double (nullable = true)
 |-- Time_Hour: double (nullable = true)
 |-- Violation_In_Front_Of_Or_Opposite_index: double (nullable = false)
 |-- Vehicle_Make_index: double (nullable = false)
 |-- Vehicle_Body_Type_index: double (nullable = false)
 |-- Registration_State_index: double (nullable = false)
 |-- Plate_Type_index: double (nullable = false)
 |-- Street_Code1_index: double (nullable = false)
 |-- Violation_Code_index: double (nullable = false)
 |-- Issuer_Precinct_index: double (nullable = false)
 |-- Issuing_Agency_index: double (nullable = false)
 |-- Issuer_Command_index: double (nullable = false)
 |-- Street_Code3_index: double (nullable = false)
 |-- Meridiem_index: double (nullable = false)
 |-- Street_Code2_index: double (nullable = false)
 |-- Violation_Location_index: double (nullable = false)
 |-- label: double (nullable = false)



In [11]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",maxBins=6700)
model_dt = dt.fit(train)

In [12]:
pred_dt = model_dt.transform(test2)

In [13]:
evaluator = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction')
accuracy_dt = evaluator.evaluate(pred_dt)
print("Accuracy for Decision Tree = %s" % (accuracy_dt))
print("Test Error for Decision Tree = %s" % (1.0 - accuracy_dt))

Accuracy for Decision Tree = 0.8007873942611157
Test Error for Decision Tree = 0.19921260573888433


In [27]:
test2.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)



In [21]:
from pyspark.sql.functions import *

test3.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in test3.columns]
   ).show()

NameError: name 'count' is not defined

In [None]:
test3.columns

In [16]:
test2.count()

1641136