In [1]:
import pyspark
from pyspark.ml import clustering
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row

from pyspark.sql.functions import when, lit, col,isnull,split, udf
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, OneHotEncoder
from pyspark.sql.functions import year, month, dayofmonth, hour
from pyspark.ml.feature import Bucketizer
from pyspark.sql.types import *
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import PCA
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import classification
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator

import matplotlib.pyplot as plt
import pandas as pd

from ipywidgets import widgets
from IPython.display import display

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
#Read the csv file
data = spark.read.csv('data.csv',inferSchema=True, header=True)

In [3]:
data.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Crash Descriptor: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Day of Week: string (nullable = true)
 |-- Police Report: string (nullable = true)
 |-- Lighting Conditions: string (nullable = true)
 |-- Municipality: string (nullable = true)
 |-- Collision Type Descriptor: string (nullable = true)
 |-- County Name: string (nullable = true)
 |-- Road Descriptor: string (nullable = true)
 |-- Weather Conditions: string (nullable = true)
 |-- Traffic Control Device: string (nullable = true)
 |-- Road Surface Conditions: string (nullable = true)
 |-- DOT Reference Marker Location: string (nullable = true)
 |-- Pedestrian Bicyclist Action: string (nullable = true)
 |-- Event Descriptor: string (nullable = true)
 |-- Number of Vehicles Involved: integer (nullable = true)



In [4]:
# Label the rows with injury accident = 1 and property damage accident = 0
df = data.withColumn('AccidentDescriptor', 
                     when((data['Crash Descriptor'] == 'Property Damage & Injury Accident') | 
                          (data['Crash Descriptor'] == 'Injury Accident')| 
                          (data['Crash Descriptor'] == 'Fatal Accident'), 1).otherwise(0))

In [5]:
## HOUR BUCKET

#Extract the hour from the time column
df = df.withColumn('Hour', hour('Time'))

#Bucket the hour column: 
# 5 Buckets: 0-4a.m. = 0.0, 5-11 = 1.0, 12-16 = 2.0, 17-22 = 3.0, 23-34 = 4.0
# 5 Buckets: 0-4 = Night, 5-11 = Morning, 12-16 = Afternoon, 17-22 = Evening, 23-24 = Night

bucketizer = Bucketizer(splits=[0,5,12,17,23,24],inputCol="Hour", outputCol="Hour_buckets")
df = bucketizer.setHandleInvalid("keep").transform(df)

# convert 5 buckets into 4 buckets (we have twice Night): 1.0 = Morning, 2.0 = Afternoon, 3.0 = Evening, 4.0 = Night
t = {0.0:4, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4}
udf_foo = udf(lambda x: t[x], IntegerType())
df = df.withColumn("Hour", udf_foo("Hour_buckets"))

#Extract the month from the Date column
df = df.withColumn('Month', split(df['Date'], '/')[0])
df = df.withColumn('Month',df.Month.cast('int'))


In [6]:
# removing features
df= df.drop('Time','Date','Hour_buckets','Municipality','DOT Reference Marker Location', 'Crash Descriptor',
            'Number of Vehicles Involved', 'Year', 'Police Report', 'Pedestrian Bicyclist Action', 'Event Descriptor', 'Collision Type Descriptor', 'Traffic Control Device')

# let us take a look at it with pandas
df_pd = df.toPandas()

In [7]:
# display table
df_pd.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,895906,895907,895908,895909,895910,895911,895912,895913,895914,895915
Day of Week,Saturday,Saturday,Saturday,Saturday,Saturday,Saturday,Saturday,Saturday,Saturday,Saturday,...,Wednesday,Wednesday,Wednesday,Wednesday,Wednesday,Wednesday,Wednesday,Wednesday,Wednesday,Wednesday
Lighting Conditions,Dark-Road Unlighted,Dark-Road Unlighted,Daylight,Unknown,Daylight,Dusk,Daylight,Daylight,Daylight,Dark-Road Lighted,...,Dark-Road Unlighted,Daylight,Dark-Road Lighted,Dark-Road Lighted,Daylight,Dark-Road Lighted,Unknown,Daylight,Daylight,Daylight
County Name,LEWIS,SUFFOLK,OTSEGO,KINGS,RENSSELAER,ERIE,ALBANY,WESTCHESTER,GREENE,NASSAU,...,OTSEGO,SUFFOLK,BRONX,KINGS,ERIE,ERIE,MONROE,ROCKLAND,QUEENS,SUFFOLK
Road Descriptor,Curve and Grade,Straight and Level,Straight and Level,Unknown,Straight and Grade,Straight and Grade,Curve and Level,Curve and Level,Straight at Hill Crest,Straight and Level,...,Curve and Level,Straight and Level,Straight at Hill Crest,Straight and Level,Curve and Level,Straight and Level,Unknown,Straight and Level,Straight and Level,Straight and Level
Weather Conditions,Cloudy,Clear,Cloudy,Unknown,Clear,Clear,Clear,Clear,Cloudy,Clear,...,Snow,Clear,Clear,Clear,Clear,Clear,Unknown,Clear,Clear,Cloudy
Road Surface Conditions,Wet,Dry,Wet,Unknown,Dry,Snow/Ice,Snow/Ice,Dry,Snow/Ice,Dry,...,Snow/Ice,Dry,Dry,Dry,Snow/Ice,Wet,Unknown,Dry,Dry,Dry
AccidentDescriptor,0,0,1,1,0,0,1,1,0,1,...,0,0,1,0,0,1,0,1,0,0
Hour,3,1,1,2,1,3,1,2,2,3,...,3,1,3,1,1,4,3,2,1,2
Month,12,12,12,12,12,12,12,12,12,12,...,1,1,1,1,1,1,1,1,1,1


In [8]:
df.columns

['Day of Week',
 'Lighting Conditions',
 'County Name',
 'Road Descriptor',
 'Weather Conditions',
 'Road Surface Conditions',
 'AccidentDescriptor',
 'Hour',
 'Month']

In [9]:
#Using StringIndexer to change the column as categorical variable 
#or to convert the textual data to numeric data keeping the categorical context.

pipe_feat= Pipeline(stages=[StringIndexer(inputCol = 'Day of Week', outputCol = 'Day of Week_index'),
                           StringIndexer(inputCol = 'Lighting Conditions', outputCol = 'Lighting Conditions_index'),
                           StringIndexer(inputCol = 'County Name', outputCol = 'County Name_index'),
                           StringIndexer(inputCol = 'Road Descriptor', outputCol = 'Road Descriptor_index'),
                           StringIndexer(inputCol = 'Weather Conditions', outputCol = 'Weather Conditions_index'),
                            StringIndexer(inputCol = 'Road Surface Conditions', outputCol = 'Road Surface Conditions_index'),
                    OneHotEncoder(inputCol = 'Day of Week_index', outputCol = 'Day of Week_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'Lighting Conditions_index', outputCol = 'Lighting Conditions_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'County Name_index', outputCol = 'County Name_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'Road Descriptor_index', outputCol = 'Road Descriptor_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'Weather Conditions_index', outputCol = 'Weather Conditions_feat', dropLast=False),
                            OneHotEncoder(inputCol = 'Road Surface Conditions_index', outputCol = 'Road Surface Conditions_feat', dropLast=False),
                            OneHotEncoder(inputCol = 'Hour', outputCol = 'Hour_feat', dropLast=False),
                            OneHotEncoder(inputCol = 'Month', outputCol = 'Month_feat', dropLast=False),
            
                           VectorAssembler(inputCols=['Hour','Month','Day of Week_feat','Lighting Conditions_feat',
                                                    'County Name_feat', 'Road Descriptor_feat','Weather Conditions_feat',
                                                     'Road Surface Conditions_feat'], outputCol = 'features',handleInvalid="keep")]).fit(df)

In [10]:
df_feat = pipe_feat.transform(df)

In [11]:
training_df, validation_df, testing_df = df_feat.randomSplit([0.6, 0.3, 0.1], seed=0)

In [12]:
pipe_lr = Pipeline(stages=[classification.LogisticRegression(labelCol='AccidentDescriptor', featuresCol='features',regParam = 0.1,elasticNetParam = 0.001)])

In [13]:
pipe_rf = Pipeline(stages=[classification.RandomForestClassifier(labelCol='AccidentDescriptor', featuresCol='features',numTrees = 20,maxDepth = 6)])


In [14]:
pipe_gbt = Pipeline(stages=[classification.GBTClassifier(labelCol='AccidentDescriptor', featuresCol='features',maxIter = 10)])

In [15]:
fit_lr = pipe_lr.fit(training_df)
dataframe_lr = fit_lr.transform(validation_df)

fit_rf = pipe_rf.fit(training_df)
dataframe_rf = fit_rf.transform(validation_df)

fit_gbt = pipe_gbt.fit(training_df)
dataframe_gbt = fit_gbt.transform(validation_df)

In [16]:
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='AccidentDescriptor')
AUC_LR = evaluator.evaluate(dataframe_lr)
AUC_RF = evaluator.evaluate(dataframe_rf)
AUC_GBT = evaluator.evaluate(dataframe_gbt)
print("The AUC of LR, RF, GBT is %s, %s, %s"%(AUC_LR,AUC_RF,AUC_GBT))

The AUC of LR, RF, GBT is 0.6953318532549726, 0.6735366266918393, 0.6995029073118374


In [18]:
df_feature =['Hour','Month']
label1 = pipe_feat.stages[0].labels
for i in label1:
    lab1 = i.replace(' ','_')
    df_feature.append('Day of Week_'+lab1)
    
label2 = pipe_feat.stages[1].labels
for i in label2:
    lab2 = i.replace(' ','_')
    df_feature.append('Lighting Conditions_'+lab2)
    

label3 = pipe_feat.stages[2].labels
for i in label3:
    lab3 = i.replace(' ','_')
    df_feature.append('County Name_'+lab3)
    
label4 = pipe_feat.stages[3].labels
for i in label4:
    lab4 = i.replace(' ','_')
    df_feature.append('Road Descriptor_'+lab4)
    
label5 = pipe_feat.stages[4].labels
for i in label5:
    lab5= i.replace(' ','_')
    df_feature.append('Weather Conditions_'+lab5)
    
label6 = pipe_feat.stages[5].labels
for i in label6:
    lab6= i.replace(' ','_')
    df_feature.append('Road Surface Conditions_'+lab6)


In [19]:
lrm = fit_lr.stages[-1]
lrmcoeff = lrm.coefficients

In [20]:
lr_coeff_feature = spark.createDataFrame([(tup[0], float(tup[1])) for tup in list(zip(df_feature , lrmcoeff))],['feature', 'LR Coeff']).sort('LR Coeff', ascending=False)
lr_coeff_feature.show(20,False)

+--------------------------------------+--------------------+
|feature                               |LR Coeff            |
+--------------------------------------+--------------------+
|County Name_KINGS                     |0.9754205417995916  |
|County Name_BRONX                     |0.9724207022981526  |
|County Name_QUEENS                    |0.8098447435670977  |
|County Name_NEW_YORK                  |0.709201126897745   |
|County Name_RICHMOND                  |0.6317184756888758  |
|Lighting Conditions_Dark-Road_Lighted |0.18428283260442652 |
|Lighting Conditions_Daylight          |0.09835893364600934 |
|Road Descriptor_Curve_at_Hill_Crest   |0.0754186864325578  |
|Weather Conditions_Rain               |0.07299653490220095 |
|Road Surface Conditions_Dry           |0.06793049250368209 |
|Road Descriptor_Straight_at_Hill_Crest|0.06759379476634327 |
|Weather Conditions_Clear              |0.06640464155138967 |
|Road Surface Conditions_Wet           |0.06633582508750095 |
|Road De

In [21]:
importance= fit_rf.stages[-1].featureImportances.toArray()

In [22]:
feature_importance = spark.createDataFrame([(tup[0], float(tup[1])) for tup in list(zip(df_feature , importance))],['feature', 'importance']).sort('importance', ascending=False)

# feature_importance =pd.DataFrame(list(zip(df_feature, importance)),columns = ['feature', 'importance']).sort_values('importance', ascending=False)

In [23]:
feature_importance.show(20,False)

+---------------------------------------+--------------------+
|feature                                |importance          |
+---------------------------------------+--------------------+
|County Name_BRONX                      |0.175303294772216   |
|County Name_KINGS                      |0.17117110329497942 |
|County Name_QUEENS                     |0.13669897346109372 |
|County Name_NEW_YORK                   |0.08455886438225978 |
|Weather Conditions_Clear               |0.08076181120558883 |
|Lighting Conditions_Dark-Road_Unlighted|0.07597921945921912 |
|Lighting Conditions_Dark-Road_Lighted  |0.054754749283697036|
|Road Descriptor_Straight_and_Level     |0.042996728683340917|
|Weather Conditions_Snow                |0.03235317773272778 |
|Road Surface Conditions_Snow/Ice       |0.016460813147570967|
|Lighting Conditions_Unknown            |0.011526637634928208|
|Weather Conditions_Cloudy              |0.009787047114071027|
|Lighting Conditions_Daylight           |0.009448176521

In [24]:
pipe_pca = Pipeline(stages=[feature.VectorAssembler(inputCols= ['Day of Week_feat',
 'Lighting Conditions_feat',
 'County Name_feat',
 'Road Descriptor_feat',
 'Weather Conditions_feat',
 'Road Surface Conditions_feat',
 'Hour',
 'Month'], outputCol = 'pcafeatures'), feature.PCA(k=80, inputCol='pcafeatures',outputCol='pc')]).fit(df_feat)


In [25]:
pc_80= pipe_pca.transform(df_feat)
training_pc, validation_pc, testing_pc = pc_80.randomSplit([0.6, 0.3, 0.1], seed=0)

In [26]:
pipe_lr_pc = Pipeline(stages=[classification.LogisticRegression(labelCol='AccidentDescriptor', featuresCol='pc',regParam = 0.1,elasticNetParam = 0.001)]).fit(training_pc)
pc_80_pred = pipe_lr_pc.transform(validation_pc)
AUC_LR_pc = evaluator.evaluate(pc_80_pred)
AUC_LR_pc

0.696579471357413

In [27]:
pipe_rf_pc = Pipeline(stages=[classification.RandomForestClassifier(labelCol='AccidentDescriptor', featuresCol='pc',numTrees = 20,maxDepth = 6)]).fit(training_pc)
pc_80_pred_rf = pipe_rf_pc.transform(validation_pc)
AUC_RF_pc = evaluator.evaluate(pc_80_pred_rf)
AUC_RF_pc

0.6887219040387068

In [28]:
pipe_gbt_pc = Pipeline(stages=[classification.GBTClassifier(labelCol='AccidentDescriptor', featuresCol='pc',maxIter = 10)]).fit(training_pc)
pc_80_pred_gbt = pipe_gbt_pc.transform(validation_pc)
AUC_GBT_pc = evaluator.evaluate(pc_80_pred_gbt)
AUC_GBT_pc

0.698921637184369

In [29]:
pca_model = pipe_pca.stages[-1]
pc1 = abs(pca_model.pc.toArray()[:, 0]).tolist()
pc2 = abs(pca_model.pc.toArray()[:, 1]).tolist()
pc1_df = spark.createDataFrame(list(zip(df_feature , pc1)),['feature', 'loadings']).sort('loadings', ascending=False)
pc2_df = spark.createDataFrame(list(zip(df_feature , pc2)),['feature', 'loadings']).sort('loadings', ascending=False)

In [30]:
pc1_df.show(5,False)

+-------------------------------------+--------------------+
|feature                              |loadings            |
+-------------------------------------+--------------------+
|Road Surface Conditions_Flooded_Water|0.9993845006469853  |
|Road Surface Conditions_Dry          |0.020384750500850733|
|Weather Conditions_Fog/Smog/Smoke    |0.017510754436209756|
|Weather Conditions_Rain              |0.011093253051689476|
|Road Surface Conditions_Muddy        |0.010620202459730731|
+-------------------------------------+--------------------+
only showing top 5 rows



In [31]:
pc2_df.show(5,False)

+---------------------------------+-------------------+
|feature                          |loadings           |
+---------------------------------+-------------------+
|Road Surface Conditions_Muddy    |0.9184183186056983 |
|Day of Week_Saturday             |0.3156258768800851 |
|Day of Week_Sunday               |0.17476911155549837|
|Lighting Conditions_Daylight     |0.08819557375239975|
|Weather Conditions_Fog/Smog/Smoke|0.04827834145251888|
+---------------------------------+-------------------+
only showing top 5 rows



In [59]:
# new pipleline for limited number of features

#Using StringIndexer to change the column as categorical variable 
#or to convert the textual data to numeric data keeping the categorical context.

pipe_feat_limited = Pipeline(stages=[StringIndexer(inputCol = 'Day of Week', outputCol = 'Day of Week_index'),
                           StringIndexer(inputCol = 'Lighting Conditions', outputCol = 'Lighting Conditions_index'),
                           StringIndexer(inputCol = 'County Name', outputCol = 'County Name_index'),
                           StringIndexer(inputCol = 'Road Descriptor', outputCol = 'Road Descriptor_index'),
                           StringIndexer(inputCol = 'Weather Conditions', outputCol = 'Weather Conditions_index'),
                            StringIndexer(inputCol = 'Road Surface Conditions', outputCol = 'Road Surface Conditions_index'),
                    OneHotEncoder(inputCol = 'Day of Week_index', outputCol = 'Day of Week_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'Lighting Conditions_index', outputCol = 'Lighting Conditions_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'County Name_index', outputCol = 'County Name_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'Road Descriptor_index', outputCol = 'Road Descriptor_feat', dropLast=False),
                           OneHotEncoder(inputCol = 'Weather Conditions_index', outputCol = 'Weather Conditions_feat', dropLast=False),
                            OneHotEncoder(inputCol = 'Road Surface Conditions_index', outputCol = 'Road Surface Conditions_feat', dropLast=False),
                            OneHotEncoder(inputCol = 'Hour', outputCol = 'Hour_feat', dropLast=False),
                            OneHotEncoder(inputCol = 'Month', outputCol = 'Month_feat', dropLast=False),
            
                           VectorAssembler(inputCols=['Hour','Month','Day of Week_feat','Lighting Conditions_feat',
                                                    'County Name_feat', 'Road Descriptor_feat','Weather Conditions_feat',
                                                     'Road Surface Conditions_feat'], outputCol = 'features_limited',handleInvalid="keep")]).fit(df)
df_feat_limited = pipe_feat.transform(df)

pipe_pca_limited = Pipeline(stages=[feature.VectorAssembler(inputCols= ['Hour','Month','Day of Week_feat','Lighting Conditions_feat',
                                                    'County Name_feat', 'Road Descriptor_feat','Weather Conditions_feat',
                                                     'Road Surface Conditions_feat'], outputCol = 'pcafeatures'), feature.PCA(k=80, inputCol='pcafeatures',outputCol='pc')]).fit(df_feat_limited)


In [73]:
print("1. Please define the Time first: 1 = Morning, 2 = Afternoon, 3 = Evening, 4 = Night")
display(time)
print("2. Please define the Month of the year")
display(month)
print("3. Please choose the Day of the Week")
display(day)
print("4. Please choose the Lighting Conditions")
display(light)
print("5. Please choose the County")
display(county)
print("6. Please choose the Road Type")
display(road1)
print("7. Please choose the Weather")
display(weather)
print("8. Please choose the Road Surface Conditions")
display(road2)

1. Please define the Time first: 1 = Morning, 2 = Afternoon, 3 = Evening, 4 = Night


IntSlider(value=4, continuous_update=False, description='Time:', max=4, min=1)

2. Please define the Month of the year


IntSlider(value=12, continuous_update=False, description='Month:', max=12, min=1)

3. Please choose the Day of the Week


Dropdown(description='Day of Week:', index=5, options=('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',…

4. Please choose the Lighting Conditions


Dropdown(description='Lighting:', index=2, options=('Daylight', 'Dark-Road Lighted', 'Dark-Road Unlighted', 'U…

5. Please choose the County


Dropdown(description='County:', index=2, options=('NASSAU', 'SUFFOLK', 'QUEENS', 'KINGS', 'ERIE', 'WESTCHESTER…

6. Please choose the Road Type


Dropdown(description='Road:', options=('Straight and Level', 'Straight and Grade', 'Unknown', 'Curve and Level…

7. Please choose the Weather


Dropdown(description='Weather:', index=2, options=('Clear', 'Cloudy', 'Rain', 'Unknown', 'Snow', 'Sleet/Hail/F…

8. Please choose the Road Surface Conditions


Dropdown(description='Road:', index=1, options=('Dry', 'Wet', 'Snow/Ice', 'Unknown', 'Slush', 'Other', 'Muddy'…

In [74]:
time1 = str(time.value)
time1 = int(time1)
month1 = str(month.value)
month1 = int(month1)
day1 = str(day.value)
light1 = str(light.value)
county1 = str(county.value)
road11 = str(road1.value)
weather1 = str(weather.value)
road22 = str(road2.value)

In [75]:
test_df = spark.createDataFrame([(time1, month1 , day1, light1 , county1 , road11, weather1, road22)], 
    [ 'Hour', 'Month', 'Day of Week',
 'Lighting Conditions',
 'County Name',
 'Road Descriptor',
 'Weather Conditions',
 'Road Surface Conditions'])


In [76]:
test_df_pd = test_df.toPandas()

In [77]:
test_df_pd

Unnamed: 0,Hour,Month,Day of Week,Lighting Conditions,County Name,Road Descriptor,Weather Conditions,Road Surface Conditions
0,4,12,Saturday,Dark-Road Unlighted,QUEENS,Straight and Level,Rain,Wet


In [78]:
test_feat = pipe_feat_limited.transform(test_df)

In [79]:
pc_test = pipe_pca_limited.transform(test_feat)

In [80]:
pred = pipe_rf_pc.transform(pc_test)

In [81]:
probability = pred.select('probability').toPandas()

In [82]:
probability

Unnamed: 0,probability
0,"[0.5386810998869163, 0.4613189001130838]"
