## Traffic Crash Analysis

### Data importing and pre-processing

In [0]:
%pip install sodapy
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting sodapy
  Using cached sodapy-2.2.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.2.0
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import pandas as pd
from sodapy import Socrata


# Anonymous Socrata client: None is passed in place of an application token
# and no username or password is given, which only works for public data sets.
client = Socrata("data.cityofchicago.org", None)

# Columns to retrieve, joined into the comma-separated string passed as `select`.
desired_columns = ",".join([
    "crash_record_id",
    "crash_date",
    "crash_type",
    "num_units",
    "weather_condition",
    "most_severe_injury",
    "latitude",
    "longitude",
])

# NOTE(review): the request is capped at 800000 rows and the recorded run
# returned 790180, so the cap is close to the data set size -- confirm rows are
# not silently dropped once the data set grows past it.
results = client.get("85ca-t3if", select=desired_columns, limit=800000)



In [0]:
# Build a Spark DataFrame from the downloaded records.
df2 = spark.createDataFrame(results)

# Report how many rows were loaded, then preview the first five of them.
row_total = df2.count()
print(row_total)
df2.show(n=5)

790180
+--------------------+--------------------+--------------------+------------+-------------+--------------------+---------+-----------------+
|          crash_date|     crash_record_id|          crash_type|    latitude|    longitude|  most_severe_injury|num_units|weather_condition|
+--------------------+--------------------+--------------------+------------+-------------+--------------------+---------+-----------------+
|2023-12-17T00:09:...|9ae981e02784d3954...|INJURY AND / OR T...| 41.99343782|-87.802396002|NO INDICATION OF ...|        2|             RAIN|
|2023-12-16T23:15:...|9574a24157fafae29...|NO INJURY / DRIVE...|41.806155683|-87.662618708|NO INDICATION OF ...|        5|  CLOUDY/OVERCAST|
|2023-12-16T23:00:...|425066d3d6815e079...|NO INJURY / DRIVE...|41.959509273|-87.708341289|NO INDICATION OF ...|        3|             RAIN|
|2023-12-16T22:58:...|5095111c15aa191a6...|INJURY AND / OR T...|41.964306938| -87.74776186|NO INDICATION OF ...|        1|             RAIN|
|2023-

### Create RDD of wanted features

In [0]:
# The seven columns to keep, in the positional order that later cells index by
# (row[0] .. row[6]).
feature_names = [
    "Crash_type",
    "num_units",
    "Weather_condition",
    "Crash_date",
    "Most_severe_injury",
    "Longitude",
    "Latitude",
]
wanted_columns = df2.select(*feature_names)
wanted_columns.show(1)

# Turn every Row into a plain list holding the seven values in the order above.
rdd_of_features = wanted_columns.rdd.map(lambda row: list(row))


+--------------------+---------+-----------------+--------------------+--------------------+-------------+-----------+
|          Crash_type|num_units|Weather_condition|          Crash_date|  Most_severe_injury|    Longitude|   Latitude|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+-----------+
|INJURY AND / OR T...|        2|             RAIN|2023-12-17T00:09:...|NO INDICATION OF ...|-87.802396002|41.99343782|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+-----------+
only showing top 1 row



### Remove all rows where the content of one of the fields is unknown

In [0]:
# Positions whose value may be the literal "UNKNOWN":
# row[0] = Crash_type, row[2] = Weather_condition, row[4] = Most_severe_injury
unknown_checked_positions = (0, 2, 4)

# Row count before cleaning.
print(rdd_of_features.count())

# Keep a row only if none of the three categorical fields is "UNKNOWN" and
# none of its seven fields is missing.
cleaned_data_rdd = rdd_of_features.filter(
    lambda row: all(row[i] != "UNKNOWN" for i in unknown_checked_positions)
    and all(row[i] is not None for i in range(7))
)

# Row count after cleaning.
print(cleaned_data_rdd.count())

790180
740552


### Create Dataframe from RDD and get it ready for regression

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# The RDD rows are plain lists, so the resulting columns get positional names:
# _1 = Crash_type, _2 = numUnits, _3 = weather, _4 = time, _5 = injury severity, _6 = longitude, _7 = latitude
cleaned_data_df = spark.createDataFrame(cleaned_data_rdd)

# Every column arrives as a string (see the first dtypes printout), so the
# numeric ones are cast to double before any numeric processing.
print(cleaned_data_df.dtypes)
numeric_cols = ["_2", "_6", "_7"]
for col_name in numeric_cols:
    cleaned_data_df = cleaned_data_df.withColumn(col_name, col(col_name).cast("double"))
print(cleaned_data_df.dtypes)

# One StringIndexer per categorical column, writing to "<column>_index".
# The indexers are left unfitted on purpose: Pipeline.fit below fits every
# stage, so fitting each one here as well was redundant.
# NOTE(review): "_4" holds the crash timestamp string; indexing it as a
# category produces one index per distinct timestamp (the recorded output shows
# values around 498688) -- consider deriving a numeric time feature instead.
string_cols = ["_1", "_3", "_4", "_5"]
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in string_cols]

pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(cleaned_data_df).transform(cleaned_data_df)
indexed_df.show(5)

[('_1', 'string'), ('_2', 'string'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'string'), ('_7', 'string')]
[('_1', 'string'), ('_2', 'double'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'double'), ('_7', 'double')]


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

+--------------------+---+---------------+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|                  _1| _2|             _3|                  _4|                  _5|           _6|          _7|_1_index|_3_index|_4_index|_5_index|
+--------------------+---+---------------+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|INJURY AND / OR T...|2.0|           RAIN|2023-12-17T00:09:...|NO INDICATION OF ...|-87.802396002| 41.99343782|     1.0|     1.0|498688.0|     0.0|
|NO INJURY / DRIVE...|5.0|CLOUDY/OVERCAST|2023-12-16T23:15:...|NO INDICATION OF ...|-87.662618708|41.806155683|     0.0|     3.0|498687.0|     0.0|
|NO INJURY / DRIVE...|3.0|           RAIN|2023-12-16T23:00:...|NO INDICATION OF ...|-87.708341289|41.959509273|     0.0|     1.0|498686.0|     0.0|
|INJURY AND / OR T...|1.0|           RAIN|2023-12-16T22:58:...|NO INDICATION OF ...| -87.74776186|41.964306938| 

### Create Labeled Points and Normalize features

This section is partially adapted from a lab notebook.

In [0]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.mllib.regression import LabeledPoint

# Columns combined into the model's feature vector.
# NOTE(review): the recorded output of this cell shows six-element vectors that
# include the longitude, while this list has five entries and omits "_6" --
# confirm which feature set is intended.
feature_column_names = ["_2", "_1_index", "_3_index", "_4_index", "_5_index"]

# Assemble the selected columns into a single "features" vector column
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

# Pick the scaler; both variants read "features" and write "scaledFeatures",
# so the fit/transform step is shared instead of being duplicated per branch.
useMinMaxScaler = True
if useMinMaxScaler:
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
else:
    # Apply StandardScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(vector_df)
scaledData = scalerModel.transform(vector_df)

# The regression target ("label") is column _7.
normalizedSamplePoints = scaledData.withColumn("label", col("_7"))

# Inspect the first row: the full Row, then its scaled features and label.
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print(firstPoint[0].scaledFeatures, firstPoint[0].label)
# Number of features per point. The original referenced `firstPointFeatures`,
# which is never defined and raised NameError on a fresh kernel.
print(len(firstPoint[0].scaledFeatures))

[Row(_1='INJURY AND / OR TOW DUE TO CRASH', _2=2.0, _3='RAIN', _4='2023-12-17T00:09:00.000', _5='NO INDICATION OF INJURY', _6=-87.802396002, _7=41.99343782, _1_index=1.0, _3_index=1.0, _4_index=498688.0, _5_index=0.0, features=DenseVector([2.0, -87.8024, 1.0, 1.0, 498688.0, 0.0]), scaledFeatures=DenseVector([0.0588, 0.0015, 1.0, 0.1, 1.0, 0.0]))]
[Row(_1='INJURY AND / OR TOW DUE TO CRASH', _2=2.0, _3='RAIN', _4='2023-12-17T00:09:00.000', _5='NO INDICATION OF INJURY', _6=-87.802396002, _7=41.99343782, _1_index=1.0, _3_index=1.0, _4_index=498688.0, _5_index=0.0, features=DenseVector([2.0, -87.8024, 1.0, 1.0, 498688.0, 0.0]), scaledFeatures=DenseVector([0.0588, 0.0015, 1.0, 0.1, 1.0, 0.0]), label=41.99343782)]
[0.058823529411764705,0.0015215230557074607,1.0,0.1,1.0,0.0] 41.99343782
6


In [0]:
def to_labeled_point(row):
    """Build a LabeledPoint from one DataFrame row.

    The row's `scaledFeatures` vector is converted to a plain Python list and
    used as the features; the row's `label` field is used as the label.
    """
    dense_values = row.scaledFeatures.toArray()
    return LabeledPoint(row['label'], dense_values.tolist())

# Train/test split proportions and the seed passed to randomSplit.
weights = [.8, .2]
seed = 42

# Columns that are not needed any more; dropping them is a performance
# optimization for the conversion to RDD below.
drop_columns = [
    "_1", "_2", "_3", "_4", "_5", "_6", "_7",
    "_1_index", "_3_index", "_4_index", "_5_index",
    "features",
]
normalizedSamplePoints = normalizedSamplePoints.drop(*drop_columns)

# Split the DataFrame first, then turn each part into an RDD of LabeledPoints.
train_df, val_df = normalizedSamplePoints.randomSplit(weights, seed)
parsedTrainData = train_df.rdd.map(to_labeled_point)
parsedValData = val_df.rdd.map(to_labeled_point)

# Both RDDs are reused by every model below, so mark them for caching.
parsedTrainData.cache()
parsedValData.cache()

# Sizes of the two parts and their total.
nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

592577 147975 740552


### Create baseline using the average value

In [0]:
# Mean label of the training points; the baseline predicts this constant.
train_labels = parsedTrainData.map(lambda point: point.label)
averagelatitude = train_labels.mean()
print(averagelatitude)

41.85415196454593


In [0]:
import math
def squaredError(label, prediction):
    """Return the squared difference between `label` and `prediction`."""
    residual = label - prediction
    return residual * residual

def calcRMSE(labelsAndPreds):
    """Return the root mean squared error of (label, prediction) pairs.

    `labelsAndPreds` is used through its `map` and `count` methods; each
    element is indexed as (label, prediction). The squared errors are summed
    and divided by the number of pairs before taking the square root.
    """
    squared_errors = labelsAndPreds.map(lambda pair: squaredError(pair[0], pair[1]))
    total_squared_error = squared_errors.sum()
    pair_count = labelsAndPreds.count()
    return math.sqrt(total_squared_error / pair_count)

def _pair_with_mean(point):
    """Pair a point's label with the constant baseline prediction."""
    return (point.label, averagelatitude)

# Baseline error on the training split.
labelsAndPredsTrain = parsedTrainData.map(_pair_with_mean)
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

# Baseline error on the validation split; only this one is printed.
labelsAndPredsVal = parsedValData.map(_pair_with_mean)
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

0.33749975599488014


### Apply linear regression: hyperparameter set one

In [0]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LinearRegressionWithSGD
# Hyperparameters for the first linear regression run.
# NOTE(review): the recorded validation RMSE for this run (about 3.77) is far
# worse than the mean baseline (about 0.34), with predictions near 38.2 against
# labels near 41.9 -- presumably the L2 penalty is also shrinking the
# intercept; consider a much smaller regParam. Confirm before relying on it.
useIntercept = True    # intercept
regType = 'l2'         # regType
reg = 1e-1             # regParam
miniBatchFrac = 1.0    # miniBatchFraction
alpha = 1.0            # step
numIters = 500         # iterations

In [0]:
# Train the first SGD linear regression model on the training split, passing
# each hyperparameter by keyword.
firstModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR1 holds the model weights and interceptLR1 the model intercept.
weightsLR1, interceptLR1 = firstModel.weights, firstModel.intercept
print(weightsLR1, interceptLR1)

In [0]:
# Show up to five validation points and the first model's prediction for each.
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned instead of indexing range(5),
# which raised IndexError whenever fewer than five points were available.
for samplePoint in samplePoints:
    samplePrediction = firstModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.953766077, [-2.271177215016908,-0.3438561969213648,1.623785064831159,-0.36501597422854265,-1.1113111841599141,1.2576563865428563]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,-0.6158442218212618,-0.36501597422854265,1.7722232272071285,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,-0.6158442218212618,-0.36501597422854265,2.0397628204068066,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,1.623785064831159,-0.36501597422854265,-0.3490794277634612,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,1.623785064831159,-0.36501597422854265,2.024769186913883,-0.36406021416075135])]
38.18813280915872
38.19151244587358
38.191971052540744
38.17996885753666
38.18403802273387


In [0]:
# Pair every validation label with the first model's prediction for it.
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, firstModel.predict(point.features))
)
rmseValLR1 = calcRMSE(labelsAndPreds)

# Baseline RMSE first, then this model's RMSE, one value per line.
print(rmseValBase, rmseValLR1, sep="\n")

0.3375046806783204
3.7677388192795545


### Apply linear regression: hyperparameter set two

In [0]:
# Hyperparameters for the second linear regression run: more iterations and a
# smaller mini-batch fraction; the remaining values match the first run.
# NOTE(review): the recorded intercept for this run (about 38.09) sits well
# below the mean label (about 41.85) and the validation RMSE (about 3.77) is
# far worse than the baseline -- presumably the L2 penalty is shrinking the
# intercept; consider a much smaller regParam. Confirm before relying on it.
useIntercept = True    # intercept
regType = 'l2'         # regType
reg = 1e-1             # regParam
miniBatchFrac = 0.3    # miniBatchFraction
alpha = 1.0            # step
numIters = 1000        # iterations

In [0]:
# Train the second SGD linear regression model on the training split, passing
# each hyperparameter by keyword.
secondModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR2 holds the model weights and interceptLR2 the model intercept.
weightsLR2, interceptLR2 = secondModel.weights, secondModel.intercept
print(weightsLR2, interceptLR2)

[-0.0014663170739338547,-0.30519910377744236,-0.004721574114414496,0.004322624573910402,0.0029022498307462712,-0.0027006219689228333] 38.08705152877708


In [0]:
# Pair every validation label with the second model's prediction for it.
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, secondModel.predict(point.features))
)
rmseValLR2 = calcRMSE(labelsAndPreds)

# Baseline RMSE first, then this model's RMSE, one value per line.
print(rmseValBase, rmseValLR2, sep="\n")

0.3375046806783204
3.7677334504421314


### Random Forest Version One

In [0]:
from pyspark.mllib.tree import RandomForest
# First random forest regressor: 8 trees, depth at most 5, variance impurity.
# No entries are given in categoricalFeaturesInfo.
# The notebook's split seed is reused so that the forest, which is built with
# randomness, gives the same model on every Restart & Run All.
thirdModel = RandomForest.trainRegressor(
    parsedTrainData,
    categoricalFeaturesInfo={},
    numTrees=8,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=5,
    maxBins=32,
    seed=seed,
)



In [0]:
# Show up to five validation points and the forest's prediction for each.
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned instead of indexing range(5),
# which raised IndexError whenever fewer than five points were available.
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.953766077, [-2.271177215016908,-0.3438561969213648,1.623785064831159,-0.36501597422854265,-1.1113111841599141,1.2576563865428563]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,-0.6158442218212618,-0.36501597422854265,1.7722232272071285,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,-0.6158442218212618,-0.36501597422854265,2.0397628204068066,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,1.623785064831159,-0.36501597422854265,-0.3490794277634612,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,1.623785064831159,-0.36501597422854265,2.024769186913883,-0.36406021416075135])]
41.89881648978902
41.89423755284405
41.89423755284405
41.91018926780793
41.90619244972316


In [0]:
import numpy as np
# Collect the validation labels and the forest's predictions to the driver.
# The predictions come from a single predict() call on the RDD of features.
val_features = parsedValData.map(lambda x: x.features)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(val_features).collect()

# RMSE of the first random forest on the validation split.
residuals = np.array(predictions) - np.array(labels)
rmseDT1 = np.sqrt(np.mean(residuals ** 2))

# Baseline RMSE first, then this model's RMSE, one value per line.
print(rmseValBase, rmseDT1, sep="\n")

0.3375046806783204
0.330398273614085


### Random Forest Version Two

In [0]:
# Second random forest regressor: 10 trees, depth at most 6, variance impurity.
# The notebook's split seed is reused so that the forest, which is built with
# randomness, gives the same model on every Restart & Run All.
# NOTE(review): this assignment overwrites the first forest stored under the
# same name, so re-running the earlier evaluation cells afterwards would score
# this model instead -- consider a distinct name across the following cells.
thirdModel = RandomForest.trainRegressor(
    parsedTrainData,
    categoricalFeaturesInfo={},
    numTrees=10,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=6,
    maxBins=32,
    seed=seed,
)

In [0]:
# Show up to five validation points and the forest's prediction for each.
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned instead of indexing range(5),
# which raised IndexError whenever fewer than five points were available.
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.953766077, [-2.271177215016908,-0.3438561969213648,1.623785064831159,-0.36501597422854265,-1.1113111841599141,1.2576563865428563]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,-0.6158442218212618,-0.36501597422854265,1.7722232272071285,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,-0.6158442218212618,-0.36501597422854265,2.0397628204068066,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,1.623785064831159,-0.36501597422854265,-0.3490794277634612,-0.36406021416075135]), LabeledPoint(41.976201139, [-2.271177215016908,-0.3311764777169709,1.623785064831159,-0.36501597422854265,2.024769186913883,-0.36406021416075135])]
41.878382899264764
41.89042959900137
41.89042959900137
41.87933583130467
41.89256993244285


In [0]:
# Collect the validation labels and the forest's predictions to the driver.
# The predictions come from a single predict() call on the RDD of features.
val_features = parsedValData.map(lambda x: x.features)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(val_features).collect()

# RMSE of the second random forest on the validation split.
residuals = np.array(predictions) - np.array(labels)
rmseDT2 = np.sqrt(np.mean(residuals ** 2))

# Baseline RMSE first, then this model's RMSE, one value per line.
print(rmseValBase, rmseDT2, sep="\n")

0.3375046806783204
0.3334090623972566
