## Traffic Crash Analysis

### Data importing and pre-processing

In [0]:
%pip install sodapy
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting sodapy
  Downloading sodapy-2.2.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.2.0
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import pandas as pd
from sodapy import Socrata


# Anonymous Socrata client: None is passed as the application token and no
# username/password is supplied, which only works for public datasets.
client = Socrata("data.cityofchicago.org", None)

# Fields requested from the portal, joined into the comma-separated string
# that is handed to the `select` argument.
desired_columns = ",".join([
    "crash_record_id",
    "crash_date",
    "crash_type",
    "num_units",
    "weather_condition",
    "most_severe_injury",
    "latitude",
    "longitude",
])

# Query dataset "85ca-t3if", asking for at most 100000 records.
results = client.get("85ca-t3if", select=desired_columns, limit=100000)



In [0]:
# Build a Spark DataFrame from the records returned by the API call.
df2 = spark.createDataFrame(results)
# Report how many rows were loaded.
row_total = df2.count()
print(row_total)
# Preview the first five rows.
df2.show(n=5)

100000
+--------------------+--------------------+--------------------+------------+-------------+--------------------+---------+-----------------+
|          crash_date|     crash_record_id|          crash_type|    latitude|    longitude|  most_severe_injury|num_units|weather_condition|
+--------------------+--------------------+--------------------+------------+-------------+--------------------+---------+-----------------+
|2023-12-15T00:00:...|262291733c474feb8...|INJURY AND / OR T...|41.780454051|-87.732727335|NO INDICATION OF ...|        2|            CLEAR|
|2023-12-14T23:50:...|cd9763836867eafcd...|NO INJURY / DRIVE...| 41.70743645| -87.60416001|NO INDICATION OF ...|        2|            CLEAR|
|2023-12-14T23:00:...|cdbf4819b8728eeb8...|NO INJURY / DRIVE...|41.678551187|-87.640582164|NO INDICATION OF ...|        2|            CLEAR|
|2023-12-14T22:54:...|c5f848b945da58e94...|INJURY AND / OR T...|41.877800803|-87.649546727|NO INDICATION OF ...|        2|            CLEAR|
|2023-

### Create RDD of wanted features

In [0]:
# Columns kept for modelling. Their order here fixes the positional indices
# (row[0] .. row[6]) of the lists produced below.
feature_names = [
    "Crash_type",
    "num_units",
    "Weather_condition",
    "Crash_date",
    "Most_severe_injury",
    "Longitude",
    "Latitude",
]
wanted_columns = df2.select(*feature_names)
wanted_columns.show(1)
# Turn every Row into a plain Python list, preserving the column order.
rdd_of_features = wanted_columns.rdd.map(list)


+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
|          Crash_type|num_units|Weather_condition|          Crash_date|  Most_severe_injury|    Longitude|    Latitude|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
|INJURY AND / OR T...|        2|            CLEAR|2023-12-15T00:00:...|NO INDICATION OF ...|-87.732727335|41.780454051|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
only showing top 1 row



### Remove all rows where any field is missing or marked as UNKNOWN

In [0]:
print(rdd_of_features.count())

# Positions of the fields that may carry the "UNKNOWN" placeholder:
# row[0] = Crash_type, row[2] = Weather_condition, row[4] = Most_severe_injury
unknown_checked_positions = (0, 2, 4)

def is_complete_row(row):
    """Return True when no field is None and no checked field equals "UNKNOWN".

    `row` is one list of field values from `rdd_of_features`.
    """
    # Missing values are detected with an identity check (`is None`) rather
    # than `!= None`.
    if any(value is None for value in row):
        return False
    return all(row[i] != "UNKNOWN" for i in unknown_checked_positions)

cleaned_data_rdd = rdd_of_features.filter(is_complete_row)
print(cleaned_data_rdd.count())

100000
90339


### Create Dataframe from RDD and get it ready for regression

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Rebuild a DataFrame from the cleaned RDD of lists; columns get the
# positional names _1 .. _7.
cleaned_data_df = spark.createDataFrame(cleaned_data_rdd)

#_1 = Crash_type, _2 = numUnits, _3 = weather, _4 = time, _5 = injury severity, _6 = longitude, _7 = latitude
print(cleaned_data_df.dtypes)
# All columns arrive as strings (see the first dtypes printout), so the
# numeric ones are cast to double before modelling.
numeric_cols = ["_2", "_6", "_7"]
for col_name in numeric_cols:
    cleaned_data_df = cleaned_data_df.withColumn(col_name, col(col_name).cast("double"))
print(cleaned_data_df.dtypes)

# Map each string column to a numeric "<column>_index" column.
# The indexers are passed to the Pipeline unfitted: Pipeline.fit() fits every
# stage itself, so fitting each StringIndexer here as well would scan the data
# an extra time per column for no benefit.
# NOTE(review): "_4" is the crash timestamp; indexing it as a category yields
# a very high-cardinality index (see the displayed _4_index values) - confirm
# this is the intended encoding.
string_cols = ["_1", "_3", "_4", "_5"]
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in string_cols]

pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(cleaned_data_df).transform(cleaned_data_df)
indexed_df.show(5)

[('_1', 'string'), ('_2', 'string'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'string'), ('_7', 'string')]
[('_1', 'string'), ('_2', 'double'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'double'), ('_7', 'double')]


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|                  _1| _2|   _3|                  _4|                  _5|           _6|          _7|_1_index|_3_index|_4_index|_5_index|
+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|INJURY AND / OR T...|2.0|CLEAR|2023-12-15T00:00:...|NO INDICATION OF ...|-87.732727335|41.780454051|     1.0|     0.0| 62438.0|     0.0|
|NO INJURY / DRIVE...|2.0|CLEAR|2023-12-14T23:50:...|NO INDICATION OF ...| -87.60416001| 41.70743645|     0.0|     0.0| 62437.0|     0.0|
|NO INJURY / DRIVE...|2.0|CLEAR|2023-12-14T23:00:...|NO INDICATION OF ...|-87.640582164|41.678551187|     0.0|     0.0| 62436.0|     0.0|
|INJURY AND / OR T...|2.0|CLEAR|2023-12-14T22:54:...|NO INDICATION OF ...|-87.649546727|41.877800803|     1.0|     0.0| 62435.0|     0.0|
|INJURY AND / OR T...|3.0|CLEAR|20

### Create Labeled Points and Normalize features

Partially adapted from the lab notebook.

In [0]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.mllib.regression import LabeledPoint

# List of feature column names.
# _2 = number of units, _6 = longitude; the *_index columns are the indexed
# crash type, weather, crash time and injury severity. _7 (latitude) is left
# out because it is used as the label below.
feature_column_names = ["_2", "_6", "_1_index", "_3_index", "_4_index", "_5_index"]

# Assemble the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

# Choose the scaler; both write their result to "scaledFeatures".
useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
else:
    # Apply StandardScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)

# Fit and apply the chosen scaler on the assembled frame. The StandardScaler
# branch previously referenced an undefined name `df`, which raised NameError
# as soon as useMinMaxScaler was switched off.
scalerModel = scaler.fit(vector_df)
scaledData = scalerModel.transform(vector_df)

# Convert to RDD of LabeledPoint
def to_labeled_point(row):
    """Build a LabeledPoint from one scaled row.

    The label is the row's '_7' value (latitude) and the features are the
    entries of its `scaledFeatures` vector, converted to a plain list.
    """
    features_list = row.scaledFeatures.toArray().tolist()
    return LabeledPoint(row['_7'], features_list)

normalizedSamplePoints = scaledData.rdd.map(to_labeled_point)

# Inspect the first point and record the feature dimensionality `d`.
firstPoint = normalizedSamplePoints.take(1)
firstPointFeatures =firstPoint[0].features
firstPointLabel = firstPoint[0].label
print (firstPointFeatures, firstPointLabel)
d = len(firstPointFeatures)
print(d)


In [0]:
# 80/20 train/validation split, seeded so the split is repeatable.
weights = [.8, .2]
seed = 42
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights, seed)

# Mark both splits for caching before they are counted and reused.
for split in (parsedTrainData, parsedValData):
    split.cache()
nTrain, nVal = parsedTrainData.count(), parsedValData.count()

# Sanity check: the two split sizes should add up to the full point count.
print(nTrain, nVal, nTrain + nVal)
print(normalizedSamplePoints.count())

7228 1833 9061
9061


### Create baseline using the average value

In [0]:
# Baseline prediction: the mean label (latitude) over the training split.
train_labels = parsedTrainData.map(lambda point: point.label)
averagelatitude = train_labels.mean()
print(averagelatitude)

41.845516966703784


In [0]:
import math
def squaredError(label, prediction):
    """Return the squared difference between `label` and `prediction`."""
    diff = label - prediction
    return diff * diff

def calcRMSE(labelsAndPreds):
    """Return the root-mean-square error of (label, prediction) pairs.

    `labelsAndPreds` is an RDD-like object whose elements are indexable pairs
    with the label at position 0 and the prediction at position 1. It must
    provide `map`, `sum` (on the mapped result) and `count`.
    """
    squared_errors = labelsAndPreds.map(lambda pair: squaredError(pair[0], pair[1]))
    total = squared_errors.sum()
    n_pairs = labelsAndPreds.count()
    return math.sqrt(total / n_pairs)

# Pair every label with the constant baseline prediction (the training mean).
labelsAndPredsTrain = parsedTrainData.map(lambda point: (point.label, averagelatitude))
labelsAndPredsVal = parsedValData.map(lambda point: (point.label, averagelatitude))

# Baseline RMSE on both splits; only the validation value is printed.
rmseTrainBase = calcRMSE(labelsAndPredsTrain)
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

0.0886549286702423


### Linear regression (SGD) with hyperparameter set one

In [0]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LinearRegressionWithSGD
# Hyperparameters for the first SGD linear-regression run
numIters = 500          # iterations
alpha = 1.0             # step
miniBatchFrac = 1.0     # miniBatchFraction
reg = 0.1               # regParam
regType = "l2"          # regType
useIntercept = True     # intercept

In [0]:
# Train the first SGD linear-regression model with the hyperparameters set in
# the preceding cell.
firstModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# Keep the learned coefficient vector and intercept of the first model.
weightsLR1, interceptLR1 = firstModel.weights, firstModel.intercept
print(weightsLR1, interceptLR1)



[4.772929777130594,0.02564015613890659,3.1043536870845547,0.7792756857158752,7.211021173135255,0.600584758917293] 33.890889504973664


In [0]:
# Spot-check the first model on up to five validation points.
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned instead of a fixed range(5):
# indexing with range(5) raises IndexError when fewer than five points exist.
for samplePoint in samplePoints:
    samplePrediction = firstModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.684586373, [0.14285714285714285,0.002655013578821437,1.0,0.0,0.9985930905111771,0.25]), LabeledPoint(41.943765168, [0.14285714285714285,0.0012215230483288526,0.0,0.0,0.9982804439581053,0.0]), LabeledPoint(41.887809278, [0.14285714285714285,0.001697362996742011,0.0,0.0,0.9979677974050336,0.0]), LabeledPoint(41.691361177, [0.14285714285714285,0.002358200528497731,0.0,0.0,0.9976551508519619,0.0]), LabeledPoint(41.756663578, [0.14285714285714285,0.0035114886705232906,0.0,0.0,0.24730342347975612,0.0])]
45.02818048679158
41.77138935414277
41.76914705383943
41.766909496903025
36.35613687401147


In [0]:
# Validation RMSE of the first linear model, printed below the baseline RMSE
# so the two can be compared directly.
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, firstModel.predict(point.features))
)
rmseValLR1 = calcRMSE(labelsAndPreds)

print(rmseValBase, rmseValLR1, sep="\n")

0.0886549286702423
4.568451011313926


### Linear regression (SGD) with hyperparameter set two

In [0]:
# Hyperparameters for the second SGD linear-regression run
# (more iterations and a smaller mini-batch fraction than the first run)
numIters = 1000         # iterations
alpha = 1.0             # step
miniBatchFrac = 0.3     # miniBatchFraction
reg = 0.1               # regParam
regType = "l2"          # regType
useIntercept = True     # intercept

In [0]:
# Train the second SGD linear-regression model with the hyperparameters set in
# the preceding cell.
secondModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR2 stores the second model's weights; interceptLR2 stores its intercept
weightsLR2, interceptLR2 = secondModel.weights, secondModel.intercept
print(weightsLR2, interceptLR2)

[4.769060324721782,0.027175287213863947,3.117732896312873,0.7837786946184394,7.275851076859432,0.6132815393077098] 33.84304851759033


In [0]:
# Validation RMSE of the second linear model, printed below the baseline RMSE
# so the two can be compared directly.
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, secondModel.predict(point.features))
)
rmseValLR2 = calcRMSE(labelsAndPreds)

print(rmseValBase, rmseValLR2, sep="\n")

0.0886549286702423
4.5970261240721815


### Random Forest Version One

In [0]:
from pyspark.mllib.tree import RandomForest
# Random-forest regressor, version one: 8 trees of depth <= 5.
# An empty categoricalFeaturesInfo treats every feature as continuous.
# The notebook's `seed` (defined with the train/validation split) is passed so
# the forest is reproducible across runs; without it the seed is left unset.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32,
                                      seed=seed)



In [0]:
# Spot-check the random forest on up to five validation points.
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned instead of a fixed range(5):
# indexing with range(5) raises IndexError when fewer than five points exist.
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.684586373, [0.14285714285714285,0.002655013578821437,1.0,0.0,0.9985930905111771,0.25]), LabeledPoint(41.943765168, [0.14285714285714285,0.0012215230483288526,0.0,0.0,0.9982804439581053,0.0]), LabeledPoint(41.887809278, [0.14285714285714285,0.001697362996742011,0.0,0.0,0.9979677974050336,0.0]), LabeledPoint(41.691361177, [0.14285714285714285,0.002358200528497731,0.0,0.0,0.9976551508519619,0.0]), LabeledPoint(41.756663578, [0.14285714285714285,0.0035114886705232906,0.0,0.0,0.24730342347975612,0.0])]
41.86367616492856
41.88191544626116
41.88191544626116
41.87520171992061
41.793095420063366


In [0]:
import numpy as np
# Collect validation labels and the forest's predictions to the driver, then
# compute the RMSE with NumPy.
labels = parsedValData.map(lambda point: point.label).collect()
val_features = parsedValData.map(lambda point: point.features)
predictions = thirdModel.predict(val_features).collect()
residuals = np.array(predictions) - np.array(labels)
rmseDT1 = np.sqrt(np.mean(residuals ** 2))

# Baseline RMSE first, then the random-forest RMSE.
print(rmseValBase, rmseDT1, sep="\n")

0.0886549286702423
0.16161155340247219


### Random Forest Version Two

In [0]:
# Random-forest regressor, version two: 10 trees of depth <= 6.
# Note: this rebinds `thirdModel`, replacing the version-one forest; the cells
# that follow evaluate this new model.
# The notebook's `seed` (defined with the train/validation split) is passed so
# the forest is reproducible across runs; without it the seed is left unset.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=10, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=6, maxBins=32,
                                      seed=seed)

In [0]:
# Spot-check the current random forest on up to five validation points.
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned instead of a fixed range(5):
# indexing with range(5) raises IndexError when fewer than five points exist.
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.684586373, [0.14285714285714285,0.002655013578821437,1.0,0.0,0.9985930905111771,0.25]), LabeledPoint(41.943765168, [0.14285714285714285,0.0012215230483288526,0.0,0.0,0.9982804439581053,0.0]), LabeledPoint(41.887809278, [0.14285714285714285,0.001697362996742011,0.0,0.0,0.9979677974050336,0.0]), LabeledPoint(41.691361177, [0.14285714285714285,0.002358200528497731,0.0,0.0,0.9976551508519619,0.0]), LabeledPoint(41.756663578, [0.14285714285714285,0.0035114886705232906,0.0,0.0,0.24730342347975612,0.0])]
41.86841375533434
41.927633602641414
41.91073297121619
41.87609034877359
41.77433458339947


In [0]:
# Collect validation labels and the current forest's predictions to the
# driver, then compute the RMSE with NumPy.
labels = parsedValData.map(lambda point: point.label).collect()
val_features = parsedValData.map(lambda point: point.features)
predictions = thirdModel.predict(val_features).collect()
residuals = np.array(predictions) - np.array(labels)
rmseDT2 = np.sqrt(np.mean(residuals ** 2))

# Baseline RMSE first, then the random-forest RMSE.
print(rmseValBase, rmseDT2, sep="\n")

0.0886549286702423
0.12272992713706508
