## Traffic Crash Analysis

### Data importing and pre-processing

In [0]:
%pip install sodapy
dbutils.library.restartPython()

Python interpreter will be restarted.
Collecting sodapy
  Downloading sodapy-2.2.0-py2.py3-none-any.whl (15 kB)
Collecting requests>=2.28.1
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
Installing collected packages: requests, sodapy
  Attempting uninstall: requests
    Found existing installation: requests 2.27.1
    Not uninstalling requests at /databricks/python3/lib/python3.9/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-1c4fed61-24d6-4e48-a8ab-e137e5b3bfe9
    Can't uninstall 'requests'. No files were found to uninstall.
Successfully installed requests-2.31.0 sodapy-2.2.0
Python interpreter will be restarted.


In [0]:
import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cityofchicago.org", None)

# Specify the columns you want to retrieve
desired_columns = "crash_record_id,crash_date,crash_type,num_units,weather_condition,most_severe_injury,latitude,longitude"

results = client.get("85ca-t3if", select=desired_columns, limit=800000, timeout=60)



[0;31m---------------------------------------------------------------------------[0m
[0;31mHTTPError[0m                                 Traceback (most recent call last)
File [0;32m<command-2915852555945579>:11[0m
[1;32m      8[0m [38;5;66;03m# Specify the columns you want to retrieve[39;00m
[1;32m      9[0m desired_columns [38;5;241m=[39m [38;5;124m"[39m[38;5;124mcrash_record_id,crash_date,crash_type,num_units,weather_condition,most_severe_injury,latitude,longitude[39m[38;5;124m"[39m
[0;32m---> 11[0m results [38;5;241m=[39m client[38;5;241m.[39mget([38;5;124m"[39m[38;5;124m85ca-t3if[39m[38;5;124m"[39m, select[38;5;241m=[39mdesired_columns, limit[38;5;241m=[39m[38;5;241m800000[39m, timeout[38;5;241m=[39m[38;5;241m60[39m)

File [0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-1c4fed61-24d6-4e48-a8ab-e137e5b3bfe9/lib/python3.9/site-packages/sodapy/socrata.py:412[0m, in [0;36mSocrata.get[0;34m(self, dataset_identifier, content_type, **kwargs)

In [0]:
# Convert the results to a Spark DataFrame
df2 = spark.createDataFrame(results)
print(df2.count())
# Show the first few rows of the DataFrame
df2.show(5)

784815
+--------------------+----------------+--------------------+------------------+----------------------+--------------------+-----------------+------------------+--------------------+--------------------+--------+------------------+--------------------+-----------+-----------+--------------------+----------------------+------------------+-------------+-------------+--------------------+-----------------------+----------------------+---------+----------------+-----------+------------------+--------------+------------------+---------+-----------+--------------+-----------------+---------+--------------------+--------------+--------------+-----------------------+---------------------------+-----------------------------+----------------------+----------------+----------+-----------------+-----------+------------+-------------+--------------------+
|     CRASH_RECORD_ID|CRASH_DATE_EST_I|          CRASH_DATE|POSTED_SPEED_LIMIT|TRAFFIC_CONTROL_DEVICE|    DEVICE_CONDITION|WEATHER_CONDITIO

### Create RDD of wanted features

In [0]:
wanted_columns = df2.select("Crash_type","num_units","Weather_condition","Crash_date","Most_severe_injury","Longitude","Latitude")
wanted_columns.show(1)
rdd_of_features = wanted_columns.rdd.map(lambda row:[row[0],row[1],row[2],row[3],row[4],row[5],row[6]])


+--------------------+---------+-----------------+--------------------+--------------------+---------+--------+
|          Crash_type|num_units|Weather_condition|          Crash_date|  Most_severe_injury|Longitude|Latitude|
+--------------------+---------+-----------------+--------------------+--------------------+---------+--------+
|INJURY AND / OR T...|        2|            CLEAR|08/18/2023 12:50:...|NONINCAPACITATING...|     null|    null|
+--------------------+---------+-----------------+--------------------+--------------------+---------+--------+
only showing top 1 row



### Remove all rows where the content of one of the fields is unknown

In [0]:
print(rdd_of_features.count())
#row[0] = Crash_type, row[2] = Weather_condition,  row[4]= Most_severe_injury
cleaned_data_rdd = rdd_of_features.filter(lambda row: row[0]!="UNKNOWN"  and row[2]!="UNKNOWN"  and row[4]!="UNKNOWN" and row[5] != None and row[6] != None and row[0] != None and row[1] != None and row[2] != None and row[3] != None and row[4] != None)
print(cleaned_data_rdd.count())

784815
735689


### Create Dataframe from RDD and get it ready for regression

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

cleaned_data_df = spark.createDataFrame(cleaned_data_rdd)

#_1 = Crash_type, _2 = numUnits, _3 = weather, _4 = time, _5 = injury severity, _6 = longitude, _7 = latitude
print(cleaned_data_df.dtypes)
numeric_cols = ["_2", "_6", "_7"]
for col_name in numeric_cols:    
    cleaned_data_df = cleaned_data_df.withColumn(col_name, col(col_name).cast("double"))
print(cleaned_data_df.dtypes)

string_cols = ["_1", "_3", "_4", "_5"]
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(cleaned_data_df) for column in string_cols ]

pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(cleaned_data_df).transform(cleaned_data_df)
indexed_df.show(5)

[('_1', 'string'), ('_2', 'bigint'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'double'), ('_7', 'double')]
[('_1', 'string'), ('_2', 'double'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'double'), ('_7', 'double')]
+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|                  _1| _2|   _3|                  _4|                  _5|           _6|          _7|_1_index|_3_index|_4_index|_5_index|
+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|NO INJURY / DRIVE...|4.0|CLEAR|07/29/2023 02:45:...|NO INDICATION OF ...|-87.665902343|41.854120263|     0.0|     0.0|  9668.0|     0.0|
|INJURY AND / OR T...|2.0|CLEAR|08/18/2023 05:58:...|NONINCAPACITATING...|-87.761883497|41.942975745|     1.0|     0.0|354021.0|     1.0|
|NO INJURY / DRIVE...|2.0|CLEAR|07/29/2023 12:50:...|NO 

# Latitude

### Create Labeled Points and Normalize features

In [0]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.mllib.regression import LabeledPoint

feature_column_names = ["_2", "_6", "_1_index", "_3_index", "_4_index", "_5_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_7"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

[Row(_1='NO INJURY / DRIVE AWAY', _2=4.0, _3='CLEAR', _4='07/29/2023 02:45:00 PM', _5='NO INDICATION OF INJURY', _6=-87.665902343, _7=41.854120263, _1_index=0.0, _3_index=0.0, _4_index=9668.0, _5_index=0.0, features=DenseVector([4.0, -87.6659, 0.0, 0.0, 9668.0, 0.0]), scaledFeatures=DenseVector([0.1765, 0.0031, 0.0, 0.0, 0.0195, 0.0]), label=41.854120263)]
[0.1764705882352941,0.0030737128245124593,0.0,0.0,0.01951916787130432,0.0] 41.854120263


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2915852555945588>:26[0m
[1;32m     24[0m [38;5;28mprint[39m(firstPoint)
[1;32m     25[0m [38;5;28mprint[39m (firstPoint[[38;5;241m0[39m][38;5;241m.[39mscaledFeatures, firstPoint[[38;5;241m0[39m][38;5;241m.[39mlabel)
[0;32m---> 26[0m [38;5;28mprint[39m([38;5;28mlen[39m(firstPointFeatures))

[0;31mNameError[0m: name 'firstPointFeatures' is not defined

In [0]:
def to_labeled_point(row):
    # Convert the DenseVector to a list for features
    features_list = row.scaledFeatures.toArray().tolist()
    return LabeledPoint(row['label'], features_list)

weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

588745 146944 735689


### Create baseline using the average value

In [0]:
averagelatitude = (parsedTrainData.map(lambda s: s.label)).mean()
print(averagelatitude)

41.85411854360393


In [0]:
import math
def squaredError(label, prediction):
    sqrError = (label-prediction)*(label-prediction)
    return sqrError

def calcRMSE(labelsAndPreds):
    sqrSum = labelsAndPreds.map(lambda s: squaredError(s[0],s[1])).sum()
    return math.sqrt(sqrSum/labelsAndPreds.count())

labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagelatitude))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagelatitude))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

0.3557268831871511


### Random Forest Version One

In [0]:
from pyspark.mllib.tree import RandomForest
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)



In [0]:
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = thirdModel.predict(samplePoints[i].features)
    print(samplePrediction)

[LabeledPoint(41.976201139, [0.0,0.00035120717607830393,0.0,0.0,0.09826612935789449,0.0]), LabeledPoint(41.994793386, [0.0,0.0006082283665866339,0.0,0.0,0.1666155200400559,0.0]), LabeledPoint(41.994793386, [0.0,0.0006082283665866339,0.0,0.0,0.6761610957222577,0.0]), LabeledPoint(41.977955465, [0.0,0.0010220258119903588,0.0,0.0,0.260678204268859,0.0]), LabeledPoint(41.952142587, [0.0,0.0011301226112881355,0.0,0.0,0.8405497185589573,0.0])]
41.88699044150097
41.88699044150097
41.88311287092069
41.88283808646855
41.88311287092069


In [0]:
import numpy as np
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

0.3557268831871511
0.3528108420753974


### Random Forest Version Two

In [0]:
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=10, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=6, maxBins=32)

In [0]:
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = thirdModel.predict(samplePoints[i].features)
    print(samplePrediction)

[LabeledPoint(41.976201139, [0.0,0.00035120717607830393,0.0,0.0,0.09826612935789449,0.0]), LabeledPoint(41.994793386, [0.0,0.0006082283665866339,0.0,0.0,0.1666155200400559,0.0]), LabeledPoint(41.994793386, [0.0,0.0006082283665866339,0.0,0.0,0.6761610957222577,0.0]), LabeledPoint(41.977955465, [0.0,0.0010220258119903588,0.0,0.0,0.260678204268859,0.0]), LabeledPoint(41.952142587, [0.0,0.0011301226112881355,0.0,0.0,0.8405497185589573,0.0])]
41.893968146825465
41.893968146825465
41.90063402583679
41.897134107364536
41.90186023035237


In [0]:
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

0.3557268831871511
0.3507406269653482


# Longitude

## Create Labeled Points and Normalize

In [0]:
feature_column_names = ["_2", "_7", "_1_index", "_3_index", "_4_index", "_5_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_6"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

[Row(_1='NO INJURY / DRIVE AWAY', _2=4.0, _3='CLEAR', _4='07/29/2023 02:45:00 PM', _5='NO INDICATION OF INJURY', _6=-87.665902343, _7=41.854120263, _1_index=0.0, _3_index=0.0, _4_index=9668.0, _5_index=0.0, features=DenseVector([4.0, 41.8541, 0.0, 0.0, 9668.0, 0.0]), scaledFeatures=DenseVector([0.1765, 0.996, 0.0, 0.0, 0.0195, 0.0]), label=-87.665902343)]
[0.1764705882352941,0.995986472133498,0.0,0.0,0.01951916787130432,0.0] -87.665902343


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2915852555945612>:23[0m
[1;32m     21[0m [38;5;28mprint[39m(firstPoint)
[1;32m     22[0m [38;5;28mprint[39m (firstPoint[[38;5;241m0[39m][38;5;241m.[39mscaledFeatures, firstPoint[[38;5;241m0[39m][38;5;241m.[39mlabel)
[0;32m---> 23[0m [38;5;28mprint[39m([38;5;28mlen[39m(firstPointFeatures))

[0;31mNameError[0m: name 'firstPointFeatures' is not defined

In [0]:
weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

588745 146944 735689


## Create Baseline

In [0]:
averagelongitude = (parsedTrainData.map(lambda s: s.label)).mean()
print(averagelongitude)

-87.67353957861799


In [0]:
labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagelatitude))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagelatitude))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

129.5279791671811


## Random Forest Version One

In [0]:
longitudeOne = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = longitudeOne.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = longitudeOne.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

[LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.0318730971435955,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.08369337866539608,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.21620890435850018,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.4414788374102579,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.7866620365509946,0.0])]
-87.55268607925034
-87.55268607925034
-87.53376281848195
-87.48414517085668
-87.48414517085668
129.5279791671811
0.7933934493548133


## Random Forest Version Two

In [0]:
longitudeTwo = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=10, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=6, maxBins=32)

samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = longitudeTwo.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = longitudeTwo.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

[LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.0318730971435955,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.08369337866539608,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.21620890435850018,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.4414788374102579,0.0]), LabeledPoint(0.0, [0.058823529411764705,0.0,0.0,0.0,0.7866620365509946,0.0])]
-87.49483083634316
-87.54332104844782
-87.54998009262786
-87.5416644386359
-87.55762057431751
129.5279791671811
0.7933588810407468


# Type of Crash

## Create Labeled Points and Normalize

In [0]:
feature_column_names = ["_2", "_7", "_6", "_3_index", "_4_index", "_5_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_1_index"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

[Row(_1='NO INJURY / DRIVE AWAY', _2=4.0, _3='CLEAR', _4='07/29/2023 02:45:00 PM', _5='NO INDICATION OF INJURY', _6=-87.665902343, _7=41.854120263, _1_index=0.0, _3_index=0.0, _4_index=9668.0, _5_index=0.0, features=DenseVector([4.0, 41.8541, -87.6659, 0.0, 9668.0, 0.0]), scaledFeatures=DenseVector([0.1765, 0.996, 0.0031, 0.0, 0.0195, 0.0]), label=0.0)]
[0.1764705882352941,0.995986472133498,0.0030737128245124593,0.0,0.01951916787130432,0.0] 0.0


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2915852555945623>:23[0m
[1;32m     21[0m [38;5;28mprint[39m(firstPoint)
[1;32m     22[0m [38;5;28mprint[39m (firstPoint[[38;5;241m0[39m][38;5;241m.[39mscaledFeatures, firstPoint[[38;5;241m0[39m][38;5;241m.[39mlabel)
[0;32m---> 23[0m [38;5;28mprint[39m([38;5;28mlen[39m(firstPointFeatures))

[0;31mNameError[0m: name 'firstPointFeatures' is not defined

In [0]:
weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

588745 146944 735689


## Create Baseline

In [0]:
averagetype = (parsedTrainData.map(lambda s: s.label)).mean()
print(averagetype)

0.2749730358644238


In [0]:
labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagelatitude))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagelatitude))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

41.58225778891926


## Random Forest Version One

In [0]:
typeOne = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = typeOne.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = typeOne.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

[LabeledPoint(1.0, [0.0,0.991134582570875,0.0044363947076390825,0.0,0.23895030970628378,0.0]), LabeledPoint(0.0, [0.0,0.9912085289164112,0.0036286714981203774,0.0,0.07289807554087559,0.0]), LabeledPoint(1.0, [0.0,0.9912185808692187,0.004632042556646592,0.2,0.31060673358798974,0.0]), LabeledPoint(0.0, [0.0,0.9912581380571414,0.00439508509576857,0.1,0.8348825377340968,0.0]), LabeledPoint(0.0, [0.0,0.9912825164062985,0.0036586329271032776,0.0,0.26806148901289706,0.0])]
0.30061708627288575
0.29565314114612884
0.5635406993969293
0.5734715988130848
0.449712152625503
41.58225778891926
0.3207803221909808


## Random Forest Version Two

In [0]:
typeTwo = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = typeTwo.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = typeTwo.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

[LabeledPoint(1.0, [0.0,0.991134582570875,0.0044363947076390825,0.0,0.23895030970628378,0.0]), LabeledPoint(0.0, [0.0,0.9912085289164112,0.0036286714981203774,0.0,0.07289807554087559,0.0]), LabeledPoint(1.0, [0.0,0.9912185808692187,0.004632042556646592,0.2,0.31060673358798974,0.0]), LabeledPoint(0.0, [0.0,0.9912581380571414,0.00439508509576857,0.1,0.8348825377340968,0.0]), LabeledPoint(0.0, [0.0,0.9912825164062985,0.0036586329271032776,0.0,0.26806148901289706,0.0])]
0.2980893301573213
0.29653073331398383
0.4513659525869353
0.46016532504909846
0.47242738036928256
41.58225778891926
0.3281357779519216


# Weather

## Create Labeled Points and Normalize

In [0]:
feature_column_names = ["_2", "_7", "_6", "_1_index", "_4_index", "_5_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_3_index"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

[Row(_1='NO INJURY / DRIVE AWAY', _2=4.0, _3='CLEAR', _4='07/29/2023 02:45:00 PM', _5='NO INDICATION OF INJURY', _6=-87.665902343, _7=41.854120263, _1_index=0.0, _3_index=0.0, _4_index=9668.0, _5_index=0.0, features=DenseVector([4.0, 41.8541, -87.6659, 0.0, 9668.0, 0.0]), scaledFeatures=DenseVector([0.1765, 0.996, 0.0031, 0.0, 0.0195, 0.0]), label=0.0)]
[0.1764705882352941,0.995986472133498,0.0030737128245124593,0.0,0.01951916787130432,0.0] 0.0


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2915852555945630>:23[0m
[1;32m     21[0m [38;5;28mprint[39m(firstPoint)
[1;32m     22[0m [38;5;28mprint[39m (firstPoint[[38;5;241m0[39m][38;5;241m.[39mscaledFeatures, firstPoint[[38;5;241m0[39m][38;5;241m.[39mlabel)
[0;32m---> 23[0m [38;5;28mprint[39m([38;5;28mlen[39m(firstPointFeatures))

[0;31mNameError[0m: name 'firstPointFeatures' is not defined

In [0]:
weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

588745 146944 735689


## Create Baseline

In [0]:
averageweather = (parsedTrainData.map(lambda s: s.label)).mean()
print(averageweather)

0.3019133920457928


In [0]:
labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagelatitude))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagelatitude))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

41.56084745132242


## Random Forest Version One

In [0]:
weatherOne = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = weatherOne.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = weatherOne.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

[LabeledPoint(0.0, [0.0,0.991134582570875,0.0044363947076390825,1.0,0.23895030970628378,0.0]), LabeledPoint(0.0, [0.0,0.9912085289164112,0.0036286714981203774,0.0,0.07289807554087559,0.0]), LabeledPoint(2.0, [0.0,0.9912185808692187,0.004632042556646592,1.0,0.31060673358798974,0.0]), LabeledPoint(1.0, [0.0,0.9912581380571414,0.00439508509576857,0.0,0.8348825377340968,0.0]), LabeledPoint(0.0, [0.0,0.9912825164062985,0.0036586329271032776,0.0,0.26806148901289706,0.0])]
0.43602195203080163
0.39302928882855515
0.7120867050423874
0.30775897044345824
0.5439281077097404
41.56084745132242
0.8188218608003667


## Random Forest Version Two

In [0]:
weatherTwo = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = weatherTwo.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = weatherTwo.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

[LabeledPoint(0.0, [0.0,0.991134582570875,0.0044363947076390825,1.0,0.23895030970628378,0.0]), LabeledPoint(0.0, [0.0,0.9912085289164112,0.0036286714981203774,0.0,0.07289807554087559,0.0]), LabeledPoint(2.0, [0.0,0.9912185808692187,0.004632042556646592,1.0,0.31060673358798974,0.0]), LabeledPoint(1.0, [0.0,0.9912581380571414,0.00439508509576857,0.0,0.8348825377340968,0.0]), LabeledPoint(0.0, [0.0,0.9912825164062985,0.0036586329271032776,0.0,0.26806148901289706,0.0])]
0.40712059261811023
0.3484273402624314
0.6326541867915426
0.36246482545027275
0.47074187778286314
41.56084745132242
0.8185131741214071


# Injury Severity

## Create Laeled Points and Normalize

In [0]:
feature_column_names = ["_2", "_7", "_6", "_1_index", "_3_index", "_5_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_4_index"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

[Row(_1='NO INJURY / DRIVE AWAY', _2=4.0, _3='CLEAR', _4='07/29/2023 02:45:00 PM', _5='NO INDICATION OF INJURY', _6=-87.665902343, _7=41.854120263, _1_index=0.0, _3_index=0.0, _4_index=9668.0, _5_index=0.0, features=DenseVector([4.0, 41.8541, -87.6659, 0.0, 0.0, 0.0]), scaledFeatures=DenseVector([0.1765, 0.996, 0.0031, 0.0, 0.0, 0.0]), label=9668.0)]
[0.1764705882352941,0.995986472133498,0.0030737128245124593,0.0,0.0,0.0] 9668.0


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2915852555945637>:23[0m
[1;32m     21[0m [38;5;28mprint[39m(firstPoint)
[1;32m     22[0m [38;5;28mprint[39m (firstPoint[[38;5;241m0[39m][38;5;241m.[39mscaledFeatures, firstPoint[[38;5;241m0[39m][38;5;241m.[39mlabel)
[0;32m---> 23[0m [38;5;28mprint[39m([38;5;28mlen[39m(firstPointFeatures))

[0;31mNameError[0m: name 'firstPointFeatures' is not defined

In [0]:
weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

## Create Baseline

In [0]:
averageinjury = (parsedTrainData.map(lambda s: s.label)).mean()
print(averageinjury)

In [0]:
labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagelatitude))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagelatitude))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

## Random Forest Version One

In [0]:
injuryOne = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = injuryOne.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = injuryOne.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

## Random Forest Version Two

In [0]:
injuryTwo = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = injuryTwo.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = injuryTwo.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

# Number of Units

## Create Labeled Points and Normalize

In [0]:
feature_column_names = ["_7", "_6", "_1_index", "_3_index", "_5_index","_4_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_2"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

In [0]:
weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

## Create Baseline

In [0]:
averagenumber = (parsedTrainData.map(lambda s: s.label)).mean()
print(averagenumber)

In [0]:
labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagenumber))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagenumber))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

## Random Forest Version One

In [0]:
numberOne = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = numberOne.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = numberOne.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

## Random Forest Version Two

In [0]:
numberTwo = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = numberTwo.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = numberTwo.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

# Time of Year

## Create Labeled Points and Normalize

In [0]:
feature_column_names = ["_7", "_6", "_1_index", "_3_index", "_2","_4_index"]

# Assemble numeric columns into a feature vector
assembler = VectorAssembler(inputCols=feature_column_names, outputCol="features")
vector_df = assembler.transform(indexed_df)

useMinMaxScaler=True
if (useMinMaxScaler):
    # Apply MinMaxScaler
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)
else:
    # Apply StandarScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    scalerModel = scaler.fit(vector_df)
    scaledData = scalerModel.transform(vector_df)

normalizedSamplePoints = scaledData.withColumn("label", col("_5_index"))
firstPoint = normalizedSamplePoints.take(1)
print(firstPoint)
print (firstPoint[0].scaledFeatures, firstPoint[0].label)

In [0]:
weights = [.8, .2] # train/test split
seed = 42

# List of column names that are not needed in df, performance optimization for conversion to rdd later
drop_columns = ["_1", "_2", "_3", "_4", "_5", "_6","_7","_1_index","_3_index","_4_index","_5_index","features"]
normalizedSamplePoints=normalizedSamplePoints.drop(*drop_columns)
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights,seed)
parsedTrainData=parsedTrainData.rdd.map(to_labeled_point)
parsedValData=parsedValData.rdd.map(to_labeled_point)

parsedTrainData.cache()
parsedValData.cache()

nTrain = parsedTrainData.count()
nVal = parsedValData.count()

print(nTrain, nVal, nTrain + nVal)

## Create Baseline

In [0]:
averagetime = (parsedTrainData.map(lambda s: s.label)).mean()
print(averagetime)

In [0]:
labelsAndPredsTrain = parsedTrainData.map(lambda s: (s.label,averagetime))
rmseTrainBase = calcRMSE(labelsAndPredsTrain)

labelsAndPredsVal = parsedValData.map(lambda s: (s.label,averagetime))
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

## Random Forest Version One

In [0]:
timeOne = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = timeOne.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = timeOne.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

## Random Forest Version Two

In [0]:
timeTwo = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32)
samplePoints = parsedValData.take(5)
print(samplePoints)
for i in range(5):
    samplePrediction = timeTwo.predict(samplePoints[i].features)
    print(samplePrediction)
labels = parsedValData.map(lambda x: x.label).collect()
predictions = timeTwo.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)