## Traffic Crash Analysis

### Data importing and pre-processing

In [0]:
import requests
from pyspark.sql import SparkSession

# Make the GET request
resp = requests.get('https://data.cityofchicago.org/resource/85ca-t3if.json?$query=SELECT%20crash_record_id%2C%20crash_date_est_i%2C%20crash_date%2C%20posted_speed_limit%2C%20traffic_control_device%2C%20device_condition%2C%20weather_condition%2C%20lighting_condition%2C%20first_crash_type%2C%20trafficway_type%2C%20lane_cnt%2C%20alignment%2C%20roadway_surface_cond%2C%20road_defect%2C%20report_type%2C%20crash_type%2C%20intersection_related_i%2C%20private_property_i%2C%20hit_and_run_i%2C%20damage%2C%20date_police_notified%2C%20prim_contributory_cause%2C%20sec_contributory_cause%2C%20street_no%2C%20street_direction%2C%20street_name%2C%20beat_of_occurrence%2C%20photos_taken_i%2C%20statements_taken_i%2C%20dooring_i%2C%20work_zone_i%2C%20work_zone_type%2C%20workers_present_i%2C%20num_units%2C%20most_severe_injury%2C%20injuries_total%2C%20injuries_fatal%2C%20injuries_incapacitating%2C%20injuries_non_incapacitating%2C%20injuries_reported_not_evident%2C%20injuries_no_indication%2C%20injuries_unknown%2C%20crash_hour%2C%20crash_day_of_week%2C%20crash_month%2C%20latitude%2C%20longitude%2C%20location%20ORDER%20BY%20crash_date%20DESC%2C%20crash_record_id%20ASC')

# Create a Spark session
spark = SparkSession.builder.appName("SENG550").getOrCreate()

# Create a Spark DataFrame from the response text
df2 = spark.read.json(spark.sparkContext.parallelize([resp.text]))


In [0]:
# Show the DataFrame
df2.show(5)

+------------------+------------------+--------------------+----------------+-----------------+----------+-----------+--------------------+--------------------+-------------+--------------------+--------------------+---------+--------------------+-------------+--------------+-----------------------+----------------------+---------------------------+-----------------------------+--------------+----------------+----------------------+------------+--------------------+--------------------+-------------+--------------------+---------+--------------+------------------+-----------------------+------------------+--------------------+-----------+--------------------+----------------------+------------------+----------------+-----------+---------+----------------------+--------------------+-----------------+-----------+--------------+
|         alignment|beat_of_occurrence|          crash_date|crash_date_est_i|crash_day_of_week|crash_hour|crash_month|     crash_record_id|          crash_type|  

### Extract Weather Condition

In [0]:
feature_weather = df2.select("Weather_condition")
# Extract the values from the selected column using rdd.map
rdd_from_weather = feature_weather.rdd.map(lambda row: row[0])

# Display the top 10 records
print("Top 10 Records from Weather condition:")
print(rdd_from_weather.take(10))

Top 10 Records from Weather condition:
['CLEAR', 'CLEAR', 'CLEAR', 'CLEAR', 'CLEAR', 'CLEAR', 'CLEAR', 'UNKNOWN', 'UNKNOWN', 'CLEAR']


### Extract Crash Type

In [0]:
feature_crashType = df2.select("Crash_type")
# Extract the values from the selected column using rdd.map
rdd_from_crashType = feature_crashType.rdd.map(lambda row: row[0])

# Display the top 10 records
print("Top 10 Records from Crash Type:")
print(rdd_from_crashType.take(10))

Top 10 Records from Crash Type:
['NO INJURY / DRIVE AWAY', 'INJURY AND / OR TOW DUE TO CRASH', 'INJURY AND / OR TOW DUE TO CRASH', 'NO INJURY / DRIVE AWAY', 'NO INJURY / DRIVE AWAY', 'INJURY AND / OR TOW DUE TO CRASH', 'INJURY AND / OR TOW DUE TO CRASH', 'INJURY AND / OR TOW DUE TO CRASH', 'NO INJURY / DRIVE AWAY', 'INJURY AND / OR TOW DUE TO CRASH']


### Extract Lighting Condition and Time of Year Crash Occurred

In [0]:
feature_crashDate = df2.select("Crash_date")
# Extract the values from the selected column using rdd.map
rdd_from_crashDate = feature_crashDate.rdd.map(lambda row: row[0])

feature_lighting = df2.select("Lighting_condition")
rdd_from_lighting = feature_lighting.rdd.map(lambda row: row[0]) 

# Display the top 10 records
print("First 10 Records from Crash Date and Time:")
print(rdd_from_crashDate.take(10))

print("First 10 Records from Lighting Condition:")
print(rdd_from_lighting.take(10))

First 10 Records from Crash Date and Time:
['2023-11-26T22:50:00.000', '2023-11-26T21:46:00.000', '2023-11-26T21:25:00.000', '2023-11-26T21:00:00.000', '2023-11-26T21:00:00.000', '2023-11-26T20:40:00.000', '2023-11-26T20:33:00.000', '2023-11-26T20:30:00.000', '2023-11-26T20:20:00.000', '2023-11-26T20:14:00.000']
First 10 Records from Lighting Condition:
['DARKNESS, LIGHTED ROAD', 'UNKNOWN', 'DARKNESS, LIGHTED ROAD', 'DARKNESS, LIGHTED ROAD', 'DARKNESS, LIGHTED ROAD', 'DARKNESS, LIGHTED ROAD', 'DARKNESS, LIGHTED ROAD', 'DARKNESS, LIGHTED ROAD', 'UNKNOWN', 'DARKNESS, LIGHTED ROAD']


### Extract Longitude and Latitude

In [0]:
feature_longitude = df2.select("Longitude")
# Extract the values from the selected column using rdd.map
rdd_from_longitude = feature_longitude.rdd.map(lambda row: row[0])

feature_latitude = df2.select("Latitude")
rdd_from_latitude = feature_latitude.rdd.map(lambda row: row[0]) 

# Display the first 10 records
print("First 10 Records from Longitude:")
print(rdd_from_longitude.take(10))

print("First 10 Records from Latitude:")
print(rdd_from_latitude.take(10))

First 10 Records from Longitude:
['-87.715260845', '-87.597853029', '-87.585086454', '-87.630845188', '-87.63285766', '-87.633356364', '-87.633988101', '-87.762641644', '-87.527491945', '-87.726071484']
First 10 Records from Latitude:
['41.767951147', '41.659434298', '41.737030992', '41.748908764', '41.896678875', '41.885840747', '41.716236015', '41.892834466', '41.65170129', '41.889771263']


### Create RDD of wanted features

In [0]:
wanted_columns = df2.select("Crash_type","num_units","Weather_condition","Crash_date","Most_severe_injury","Longitude","Latitude")
wanted_columns.show(1)
rdd_of_features = wanted_columns.rdd.map(lambda row:[row[0],row[1],row[2],row[3],row[4],row[5],row[6]])


+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
|          Crash_type|num_units|Weather_condition|          Crash_date|  Most_severe_injury|    Longitude|    Latitude|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
|NO INJURY / DRIVE...|        2|            CLEAR|2023-11-26T22:50:...|NO INDICATION OF ...|-87.715260845|41.767951147|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
only showing top 1 row



### Remove all rows where the content of one of the fields is unknown

In [0]:
print(rdd_of_features.count())
#row[0] = Crash_type, row[2] = Weather_condition,  row[4]= Most_severe_injury
cleaned_data_rdd = rdd_of_features.filter(lambda row: row[0]!="UNKNOWN"  and row[2]!="UNKNOWN"  and row[4]!="UNKNOWN" and row[5] != None and row[6] != None and row[0] != None and row[1] != None and row[2] != None and row[3] != None and row[4] != None)
print(cleaned_data_rdd.count())

1000
892


### Create Dataframe from RDD and get it ready for regression

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

cleaned_data_df = spark.createDataFrame(cleaned_data_rdd)

#_1 = Crash_type, _2 = numUnits, _3 = weather, _4 = time, _5 = injury severity, _6 = longitude, _7 = latitude
print(cleaned_data_df.dtypes)
numeric_cols = ["_2", "_6", "_7"]
for col_name in numeric_cols:    
    cleaned_data_df = cleaned_data_df.withColumn(col_name, col(col_name).cast("double"))
print(cleaned_data_df.dtypes)

string_cols = ["_1", "_3", "_4", "_5"]
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(cleaned_data_df) for column in string_cols ]

pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(cleaned_data_df).transform(cleaned_data_df)
indexed_df.show(5)

[('_1', 'string'), ('_2', 'string'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'string'), ('_7', 'string')]
[('_1', 'string'), ('_2', 'double'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'double'), ('_7', 'double')]


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|                  _1| _2|   _3|                  _4|                  _5|           _6|          _7|_1_index|_3_index|_4_index|_5_index|
+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|NO INJURY / DRIVE...|2.0|CLEAR|2023-11-26T22:50:...|NO INDICATION OF ...|-87.715260845|41.767951147|     0.0|     0.0|   705.0|     0.0|
|INJURY AND / OR T...|2.0|CLEAR|2023-11-26T21:46:...|NONINCAPACITATING...|-87.597853029|41.659434298|     1.0|     0.0|   704.0|     1.0|
|INJURY AND / OR T...|2.0|CLEAR|2023-11-26T21:25:...|INCAPACITATING IN...|-87.585086454|41.737030992|     1.0|     0.0|   703.0|     3.0|
|NO INJURY / DRIVE...|2.0|CLEAR|2023-11-26T21:00:...|NO INDICATION OF ...|-87.630845188|41.748908764|     0.0|     0.0|   118.0|     0.0|
|NO INJURY / DRIVE...|2.0|CLEAR|20

### Apply linear regression

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

feature_assembler = VectorAssembler(inputCols = ["_1_index","_2","_3_index","_4_index","_5_index"], outputCol = "Independent Features")

output = feature_assembler.transform(indexed_df)
#output.select("Independent Features").show()

finalised_data = output.select("Independent Features", "_6")

train_data, test_data = finalised_data.randomSplit([0.8, 0.2])

regressor = LinearRegression(featuresCol = 'Independent Features', labelCol = '_6')
trained_model = regressor.fit(train_data)

results = trained_model.evaluate(train_data)
print(results.r2) #closer to 1, better it is
print(results.meanSquaredError)
print(results.meanAbsoluteError)

predictions = trained_model.transform(test_data)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

0.01601063944292369
0.003598011485886745
0.04888741395890051


In [0]:
print ("Longitude actual vs predicted")
pred = predictions.select("prediction")
act = finalised_data.select("_6").withColumnRenamed("_6","actual")
act_vs_pred = pred.join(act)
act_vs_pred.show()

Longitude actual vs predicted
+-----------------+-------------+
|       prediction|       actual|
+-----------------+-------------+
|-87.6936719770031|-87.715260845|
|-87.6936719770031|-87.597853029|
|-87.6936719770031|-87.585086454|
|-87.6936719770031|-87.630845188|
|-87.6936719770031| -87.63285766|
|-87.6936719770031|-87.633356364|
|-87.6936719770031|-87.633988101|
|-87.6936719770031|-87.726071484|
|-87.6936719770031|-87.626944185|
|-87.6936719770031|-87.752809814|
|-87.6936719770031|-87.634042511|
|-87.6936719770031|-87.707841584|
|-87.6936719770031|-87.728783821|
|-87.6936719770031|-87.580854271|
|-87.6936719770031|-87.714068537|
|-87.6936719770031|-87.647565569|
|-87.6936719770031|-87.722639835|
|-87.6936719770031|-87.703493141|
|-87.6936719770031|-87.647275129|
|-87.6936719770031|-87.653483175|
+-----------------+-------------+
only showing top 20 rows



In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
finalised_data = output.select("Independent Features", "_7")

train_data, test_data = finalised_data.randomSplit([0.8, 0.2])

regressor = LinearRegression(featuresCol = 'Independent Features', labelCol = '_7')
trained_model = regressor.fit(train_data)

results = trained_model.evaluate(train_data)
print(results.r2) #closer to 1, better it is
print(results.meanSquaredError)
print(results.meanAbsoluteError)

predictions = trained_model.transform(test_data)


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

0.009355288681948126
0.0074181203434875035
0.07428601709575482


In [0]:
print ("Latitude actual vs predicted")
pred = predictions.select("prediction")
act = finalised_data.select("_7").withColumnRenamed("_7","actual")
act_vs_pred = pred.join(act)
act_vs_pred.show(1000)

Latitude actual vs predicted
+-----------------+------------+
|       prediction|      actual|
+-----------------+------------+
|41.86778850796988|41.767951147|
|41.86778850796988|41.659434298|
|41.86778850796988|41.737030992|
|41.86778850796988|41.748908764|
|41.86778850796988|41.896678875|
|41.86778850796988|41.885840747|
|41.86778850796988|41.716236015|
|41.86778850796988|41.889771263|
|41.86778850796988|41.853058661|
|41.86778850796988|41.874080494|
|41.86778850796988|41.887040881|
|41.86778850796988|41.990811419|
|41.86778850796988|41.997129423|
|41.86778850796988|41.707849555|
|41.86778850796988|41.775946359|
|41.86778850796988|41.883800588|
|41.86778850796988|41.771482784|
|41.86778850796988|41.786155308|
|41.86778850796988|41.883922894|
|41.86778850796988|41.923447364|
|41.86778850796988|41.853019479|
|41.86778850796988| 41.74579217|
|41.86778850796988|41.950199482|
|41.86778850796988|41.896903533|
|41.86778850796988|41.912254431|
|41.86778850796988|41.771382503|
|41.8677885079