## Traffic Crash Analysis

### Data importing and pre-processing

In [0]:
import requests
from pyspark.sql import SparkSession

# Fetch crash records from the Chicago open-data endpoint. The SoQL query
# selects the crash columns and orders by crash_date DESC, crash_record_id ASC.
# NOTE(review): the query has no LIMIT clause and the row count printed later
# is exactly 1000 — presumably the API's default page size; confirm and add an
# explicit LIMIT if more rows are wanted.
# The timeout stops the cell from hanging forever on a stalled connection.
resp = requests.get('https://data.cityofchicago.org/resource/85ca-t3if.json?$query=SELECT%20crash_record_id%2C%20crash_date_est_i%2C%20crash_date%2C%20posted_speed_limit%2C%20traffic_control_device%2C%20device_condition%2C%20weather_condition%2C%20lighting_condition%2C%20first_crash_type%2C%20trafficway_type%2C%20lane_cnt%2C%20alignment%2C%20roadway_surface_cond%2C%20road_defect%2C%20report_type%2C%20crash_type%2C%20intersection_related_i%2C%20private_property_i%2C%20hit_and_run_i%2C%20damage%2C%20date_police_notified%2C%20prim_contributory_cause%2C%20sec_contributory_cause%2C%20street_no%2C%20street_direction%2C%20street_name%2C%20beat_of_occurrence%2C%20photos_taken_i%2C%20statements_taken_i%2C%20dooring_i%2C%20work_zone_i%2C%20work_zone_type%2C%20workers_present_i%2C%20num_units%2C%20most_severe_injury%2C%20injuries_total%2C%20injuries_fatal%2C%20injuries_incapacitating%2C%20injuries_non_incapacitating%2C%20injuries_reported_not_evident%2C%20injuries_no_indication%2C%20injuries_unknown%2C%20crash_hour%2C%20crash_day_of_week%2C%20crash_month%2C%20latitude%2C%20longitude%2C%20location%20ORDER%20BY%20crash_date%20DESC%2C%20crash_record_id%20ASC', timeout=60)
# Fail here on an HTTP error instead of letting Spark parse an error payload as data.
resp.raise_for_status()

# Create (or reuse) the Spark session
spark = SparkSession.builder.appName("SENG550").getOrCreate()
# Offline alternative: load a previously uploaded CSV export instead of calling the API.
#df2 = spark.read.csv("/FileStore/tables/Traffic_Crashes___Crashes.csv", header=True, inferSchema=True)

# Wrap the response body in a one-element RDD so Spark's JSON reader can infer the schema from it
df2 = spark.read.json(spark.sparkContext.parallelize([resp.text]))


In [0]:
# Preview the first five rows of the loaded DataFrame
df2.show(n=5)

+------------------+------------------+--------------------+----------------+-----------------+----------+-----------+--------------------+--------------------+-----------+--------------------+--------------------+---------+--------------------+-------------+--------------+-----------------------+----------------------+---------------------------+-----------------------------+--------------+----------------+----------------------+------------+--------------------+--------------------+-------------+--------------------+---------+--------------+------------------+-----------------------+------------------+-----------+------------+--------------------+----------------------+------------------+----------------+--------------+---------+----------------------+--------------------+-----------------+-----------+--------------+-----------------+
|         alignment|beat_of_occurrence|          crash_date|crash_date_est_i|crash_day_of_week|crash_hour|crash_month|     crash_record_id|          cr

### Create RDD of wanted features

In [0]:
# Keep only the seven columns used downstream; their order fixes the
# positional indices (0..6) that the later cells rely on.
wanted_columns = df2.select([
    "Crash_type",
    "num_units",
    "Weather_condition",
    "Crash_date",
    "Most_severe_injury",
    "Longitude",
    "Latitude",
])
wanted_columns.show(1)
# Turn every Row into a plain list of its seven values
rdd_of_features = wanted_columns.rdd.map(lambda row: list(row))


+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
|          Crash_type|num_units|Weather_condition|          Crash_date|  Most_severe_injury|    Longitude|    Latitude|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
|NO INJURY / DRIVE...|        2|            CLEAR|2023-12-14T05:02:...|NO INDICATION OF ...|-87.636548181|41.892465831|
+--------------------+---------+-----------------+--------------------+--------------------+-------------+------------+
only showing top 1 row



### Remove all rows where the content of one of the fields is unknown

In [0]:
def _is_usable_row(row):
    """Return True when a feature row is complete enough to keep.

    A row is rejected when any of its fields is None, when Crash_type (row[0]),
    Weather_condition (row[2]) or Most_severe_injury (row[4]) equals "UNKNOWN",
    or when longitude (row[5]) / latitude (row[6]) is zero or not numeric.

    The coordinates arrive as strings (the DataFrame built from this RDD
    reports every column as 'string'), so they are converted to float before
    the zero test; comparing the raw string with the int 0 is always unequal
    and never filters anything.
    """
    if any(value is None for value in row):
        return False
    if "UNKNOWN" in (row[0], row[2], row[4]):
        return False
    try:
        return float(row[5]) != 0 and float(row[6]) != 0
    except (TypeError, ValueError):
        # A coordinate that cannot be parsed is treated as missing
        return False


print(rdd_of_features.count())
#row[0] = Crash_type, row[2] = Weather_condition,  row[4]= Most_severe_injury
cleaned_data_rdd = rdd_of_features.filter(_is_usable_row)
print(cleaned_data_rdd.count())

1000
936


### Create Dataframe from RDD and get it ready for regression

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

cleaned_data_df = spark.createDataFrame(cleaned_data_rdd)

#_1 = Crash_type, _2 = numUnits, _3 = weather, _4 = time, _5 = injury severity, _6 = longitude, _7 = latitude
print(cleaned_data_df.dtypes)
# Every column is created as a string; cast the numeric ones to double
numeric_cols = ["_2", "_6", "_7"]
for col_name in numeric_cols:
    cleaned_data_df = cleaned_data_df.withColumn(col_name, col(col_name).cast("double"))
print(cleaned_data_df.dtypes)

# Encode the string columns as numeric indices (adds "<column>_index" columns).
# The indexers are left unfitted here: Pipeline.fit() fits each stage itself,
# so fitting them beforehand only repeats the same pass over the data.
# NOTE(review): "_4" is the crash timestamp string; the sample output shows
# indices such as 688.0 and 687.0, so it looks nearly unique per row — confirm
# that indexing it is intended rather than deriving hour/day features.
string_cols = ["_1", "_3", "_4", "_5"]
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in string_cols]

pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(cleaned_data_df).transform(cleaned_data_df)
indexed_df.show(5)

[('_1', 'string'), ('_2', 'string'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'string'), ('_7', 'string')]
[('_1', 'string'), ('_2', 'double'), ('_3', 'string'), ('_4', 'string'), ('_5', 'string'), ('_6', 'double'), ('_7', 'double')]
+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|                  _1| _2|   _3|                  _4|                  _5|           _6|          _7|_1_index|_3_index|_4_index|_5_index|
+--------------------+---+-----+--------------------+--------------------+-------------+------------+--------+--------+--------+--------+
|NO INJURY / DRIVE...|2.0|CLEAR|2023-12-14T05:02:...|NO INDICATION OF ...|-87.636548181|41.892465831|     0.0|     0.0|   688.0|     0.0|
|NO INJURY / DRIVE...|2.0|CLEAR|2023-12-14T02:27:...|NO INDICATION OF ...|-87.731095577|41.743760507|     0.0|     0.0|   687.0|     0.0|
|INJURY AND / OR T...|2.0|CLEAR|2023-12-14T01:00:...|NO 

# Latitude

### Create Labeled Points

Partially taken from lab notebook

In [0]:
# Peek at the last column (position 10) of the first indexed row
indexed_rdd = indexed_df.rdd
first_indexed_row = indexed_rdd.take(1)[0]
print(first_indexed_row[10])

0.0


In [0]:
from pyspark.mllib.regression import LabeledPoint
def createRDD(values):
    """Serialise one indexed row as comma-separated text, label first.

    The output fields are, in order, values[6] (used as the label by
    parsePoint), then values[1], values[5], values[7], values[8], values[9]
    and values[10].
    """
    field_positions = (6, 1, 5, 7, 8, 9, 10)
    return ','.join(str(values[position]) for position in field_positions)

def parsePoint(line):
    """Build a LabeledPoint from a comma-separated line.

    The first field becomes the label and all remaining fields become the
    feature vector.
    """
    fields = line.split(',')
    label, features = fields[0], fields[1:]
    return LabeledPoint(label, features)

# Rows -> CSV text -> LabeledPoints (label is values[6], see createRDD)
indexed_rdd = indexed_df.rdd
nice_rdd = indexed_rdd.map(createRDD)
print(nice_rdd.take(1))
parsedPoints = nice_rdd.map(parsePoint)

# Inspect the first point and record the number of features
firstPoint = parsedPoints.take(1)
firstPointFeatures, firstPointLabel = firstPoint[0].features, firstPoint[0].label
print(firstPointFeatures, firstPointLabel)
d = len(firstPointFeatures)
print(d)

['41.892465831,2.0,-87.636548181,0.0,0.0,688.0,0.0']
[2.0,-87.636548181,0.0,0.0,688.0,0.0] 41.892465831
6


### Normalize features

Taken from lab notebook

In [0]:
import numpy as np
from pyspark.mllib.stat import Statistics

def normalizeFeatures2(lp):
    """Return a LabeledPoint with the same label and standardised features.

    Each feature has the broadcast mean subtracted and is divided by the
    broadcast standard deviation. Reads the module-level broadcast variables
    broadcastMean2 and broadcastStdev2, which getNormalizedRDD2 sets.
    """
    centred = lp.features - broadcastMean2.value
    scaled = centred / broadcastStdev2.value
    return LabeledPoint(lp.label, scaled)

def getNormalizedRDD2(nonNormalizedRDD):
    """Standardise the features of an RDD of LabeledPoints.

    Computes per-column mean and standard deviation of the features,
    broadcasts them through the module-level variables broadcastMean2 and
    broadcastStdev2 (read by normalizeFeatures2), and returns a new RDD whose
    features are (x - mean) / stdev. Labels are left unchanged.

    Parameters:
        nonNormalizedRDD: RDD of LabeledPoint.

    Returns:
        RDD of LabeledPoint with normalised features.
    """
    global broadcastMean2, broadcastStdev2

    # Compute column summary statistics
    summary = Statistics.colStats(nonNormalizedRDD.map(lambda lp: lp.features))
    meanList = summary.mean()
    stdevList = summary.variance() ** 0.5  # sqrt of variance to get standard deviation
    # A constant column has zero deviation; divide by 1 instead so its
    # normalised value is 0 rather than NaN/inf.
    stdevList = np.where(stdevList == 0, 1.0, stdevList)

    # Use the SparkContext the RDD belongs to rather than a global `sc`,
    # which this notebook never defines itself.
    spark_context = nonNormalizedRDD.context
    broadcastMean2 = spark_context.broadcast(meanList)
    broadcastStdev2 = spark_context.broadcast(stdevList)

    # Normalize the features
    return nonNormalizedRDD.map(lambda lp: normalizeFeatures2(lp))

# Cache the normalised points: they feed the split, the counts and every
# model below, and without caching each action recomputes the whole lineage.
normalizedSamplePoints = getNormalizedRDD2(parsedPoints).cache()
print(normalizedSamplePoints.take(5))

[LabeledPoint(41.892465831, [-0.1311238427122638,0.8548456297589372,-0.6830707265339998,-0.14982623822743735,1.961756223093018,-0.39717293991120234]), LabeledPoint(41.743760507, [-0.1311238427122638,-0.8151993951560601,-0.6830707265339998,-0.14982623822743735,1.9570815935727632,-0.39717293991120234]), LabeledPoint(41.957818587, [-0.1311238427122638,-1.275403406296264,1.4624131662036641,-0.14982623822743735,1.9524069640525086,-0.39717293991120234]), LabeledPoint(41.791964031, [-2.362613238688235,-0.15294789595655534,1.4624131662036641,-0.14982623822743735,1.947732334532254,-0.39717293991120234]), LabeledPoint(41.819052466, [-0.1311238427122638,-0.6918568585370565,1.4624131662036641,-0.14982623822743735,1.9430577050119993,1.145374245802015])]


In [0]:
# 80/20 train/validation split with a fixed seed
weights = [.8, .2]
seed = 42
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights, seed)

# cache() returns the RDD itself, so each split is cached and then counted
nTrain = parsedTrainData.cache().count()
nVal = parsedValData.cache().count()

# The two split sizes should add up to the total number of points
print(nTrain, nVal, nTrain + nVal)
print(normalizedSamplePoints.count())

728 208 936
936


### Create baseline using the average value

In [0]:
# Baseline prediction: the mean label of the training split
train_labels = parsedTrainData.map(lambda point: point.label)
averagelatitude = train_labels.mean()
print(averagelatitude)

41.8622154629863


In [0]:
import math
def squaredError(label, prediction):
    """Return the squared difference between a label and its prediction."""
    difference = label - prediction
    return difference * difference


def calcRMSE(labelsAndPreds):
    """Return the root-mean-square error of an RDD of (label, prediction) pairs.

    The squared errors are summed with `.sum()` and divided by `.count()` of
    the input before taking the square root.
    """
    squared_errors = labelsAndPreds.map(lambda pair: squaredError(pair[0], pair[1]))
    total_squared_error = squared_errors.sum()
    n_points = labelsAndPreds.count()
    return math.sqrt(total_squared_error / n_points)

# Pair every label with the constant baseline prediction, then score both splits
labelsAndPredsTrain = parsedTrainData.map(lambda point: (point.label, averagelatitude))
labelsAndPredsVal = parsedValData.map(lambda point: (point.label, averagelatitude))

rmseTrainBase = calcRMSE(labelsAndPredsTrain)
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

0.08687977088552416


### Apply linear regression with weights version one

In [0]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LinearRegressionWithSGD
# Hyper-parameters for the first linear-regression run
numIters, alpha, miniBatchFrac = 500, 1.0, 1.0  # iterations, step, miniBatchFraction
reg, regType = 0.1, 'l2'  # regParam, regType
useIntercept = True  # intercept

In [0]:
# Train the first SGD linear-regression model with the settings defined above.
# NOTE(review): the recorded output shows an intercept near 38 while the labels
# are near 41.9, and a validation RMSE far above the baseline — looks like SGD
# has not converged on the un-centred label; confirm before drawing conclusions.
firstModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR1 holds the learned weights; interceptLR1 holds the learned intercept
weightsLR1, interceptLR1 = firstModel.weights, firstModel.intercept
print(weightsLR1, interceptLR1)



[0.0278509863158071,-0.05716169131759258,0.04747734105360197,0.006705664752213127,0.010390822985318198,-0.05073197024761918] 38.09290581828401


In [0]:
# Show up to five validation points and the first model's prediction for each
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
for samplePoint in samplePoints:
    samplePrediction = firstModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.970677586, [-0.1311238427122638,-1.3863222497408927,-0.6830707265339998,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(41.909814337, [-0.1311238427122638,-0.6723953788384704,-0.6830707265339998,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(41.779926518, [-0.1311238427122638,0.9355479429383212,-0.6830707265339998,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(41.965042796, [-0.1311238427122638,0.17082292646035496,1.4624131662036641,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(41.92616072, [-0.1311238427122638,0.943755517384239,-0.6830707265339998,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
38.17506266976803
38.13401053611084
38.041903483303784
38.1089304317207
38.040997165236035


In [0]:
# Validation RMSE of the first linear model, printed after the baseline for comparison
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, firstModel.predict(point.features))
)
rmseValLR1 = calcRMSE(labelsAndPreds)

print(rmseValBase, rmseValLR1, sep='\n')

0.08687977088552416
3.771046491832153


### Apply linear regression with weights 2

In [0]:
# Hyper-parameters for the second linear-regression run
numIters, alpha, miniBatchFrac = 1000, 1.0, 0.3  # iterations, step, miniBatchFraction
reg, regType = 0.1, 'l2'  # regParam, regType
useIntercept = True  # intercept

In [0]:
# Train the second SGD linear-regression model with the settings defined above
secondModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR2 holds the learned weights; interceptLR2 holds the learned intercept
weightsLR2, interceptLR2 = secondModel.weights, secondModel.intercept
print(weightsLR2, interceptLR2)

[-0.006107972239543204,0.003027845295892827,-0.013623843798681638,0.017539556086358026,-0.029371773721555257,-0.07878945919138249] 38.054444585276265


In [0]:
# Validation RMSE of the second linear model, printed after the baseline for comparison
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, secondModel.predict(point.features))
)
rmseValLR2 = calcRMSE(labelsAndPreds)

print(rmseValBase, rmseValLR2, sep='\n')

0.08687977088552416
3.8063269090666023


### Random Forest Version One

In [0]:
from pyspark.mllib.tree import RandomForest
# Random forest regressor: 8 trees, depth 5. The forest is built with random
# sampling, so pass the notebook's split seed to make re-runs reproducible.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32,
                                      seed=seed)



In [0]:
# Show up to five validation points and the forest's prediction for each
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.970677586, [-0.1311238427122638,-1.3863222497408927,-0.6830707265339998,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(41.909814337, [-0.1311238427122638,-0.6723953788384704,-0.6830707265339998,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(41.779926518, [-0.1311238427122638,0.9355479429383212,-0.6830707265339998,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(41.965042796, [-0.1311238427122638,0.17082292646035496,1.4624131662036641,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(41.92616072, [-0.1311238427122638,0.943755517384239,-0.6830707265339998,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
41.90226857599271
41.884560886742484
41.83739111731327
41.8585301797274
41.83739111731327


In [0]:
import numpy as np
# Score the forest on the validation split: collect labels and predictions
# to the driver and compute the RMSE with numpy.
labels = parsedValData.map(lambda point: point.label).collect()
val_features = parsedValData.map(lambda point: point.features)
predictions = thirdModel.predict(val_features).collect()
residuals = np.array(predictions) - np.array(labels)
rmseDT1 = np.sqrt(np.mean(residuals ** 2))

print(rmseValBase, rmseDT1, sep='\n')

0.08687977088552416
0.07460726627248873


### Random Forest Version Two

In [0]:
# Second random forest configuration: 10 trees, depth 6. This rebinds
# `thirdModel`, replacing the first forest. The seed makes re-runs reproducible.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=10, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=6, maxBins=32,
                                      seed=seed)

In [0]:
# Show up to five validation points and the second forest's prediction for each
samplePoints = parsedValData.take(5)
print(samplePoints)
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)

[LabeledPoint(41.970677586, [-0.1311238427122638,-1.3863222497408927,-0.6830707265339998,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(41.909814337, [-0.1311238427122638,-0.6723953788384704,-0.6830707265339998,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(41.779926518, [-0.1311238427122638,0.9355479429383212,-0.6830707265339998,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(41.965042796, [-0.1311238427122638,0.17082292646035496,1.4624131662036641,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(41.92616072, [-0.1311238427122638,0.943755517384239,-0.6830707265339998,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
41.90698333894528
41.88715649167684
41.86147963580981
41.86669562823262
41.85418257742056


In [0]:
# Score the second forest on the validation split
labels = parsedValData.map(lambda point: point.label).collect()
val_features = parsedValData.map(lambda point: point.features)
predictions = thirdModel.predict(val_features).collect()
residuals = np.array(predictions) - np.array(labels)
rmseDT2 = np.sqrt(np.mean(residuals ** 2))

print(rmseValBase, rmseDT2, sep='\n')

0.08687977088552416
0.07599848169250893


# Longitude


In [0]:
from pyspark.mllib.regression import LabeledPoint


def createLatitudeRDD(values):
    """Serialise one indexed row as comma-separated text, label first.

    The label is the negated values[5]; the features are values[1], values[6],
    values[7], values[8], values[9] and values[10], in that order.
    NOTE(review): despite the function name, values[5] appears to be the
    longitude column (this cell sits under the "Longitude" heading) — confirm
    and consider renaming.
    """
    fields = (-values[5], values[1], values[6], values[7], values[8], values[9], values[10])
    return ','.join(str(field) for field in fields)


indexed_rdd = indexed_df.rdd
print(indexed_rdd.take(1)[0][10])

# Rows -> CSV text -> LabeledPoints
nice_rdd = indexed_rdd.map(createLatitudeRDD)
print(nice_rdd.take(1))
parsedPoints = nice_rdd.map(parsePoint)

# Inspect the first point and record the number of features
firstPoint = parsedPoints.take(1)
firstPointFeatures, firstPointLabel = firstPoint[0].features, firstPoint[0].label
print(firstPointFeatures, firstPointLabel)
d = len(firstPointFeatures)
print(d)

0.0
['87.636548181,2.0,41.892465831,0.0,0.0,688.0,0.0']
[2.0,41.892465831,0.0,0.0,688.0,0.0] 87.636548181
6


In [0]:
# Cache the normalised points: they feed the split, the counts and every
# model below, and without caching each action recomputes the whole lineage.
normalizedSamplePoints = getNormalizedRDD2(parsedPoints).cache()
print(normalizedSamplePoints.take(5))

[LabeledPoint(87.636548181, [-0.1311238427122638,0.36495917786870435,-0.6830707265339998,-0.14982623822743735,1.961756223093018,-0.39717293991120234]), LabeledPoint(87.731095577, [-0.1311238427122638,-1.3824393254700018,-0.6830707265339998,-0.14982623822743735,1.9570815935727632,-0.39717293991120234]), LabeledPoint(87.75714942, [-0.1311238427122638,1.1329028193717372,1.4624131662036641,-0.14982623822743735,1.9524069640525086,-0.39717293991120234]), LabeledPoint(87.693603082, [-2.362613238688235,-0.816011954697108,1.4624131662036641,-0.14982623822743735,1.947732334532254,-0.39717293991120234]), LabeledPoint(87.724112702, [-0.1311238427122638,-0.49770263325820874,1.4624131662036641,-0.14982623822743735,1.9430577050119993,1.145374245802015])]


In [0]:
# 80/20 train/validation split with a fixed seed
weights = [.8, .2]
seed = 42
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights, seed)

# cache() returns the RDD itself, so each split is cached and then counted
nTrain = parsedTrainData.cache().count()
nVal = parsedValData.cache().count()

# The two split sizes should add up to the total number of points
print(nTrain, nVal, nTrain + nVal)
print(normalizedSamplePoints.count())

728 208 936
936


In [0]:
# Baseline prediction: the mean label of the training split
train_labels = parsedTrainData.map(lambda point: point.label)
averagelongitude = train_labels.mean()
print(averagelongitude)

# Pair every label with the constant baseline prediction, then score both splits
labelsAndPredsTrain = parsedTrainData.map(lambda point: (point.label, averagelongitude))
labelsAndPredsVal = parsedValData.map(lambda point: (point.label, averagelongitude))

rmseTrainBase = calcRMSE(labelsAndPredsTrain)
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

87.6853364728132
0.05564986824222757


In [0]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LinearRegressionWithSGD
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept
firstModel = LinearRegressionWithSGD.train(parsedTrainData,numIters,alpha,miniBatchFrac,initialWeights=None,regParam=reg,regType=regType,intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print(weightsLR1, interceptLR1)

# Show up to five validation points and the model's prediction for each.
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
samplePoints = parsedValData.take(5)
print(samplePoints)
for samplePoint in samplePoints:
    samplePrediction = firstModel.predict(samplePoint.features)
    print(samplePrediction)

# Validation RMSE of the model, printed after the baseline for comparison
labelsAndPreds = parsedValData.map(lambda lp: (lp.label,firstModel.predict(lp.features)))
rmseValLR1 = calcRMSE(labelsAndPreds)

print(rmseValBase)
print(rmseValLR1)

[0.056394434792837705,0.10621308583395057,0.14056899647063403,0.01901456479457416,0.01260512920969955,-0.11592060221387676] 79.790504282462
[LabeledPoint(87.763428944, [-0.1311238427122638,1.2840056514871308,-0.6830707265339998,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(87.723010916, [-0.1311238427122638,0.568817066773592,-0.6830707265339998,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(87.631979326, [-0.1311238427122638,-0.9574617134166541,-0.6830707265339998,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(87.675273224, [-0.1311238427122638,1.2177926649727335,1.4624131662036641,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(87.631514665, [-0.1311238427122638,0.7608992627627377,-0.6830707265339998,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
79.89074093232237
79.81448392424117
79.65213744791785
80.00559990110058
79.83411955099251
0.05564986824222757


In [0]:
### Apply linear regression with weights 2
numIters, alpha, miniBatchFrac = 1000, 1.0, 0.3  # iterations, step, miniBatchFraction
reg, regType = 0.1, 'l2'  # regParam, regType
useIntercept = True  # intercept
secondModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR2 holds the learned weights; interceptLR2 holds the learned intercept
weightsLR2, interceptLR2 = secondModel.weights, secondModel.intercept
print(weightsLR2, interceptLR2)

# Validation RMSE of the second linear model, printed after the baseline
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, secondModel.predict(point.features))
)
rmseValLR2 = calcRMSE(labelsAndPreds)

print(rmseValBase, rmseValLR2, sep='\n')

[0.22545739272629092,0.27862001289184357,0.0919751438503133,-0.11200654525536913,-0.11233637451717271,0.08562077260644438] 79.702157524454
0.05564986824222757
8.00541149277323


In [0]:
from pyspark.mllib.tree import RandomForest
# Random forest regressor: 8 trees, depth 5. The forest is built with random
# sampling, so pass the notebook's split seed to make re-runs reproducible.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32,
                                      seed=seed)


# Show up to five validation points and the forest's prediction for each.
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
samplePoints = parsedValData.take(5)
print(samplePoints)
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)
import numpy as np
# Score the forest on the validation split with numpy on the driver
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

[LabeledPoint(87.763428944, [-0.1311238427122638,1.2840056514871308,-0.6830707265339998,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(87.723010916, [-0.1311238427122638,0.568817066773592,-0.6830707265339998,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(87.631979326, [-0.1311238427122638,-0.9574617134166541,-0.6830707265339998,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(87.675273224, [-0.1311238427122638,1.2177926649727335,1.4624131662036641,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(87.631514665, [-0.1311238427122638,0.7608992627627377,-0.6830707265339998,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
87.71768948936446
87.70028743780861
87.66527216400041
87.7288705813525
87.70558366006617
0.05564986824222757
0.048489467402581914


In [0]:
# Second random forest configuration: 10 trees, depth 6. This rebinds
# `thirdModel`, replacing the first forest. The seed makes re-runs reproducible.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=10, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=6, maxBins=32,
                                      seed=seed)
# Show up to five validation points and the forest's prediction for each.
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
samplePoints = parsedValData.take(5)
print(samplePoints)
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)
# Score the forest on the validation split with numpy on the driver
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

[LabeledPoint(87.763428944, [-0.1311238427122638,1.2840056514871308,-0.6830707265339998,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(87.723010916, [-0.1311238427122638,0.568817066773592,-0.6830707265339998,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(87.631979326, [-0.1311238427122638,-0.9574617134166541,-0.6830707265339998,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(87.675273224, [-0.1311238427122638,1.2177926649727335,1.4624131662036641,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(87.631514665, [-0.1311238427122638,0.7608992627627377,-0.6830707265339998,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
87.7033866149425
87.69129302775625
87.661066250721
87.72192593876805
87.70643891353157
0.05564986824222757
0.04864750478797434


# Type of Crash

In [0]:
from pyspark.mllib.regression import LabeledPoint


def createLatitudeRDD(values):
    """Serialise one indexed row as comma-separated text, label first.

    The label is values[7]; the features are values[1], values[6], values[5],
    values[8], values[9] and values[10], in that order.
    NOTE(review): despite the function name, values[7] is the eighth column of
    indexed_df ("_1_index" in the displayed table), which appears to be the
    indexed crash type — confirm and consider renaming. The label also looks
    categorical, so a classifier may fit better than regression.
    """
    fields = (values[7], values[1], values[6], values[5], values[8], values[9], values[10])
    return ','.join(str(field) for field in fields)


indexed_rdd = indexed_df.rdd
print(indexed_rdd.take(1)[0][10])

# Rows -> CSV text -> LabeledPoints
nice_rdd = indexed_rdd.map(createLatitudeRDD)
print(nice_rdd.take(1))
parsedPoints = nice_rdd.map(parsePoint)

# Inspect the first point and record the number of features
firstPoint = parsedPoints.take(1)
firstPointFeatures, firstPointLabel = firstPoint[0].features, firstPoint[0].label
print(firstPointFeatures, firstPointLabel)
d = len(firstPointFeatures)
print(d)

0.0
['0.0,2.0,41.892465831,-87.636548181,0.0,688.0,0.0']
[2.0,41.892465831,-87.636548181,0.0,688.0,0.0] 0.0
6


In [0]:
# Cache the normalised points: they feed the split, the counts and every
# model below, and without caching each action recomputes the whole lineage.
normalizedSamplePoints = getNormalizedRDD2(parsedPoints).cache()
print(normalizedSamplePoints.take(5))

[LabeledPoint(0.0, [-0.1311238427122638,0.36495917786870435,0.8548456297589372,-0.14982623822743735,1.961756223093018,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,-1.3824393254700018,-0.8151993951560601,-0.14982623822743735,1.9570815935727632,-0.39717293991120234]), LabeledPoint(1.0, [-0.1311238427122638,1.1329028193717372,-1.275403406296264,-0.14982623822743735,1.9524069640525086,-0.39717293991120234]), LabeledPoint(1.0, [-2.362613238688235,-0.816011954697108,-0.15294789595655534,-0.14982623822743735,1.947732334532254,-0.39717293991120234]), LabeledPoint(1.0, [-0.1311238427122638,-0.49770263325820874,-0.6918568585370565,-0.14982623822743735,1.9430577050119993,1.145374245802015])]


In [0]:
# 80/20 train/validation split with a fixed seed
weights = [.8, .2]
seed = 42
parsedTrainData, parsedValData = normalizedSamplePoints.randomSplit(weights, seed)

# cache() returns the RDD itself, so each split is cached and then counted
nTrain = parsedTrainData.cache().count()
nVal = parsedValData.cache().count()

# The two split sizes should add up to the total number of points
print(nTrain, nVal, nTrain + nVal)
print(normalizedSamplePoints.count())

728 208 936
936


In [0]:
# Baseline prediction: the mean label of the training split
train_labels = parsedTrainData.map(lambda point: point.label)
averagetype = train_labels.mean()
print(averagetype)

# Pair every label with the constant baseline prediction, then score both splits
labelsAndPredsTrain = parsedTrainData.map(lambda point: (point.label, averagetype))
labelsAndPredsVal = parsedValData.map(lambda point: (point.label, averagetype))

rmseTrainBase = calcRMSE(labelsAndPredsTrain)
rmseValBase = calcRMSE(labelsAndPredsVal)
print(rmseValBase)

0.32142857142857173
0.46174282498022856


In [0]:
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept
firstModel = LinearRegressionWithSGD.train(parsedTrainData,numIters,alpha,miniBatchFrac,initialWeights=None,regParam=reg,regType=regType,intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print(weightsLR1, interceptLR1)

# Show up to five validation points and the model's prediction for each.
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
samplePoints = parsedValData.take(5)
print(samplePoints)
for samplePoint in samplePoints:
    samplePrediction = firstModel.predict(samplePoint.features)
    print(samplePrediction)

# Validation RMSE of the model, printed after the baseline for comparison
labelsAndPreds = parsedValData.map(lambda lp: (lp.label,firstModel.predict(lp.features)))
rmseValLR1 = calcRMSE(labelsAndPreds)

print(rmseValBase)
print(rmseValLR1)



[0.02326861464298533,-0.0462066428119244,-0.031714783140954425,-0.0013044930148267296,0.025514687148054598,0.2379615111280032] 0.29206596608031166
[LabeledPoint(0.0, [-0.1311238427122638,1.2840056514871308,-1.3863222497408927,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,0.568817066773592,-0.6723953788384704,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,-0.9574617134166541,0.9355479429383212,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(1.0, [-0.1311238427122638,1.2177926649727335,0.17082292646035496,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(0.0, [-0.1311238427122638,0.7608992627627377,0.943755517384239,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
0.22807739672757266
0.2378854657667146
0.2569370236023187
0.5470301386151788
0.17620358492270105
0.46174282498022856
0.3617677996681902


In [0]:
### Apply linear regression with weights 2
numIters, alpha, miniBatchFrac = 1000, 1.0, 0.3  # iterations, step, miniBatchFraction
reg, regType = 0.1, 'l2'  # regParam, regType
useIntercept = True  # intercept
secondModel = LinearRegressionWithSGD.train(
    parsedTrainData,
    iterations=numIters,
    step=alpha,
    miniBatchFraction=miniBatchFrac,
    initialWeights=None,
    regParam=reg,
    regType=regType,
    intercept=useIntercept,
)

# weightsLR2 holds the learned weights; interceptLR2 holds the learned intercept
weightsLR2, interceptLR2 = secondModel.weights, secondModel.intercept
print(weightsLR2, interceptLR2)

# Validation RMSE of the second linear model, printed after the baseline
labelsAndPreds = parsedValData.map(
    lambda point: (point.label, secondModel.predict(point.features))
)
rmseValLR2 = calcRMSE(labelsAndPreds)

print(rmseValBase, rmseValLR2, sep='\n')

[0.02869086208773176,-0.047236840974027576,-0.029373717589844257,-0.006735946202819645,0.028454446578139697,0.23587209999129188] 0.2944926643477481
0.46174282498022856
0.3614619243391188


In [0]:
# Random forest regressor: 8 trees, depth 5. The forest is built with random
# sampling, so pass the notebook's split seed to make re-runs reproducible.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=8, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=5, maxBins=32,
                                      seed=seed)


# Show up to five validation points and the forest's prediction for each.
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
samplePoints = parsedValData.take(5)
print(samplePoints)
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)
import numpy as np
# Score the forest on the validation split with numpy on the driver
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT1 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT1)

[LabeledPoint(0.0, [-0.1311238427122638,1.2840056514871308,-1.3863222497408927,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,0.568817066773592,-0.6723953788384704,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,-0.9574617134166541,0.9355479429383212,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(1.0, [-0.1311238427122638,1.2177926649727335,0.17082292646035496,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(0.0, [-0.1311238427122638,0.7608992627627377,0.943755517384239,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
0.19892745580501398
0.18400703140183094
0.19060899914108412
1.0
0.174310666153604
0.46174282498022856
0.3440720770909709


In [0]:
# Second random forest configuration: 10 trees, depth 6. This rebinds
# `thirdModel`, replacing the first forest. The seed makes re-runs reproducible.
thirdModel = RandomForest.trainRegressor(parsedTrainData, categoricalFeaturesInfo={},
                                      numTrees=10, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=6, maxBins=32,
                                      seed=seed)
# Show up to five validation points and the forest's prediction for each.
# Iterate over what take() actually returned, so a validation split with
# fewer than five points does not raise IndexError.
samplePoints = parsedValData.take(5)
print(samplePoints)
for samplePoint in samplePoints:
    samplePrediction = thirdModel.predict(samplePoint.features)
    print(samplePrediction)
# Score the forest on the validation split with numpy on the driver
labels = parsedValData.map(lambda x: x.label).collect()
predictions = thirdModel.predict(parsedValData.map(lambda x: x.features)).collect()
rmseDT2 = np.sqrt(np.mean((np.array(predictions)-np.array(labels))**2))

print(rmseValBase)
print(rmseDT2)

[LabeledPoint(0.0, [-0.1311238427122638,1.2840056514871308,-1.3863222497408927,-0.14982623822743735,1.9103352983702169,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,0.568817066773592,-0.6723953788384704,-0.14982623822743735,1.8869621507689436,-0.39717293991120234]), LabeledPoint(0.0, [-0.1311238427122638,-0.9574617134166541,0.9355479429383212,-0.14982623822743735,1.868263632687925,-0.39717293991120234]), LabeledPoint(1.0, [-0.1311238427122638,1.2177926649727335,0.17082292646035496,-0.14982623822743735,1.840215855566397,1.145374245802015]), LabeledPoint(0.0, [-0.1311238427122638,0.7608992627627377,0.943755517384239,-0.14982623822743735,1.8261919670056332,-0.39717293991120234])]
0.2743721663987147
0.1751034081455845
0.25154318168259704
1.0
0.1363511677254255
0.46174282498022856
0.35061579426617395


### Next Steps

1. Fix the slow run time of the normalization step.
2. Apply random forest model 1 for latitude and longitude.
3. Apply linear regression model 2 for all other targets.