# Main Processing

### Initial Imports

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Obtain the Spark session used by every cell below (an existing session is
# reused if there is one, otherwise a new one is created).
spark = SparkSession.builder.getOrCreate()

### Reading In Data

In [0]:
import os
import time

# CONSTANTS
PATH = 'dbfs:/public/cleaned_data.csv'
# PATH is used as a directory; this is the part file read from inside it.
PART_FILE = "part-00000-20c3d0ba-36a1-4da8-bc07-2827a614295e-c000.csv"

# Every column of the cleaned data is a double. Declaring the schema up front
# avoids the extra full pass over the file that inferSchema=True requires.
COLUMN_NAMES = [
    "latitude",
    "longitude",
    "resolution_time",
    "agency_label",
    "complaint_type_label",
]
SCHEMA = T.StructType(
    [T.StructField(name, T.DoubleType(), nullable=True) for name in COLUMN_NAMES]
)

# Import data
# The location is a DBFS URI, so it is joined with "/" instead of the
# OS-dependent os.path.join.
df = spark.read.csv(
    path=f"{PATH}/{PART_FILE}",
    sep=",",
    schema=SCHEMA,
    header=True,
)

In [0]:
# Count the rows that were loaded and report the total
row_total = df.count()
print(row_total)

8556144


In [0]:
# Check Schema: print each column's name, data type and nullability
df.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- resolution_time: double (nullable = true)
 |-- agency_label: double (nullable = true)
 |-- complaint_type_label: double (nullable = true)



### Additional Preprocessing

In [0]:
from pyspark.sql.functions import col

# Removing outlier of latitude ~ 0
# (rows whose latitude is null fail the comparison and are dropped as well)
df = df.where(F.col('latitude') > 1)

# Min-max scale latitude and longitude.
# One aggregation returns all four bounds as numbers in a single pass. The
# describe() calls used previously each ran their own job, computed statistics
# that were never read, and returned the values as strings.
bounds = df.agg(
    F.min('latitude').alias('lat_min'),
    F.max('latitude').alias('lat_max'),
    F.min('longitude').alias('long_min'),
    F.max('longitude').alias('long_max'),
).first()

# min/max are null when no non-null values remain; fail with a clear message
# instead of an obscure arithmetic error further down.
if bounds['lat_min'] is None or bounds['long_min'] is None:
    raise ValueError("No rows with a usable latitude/longitude remain after filtering")

lat_min = bounds['lat_min']
print(lat_min)
long_min = bounds['long_min']
print(long_min)

# After shifting by the minimum, the largest remaining value is the range
lat_max = bounds['lat_max'] - lat_min
long_max = bounds['long_max'] - long_min

print(lat_max)
print(long_max)

# Shift to start at 0, then divide by the range
df = (df
        .withColumn("latitude", (F.col("latitude") - lat_min) / lat_max)
        .withColumn("longitude", (F.col("longitude") - long_min) / long_max))

40.498543
-74.254952
0.4149140000000031
0.5545759999999973


In [0]:
# Preview the first rows of the preprocessed data
df.show()

+-------------------+-------------------+---------------+------------+--------------------+
|           latitude|          longitude|resolution_time|agency_label|complaint_type_label|
+-------------------+-------------------+---------------+------------+--------------------+
| 0.5639120396033857| 0.4870694007674338|           0.48|         0.0|                 8.0|
| 0.3818598552953145| 0.5278392862294812|            0.7|         0.0|                 1.0|
| 0.6452252755992772| 0.5927916101670575|            0.6|         7.0|                23.0|
| 0.3818598552953145| 0.5278392862294812|           0.78|         0.0|                 1.0|
|0.47526957393581265|0.46871123164362943|           0.95|         0.0|                 0.0|
| 0.6069402333977677|0.47584460921496735|           0.48|         0.0|                19.0|
| 0.6169085641843776| 0.5030888462537315|           1.36|         0.0|                19.0|
| 0.4442486876798561| 0.4871072675341237|           1.41|         0.0|          

In [0]:
# `data` is the frame every later cell works on. It is currently the full
# data set; swap in the sampled line below to iterate on a subset without
# having to reload the CSV.
# data = df.sample(False, 0.8  )
data = df
# Wall-clock reference point for the execution time printed after evaluation
START_TIME = time.time()

### Creating Model Architecture

In [0]:
import time
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import OneHotEncoder, VectorAssembler

# One-hot encode the indexed categorical columns (<name>_label -> <name>_oh)
categories = ['agency', 'complaint_type']
onehot_input_cols = [f'{name}_label' for name in categories]
onehot_output_cols = [f'{name}_oh' for name in categories]

encoder = OneHotEncoder(inputCols=onehot_input_cols, outputCols=onehot_output_cols)

# The encoder is not fitted here; it is fitted as the first pipeline stage.

# Gather the coordinates and the encoded categories into one feature vector
feature_names = ['latitude', 'longitude', *onehot_output_cols]
assembler = VectorAssembler(inputCols=feature_names, outputCol='features')

# Linear regression on the assembled features, predicting the resolution time
label_name = "resolution_time"
lr = LinearRegression(featuresCol="features", labelCol=label_name)

# Chain the stages: encode -> assemble -> regress
lr_pipeline = Pipeline(stages=[encoder, assembler, lr])

# 80/20 train/test split with a fixed seed
train, test = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Fit the encoder on its own purely to preview the one-hot columns. `encoded`
# was previously never defined, so this cell failed on a fresh kernel. The
# pipeline fits the encoder again on the training split during training.
encoded = encoder.fit(data).transform(data)
encoded.show(10)

+-------------------+-------------------+---------------+------------+--------------------+--------------+-----------------+
|           latitude|          longitude|resolution_time|agency_label|complaint_type_label|     agency_oh|complaint_type_oh|
+-------------------+-------------------+---------------+------------+--------------------+--------------+-----------------+
| 0.5639120396033857| 0.4870694007674338|           0.48|         0.0|                 8.0|(18,[0],[1.0])|  (223,[8],[1.0])|
| 0.3818598552953145| 0.5278392862294812|            0.7|         0.0|                 1.0|(18,[0],[1.0])|  (223,[1],[1.0])|
| 0.6452252755992772| 0.5927916101670575|            0.6|         7.0|                23.0|(18,[7],[1.0])| (223,[23],[1.0])|
| 0.3818598552953145| 0.5278392862294812|           0.78|         0.0|                 1.0|(18,[0],[1.0])|  (223,[1],[1.0])|
|0.47526957393581265|0.46871123164362943|           0.95|         0.0|                 0.0|(18,[0],[1.0])|  (223,[0],[1.0])|


### Training

In [0]:
# Fit the whole pipeline on the training split, then score the held-out split
model = lr_pipeline.fit(train)
predictions = model.transform(test)

# Keep only the actual and predicted values; the actual value is exposed
# under the name 'label'
results = predictions.select(
    F.col('resolution_time').alias('label'),
    F.col('prediction'),
)

### Testing

In [0]:
# Show the first rows of actual ('label') versus predicted values
results.show()

+-----+------------------+
|label|        prediction|
+-----+------------------+
| 2.93|3.2994483143462485|
| 2.57| 2.426788861461856|
| 3.24|3.4664999460473345|
| 3.66| 3.611433955042459|
|  3.4|3.6520448993379278|
| 4.67| 4.514522005775522|
| 2.97|3.7244267572136516|
|  4.0| 3.236510773165901|
| 2.37| 2.521673386262694|
| 2.44| 1.598721580913367|
| 4.23|4.0229637573372115|
| 2.31| 2.201737292780923|
| 3.63|3.6119424917661602|
| 3.64| 3.612419309881379|
| 4.79| 4.117831292304799|
| 2.97|3.8109056908183616|
| 1.76| 1.492563983304311|
| 4.68| 3.725150662477865|
|  1.3|1.4929525644066957|
| 1.62|1.4929525644066957|
+-----+------------------+
only showing top 20 rows



In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that reads the model output from the 'prediction' column
evaluator = RegressionEvaluator(predictionCol = 'prediction')

# Score the held-out predictions on both metrics (rmse first, then mae)
rmse, mae = (
    evaluator.evaluate(results, {evaluator.metricName: metric})
    for metric in ("rmse", "mae")
)

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"Mean Absolute Error (MAE) on test data = {mae}")

# Results and execution times recorded from earlier runs
# RMSE = [0.5924599017155424, 0.5932027183931239, 0.5940762882673567,  0.5940280721323961, 0.5935672799230375]
# MAE = [0.4348886663006307, 0.43569725601659665, 0.43573900703691487, 0.43545428278578374, 0.4358769029366571]
# core_times_two = [178.8632071018219, 183.33798360824585, 236.35156774520874, 240.50324726104736, 258.4136860370636]
# core_times_four = [152.64005160331726, 178.73205757141113, 210.48839592933655, 247.11537528038025, 285.8922829627991]
# core_times_six = [55.3837103843689, 57.47389340400696, 59.771382331848145, 68.23452496528625, 297.89829540252686]

# Seconds elapsed since START_TIME was recorded
elapsed = time.time() - START_TIME
print(f'Exeuction time: {elapsed}')

Root Mean Squared Error (RMSE) on test data = 0.594281771012872
Mean Absolute Error (MAE) on test data = 0.4355342535464535
Exeuction time: 285.8922829627991
