In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, log, exp, unix_timestamp, countDistinct, greatest, lit

# Obtain the notebook's Spark session (an already-active session is reused by getOrCreate).
session_builder = SparkSession.builder.appName("ML NYC311")
spark = session_builder.getOrCreate()

In [None]:
# Placeholder: replace with the location of the input CSV before running.
uri = "enter your path"

# The first row supplies the column names; column types are inferred from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
original_data = csv_reader.csv(uri)

In [None]:
# Show the inferred column names and types so they can be checked before modelling.
original_data.printSchema()

root
 |-- agency: string (nullable = true)
 |-- complaint_type: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- open_data_channel_type: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- resolution_time: integer (nullable = true)
 |-- created_day_of_week: integer (nullable = true)
 |-- created_hour_of_day: integer (nullable = true)



#### Run with a subset of data for efficiency

In [None]:
# Work on a random ~50% sample (drawn without replacement) to keep run time manageable.
# The seed is fixed so that re-running the notebook yields the same subset; without it,
# the seeded train/test split further down would still differ from run to run.
SAMPLE_FRACTION = 0.5
SAMPLE_SEED = 42
data = original_data.sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)

In [None]:
# Number of rows in the sampled subset.
data.count()

4278120

### ML Data Preprocessing

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

In [None]:
# Replace the regression target with its natural log (overwrites the column in place).
# NOTE(review): assumes resolution_time is strictly positive - confirm upstream cleaning,
# since the log of a non-positive value is not a usable label.
log_resolution_time = log(data["resolution_time"])
data = data.withColumn("resolution_time", log_resolution_time)

#### StringIndexers

In [None]:
categories = ['agency', 'complaint_type', 'borough', 'open_data_channel_type']

# One indexer per categorical column, each writing to "<column>_label".
# handleInvalid="keep" sends NULLs and categories not seen during fit to an extra index
# instead of raising, so the fitted pipeline can score rows it was not fit on
# (e.g. a rare complaint type that only appears in the test split).
string_indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_label", handleInvalid="keep")
    for c in categories
]


#### One Hot Encoding

In [None]:
# Derive the encoder's column names from the categorical columns:
# "<column>_label" (indexer output) is encoded into "<column>_label_oh".
onehot_input_cols = []
onehot_output_cols = []
for category in categories:
    indexed_col = category + "_label"
    onehot_input_cols.append(indexed_col)
    onehot_output_cols.append(indexed_col + "_oh")

# A single encoder handles all indexed columns at once.
encoder = OneHotEncoder(outputCols=onehot_output_cols, inputCols=onehot_input_cols)

#### VectorAssembler

In [None]:
# Feature vector = the one-hot encoded categorical columns followed by the raw numeric columns.
numeric_features = ['created_day_of_week', 'created_hour_of_day', 'latitude', 'longitude']
feature_names = [*onehot_output_cols, *numeric_features]
assembler = VectorAssembler(outputCol="features", inputCols=feature_names)

#### Train Test Split

In [None]:
# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
train, test = data.randomSplit(split_weights, seed=42)

In [None]:
# Number of rows in the training split.
print(train.count())

3422237


### Gradient Boost

In [None]:
import time
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
label_name = "resolution_time"

# Gradient-boosted tree regressor with library-default hyperparameters, reading the
# assembled "features" vector and predicting the label column.
gbt = GBTRegressor(labelCol=label_name, featuresCol="features")

# Stage order: index strings -> one-hot encode -> assemble vector -> regress.
pipeline_stages = [*string_indexers, encoder, assembler, gbt]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit all pipeline stages on the training split only.
model = pipeline.fit(train)

In [None]:
# Score the held-out test split. The metric computed from `predictions` is reported as
# test error; predicting on `train` would only measure fit to rows the model has seen.
# NOTE(review): if the StringIndexer stages run with handleInvalid="error" (the default),
# a category that occurs only in `test` will raise here.
predictions = model.transform(test)

### Evaluations

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the label column and the model's "prediction" column. The label was
# log-transformed earlier in the notebook, so the error is on the log scale.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="resolution_time",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

Root Mean Squared Error (RMSE) on test data = 1.36468
