In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Start (or reuse) a local Spark session. Going through the builder's
# getOrCreate() keeps this cell re-runnable: constructing SparkContext('local')
# directly raises ValueError when a context is already active in the kernel.
spark = SparkSession.builder.master('local').getOrCreate()
# Expose the session's underlying SparkContext under the same name as before.
sc = spark.sparkContext

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
# Load the processed CSV: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    path='data/processed/cleaned_taxi_data.csv',
    inferSchema=True,
    header=True,
)
# Peek at the first three rows (shown as the cell's output).
data.take(3)

In [None]:
# Mark the DataFrame for caching so later actions can reuse it. cache()
# returns the DataFrame itself, so printing it shows its column summary.
print(data.cache())
# printSchema() writes the schema to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.printSchema()

In [None]:
# Summary statistics, flipped so that each column of the describe() output
# becomes a row in the displayed pandas table.
data.describe().toPandas().T

In [None]:
# Prepare for ML. Import VectorAssembler and create features
from pyspark.ml.feature import VectorAssembler
 
# Columns used as model inputs.
feat_cols = ['humidity', 'pressure', 'wind_speed', 'wind_direction', 'temperature']

# Pack the input columns into a single vector column named 'features'.
vec_assembler = VectorAssembler(inputCols=feat_cols, outputCol='features')

prep_df = vec_assembler.transform(data)

# Keep only the assembled feature vector and the 'rides' column.
prep_df = prep_df.select(['features', 'rides'])

# Sanity check: the first row's feature vector, then the first three rows.
print(prep_df.head().features)
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
prep_df.show(3)

In [None]:
# Randomly split the prepared rows with weights 0.7 (train) / 0.3 (test).
# The fixed seed makes the split repeatable across re-runs of the notebook.
splits = prep_df.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = splits
print("Training Dataset Count: " + str(train_df.count()))
print("Test Dataset Count: " + str(test_df.count()))

In [None]:
# Train and predict.
# Fit a 100-tree random forest that predicts 'rides' from the 'features'
# vector. The explicit seed makes training repeatable between runs.
rf = RandomForestRegressor(
    labelCol='rides',
    featuresCol='features',
    numTrees=100,
    seed=42,
)
rfModel = rf.fit(train_df)
# Score the test split; transform() returns the rows with the model's
# predictions added as a new column.
predictions = rfModel.transform(test_df)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# Named `evaluator` rather than `eval` so the built-in eval() is not shadowed
# for the rest of the notebook.
evaluator = RegressionEvaluator(labelCol="rides", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error
rmse = evaluator.evaluate(predictions)
print("RMSE: %.3f" % rmse)

# Mean Square Error: reuse the same evaluator, overriding the metric for this
# call only via the param map.
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)