#### Prepare train and test data

In [15]:
from pyspark.ml.feature import VectorAssembler

In [27]:
# Load the preprocessed dataset, reading column names from the header row
# and letting Spark infer each column's type.
df = spark.read.csv(
    "data/preprocessed_data.csv",
    header=True,
    sep=",",
    inferSchema=True,
)

# Every column except the regression target is used as a model input.
feature_col = list(filter(lambda name: name != "RentedBikeCount", df.columns))

# Pack the input columns into a single vector column named 'features'.
feat_assembler = VectorAssembler(inputCols=feature_col, outputCol='features')
df = feat_assembler.transform(df)

In [28]:
# Preview the first three assembled feature vectors without truncating them.
features_preview = df.select("features")
features_preview.show(n=3, truncate=False)

+---------------------------------------------------------------------------------------------+
|features                                                                                     |
+---------------------------------------------------------------------------------------------+
|(24,[0,1,2,3,4,5,8,9,10,16,22],[3.0,1.9,91.0,1.4,218.0,0.5,0.9,1.0,12.0,1.0,1.0])            |
|(24,[0,1,2,3,4,5,8,9,10,18,22],[6.0,-8.2,66.0,0.9,1718.0,-13.4,2.0,1.0,12.0,1.0,1.0])        |
|(24,[0,1,2,3,4,5,6,8,9,10,18,22],[14.0,-0.5,44.0,1.0,1793.0,-11.2,0.71,1.7,1.0,12.0,1.0,1.0])|
+---------------------------------------------------------------------------------------------+
only showing top 3 rows



In [29]:
# Keep only the assembled feature vector and the regression target.
data = df.select("features", "RentedBikeCount")

# Hold out roughly 25% of the rows for testing. randomSplit assigns rows at
# random, so a fixed seed is passed to keep the split (and therefore the row
# counts and evaluation metrics below) stable from run to run, given the same
# input data and partitioning.
train, test = data.randomSplit([0.75, 0.25], seed=100)

In [30]:
# Gather the sizes first (each count() runs a Spark job), then report them.
n_features = len(feature_col)
n_train_rows = train.count()
n_test_rows = test.count()

print("number of features %d" % n_features)
print("number of training data: %d" % n_train_rows)
print("number of testing data: %d" % n_test_rows)

number of features 24
number of training data: 6550
number of testing data: 2210


#### Random Forest

In [33]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [56]:
# Random forest regressor that predicts RentedBikeCount from the assembled
# 'features' vector. The seed is fixed so repeated fits on the same data
# are comparable.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="RentedBikeCount",
    predictionCol="prediction",
    numTrees=100,
    maxDepth=10,
    subsamplingRate=1,  # fraction of the training data sampled for each tree
    seed=100,
)

model = rf.fit(train)

# Score the held-out test split with the fitted model.
pred = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="RentedBikeCount",
    predictionCol="prediction",
    metricName="rmse",
)
# One evaluator serves both metrics: the param map passed to evaluate()
# overrides metricName for that call only.
rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
# r2 is the coefficient of determination, not an error measure, so it gets
# its own label instead of being reported as a second RMSE.
print("R-squared (R2) on test data = %g" % r2)

Root Mean Squared Error (RMSE) on test data = 222.451
Root Mean Squared Error (R2) on test data = 0.877283


In [63]:
# Pair each input column name with its importance score (rounded to four
# decimals) and rank the pairs from most to least important.
feat_imp_arr = model.featureImportances.toArray()
feat_imp = sorted(
    ((name, round(score, 4)) for name, score in zip(feature_col, feat_imp_arr)),
    key=lambda pair: pair[1],
    reverse=True,
)

In [66]:
# Print a label, then end the cell with the bare list so the notebook
# renders feat_imp as the cell's output.
print("feature importances")
feat_imp

feature importances


[('Hour', 0.2953),
 ('Temperaturee', 0.2041),
 ('FunctioningDay', 0.0853),
 ('Humidity_pct', 0.0793),
 ('Winter', 0.0745),
 ('SolarRadiation', 0.0607),
 ('Month', 0.0468),
 ('Rainfall_mm', 0.0417),
 ('DewPointTemperature', 0.0407),
 ('Visibility_10m', 0.0153),
 ('WindSpeed_m_per_s', 0.0116),
 ('Weekend', 0.0113),
 ('Autumn', 0.0083),
 ('Summer', 0.0069),
 ('Spring', 0.0035),
 ('Sunday', 0.0034),
 ('Saturday', 0.0021),
 ('has_holiday', 0.0016),
 ('Monday', 0.0015),
 ('Tuesday', 0.0014),
 ('Thursday', 0.0013),
 ('Snowfall_cm', 0.0011),
 ('Wednesday', 0.0011),
 ('Friday', 0.0009)]