#### Prepare train and test data

In [1]:
from pyspark.ml.feature import VectorAssembler

In [2]:
# Read the preprocessed CSV (header row present, comma-separated, schema inferred).
df = spark.read.csv(
    "data/preprocessed_data.csv",
    header=True,
    sep=",",
    inferSchema=True,
)

# Every column except the regression target is used as a model input.
feature_col = [name for name in df.columns if name != "RentedBikeCount"]

# Pack the input columns into a single vector column named 'features'.
feat_assembler = VectorAssembler(inputCols=feature_col, outputCol='features')
df = feat_assembler.transform(df)

In [28]:
# Peek at the first three assembled feature vectors without truncating the column.
df.select("features").show(3, truncate=False)

+---------------------------------------------------------------------------------------------+
|features                                                                                     |
+---------------------------------------------------------------------------------------------+
|(24,[0,1,2,3,4,5,8,9,10,16,22],[3.0,1.9,91.0,1.4,218.0,0.5,0.9,1.0,12.0,1.0,1.0])            |
|(24,[0,1,2,3,4,5,8,9,10,18,22],[6.0,-8.2,66.0,0.9,1718.0,-13.4,2.0,1.0,12.0,1.0,1.0])        |
|(24,[0,1,2,3,4,5,6,8,9,10,18,22],[14.0,-0.5,44.0,1.0,1793.0,-11.2,0.71,1.7,1.0,12.0,1.0,1.0])|
+---------------------------------------------------------------------------------------------+
only showing top 3 rows



In [39]:
# Keep only the assembled feature vector and the regression target.
data = df.select("features", "RentedBikeCount")

# 75% / 25% random split with a fixed seed.
train, test = data.randomSplit(weights=[0.75, 0.25], seed=100)

# Mark both splits for caching; they are reused by the cells below.
train.cache()
test.cache()

DataFrame[features: vector, RentedBikeCount: int]

In [40]:
# Gather the sizes first, then report them.
n_features = len(feature_col)
n_train = train.count()
n_test = test.count()

print("number of features %d" % n_features)
print("number of training data: %d" % n_train)
print("number of testing data: %d" % n_test)

number of features 24
number of training data: 6608
number of testing data: 2152


#### Random Forest

In [41]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [42]:
def evaluate(rf_model, df, mode="test"):
    """Score a fitted model on a dataset and print its RMSE and R2.

    Args:
        rf_model: Fitted model exposing ``transform(df)``; the transformed
            output must carry a "prediction" column.
        df: Dataset passed to ``rf_model.transform``; it must contain the
            "RentedBikeCount" label column.
        mode: Name of the split, used only in the printed messages
            (e.g. "train" or "test").

    Returns:
        The object returned by ``rf_model.transform(df)``.
    """
    pred = rf_model.transform(df)
    evaluator = RegressionEvaluator(labelCol="RentedBikeCount", predictionCol="prediction", metricName="rmse")
    # A single evaluator serves both metrics: the metric name is overridden
    # per call through the extra param map.
    rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})

    print("Root Mean Squared Error (RMSE) on %s data = %g" % (mode, rmse))
    # R2 is the coefficient of determination, not an RMSE; label it as such.
    print("R-squared (R2) on %s data = %g" % (mode, r2))

    return pred

#### param tuning
- numTrees: [80,100,110,120,130], when numTrees=120, train RMSE: 217.25, R2: 0.8865, test RMSE: 248.91, R2: 0.8513
- maxDepth: [4,6,8,10,12,14,16], when maxDepth=16, train RMSE: 106, R2:0.9727, test RMSE: 200, R2: 0.9038
    - but when maxDepth exceeds 12, the improvement in RMSE and R2 on the test set is very slow while the training time increases significantly, so maxDepth=12 is chosen
- subsamplingRate: [0.4,0.6,0.8,1.0]. Using 1.0 gives the best performance
- featureSubsetStrategy: ["sqrt","all","log2","onethird"]. Using all features gives the best performance. train RMSE: 116, R2: 0.9674, test RMSE: 176, R2: 0.9251
- minInstancesPerNode: [1,2,3,5,10]. Performance decreases slightly when minInstancesPerNode increases. So keep using 1

In [33]:
# Candidate values explored for each random-forest hyper-parameter.
params_grid = dict(
    numTrees=[80, 100, 110, 120, 130],
    maxDepth=[4, 6, 8, 10, 12, 14, 16],
    subsamplingRate=[0.4, 0.6, 0.8, 1.0],
    featureSubsetStrategy=["sqrt", "all", "log2", "onethird"],
    minInstancesPerNode=[1, 2, 3, 5, 10],
)

In [50]:
# Hyper-parameter values selected for the final model.
params = dict(
    numTrees=120,
    maxDepth=12,
    subsamplingRate=1.0,
    featureSubsetStrategy="all",
    minInstancesPerNode=1,
)

In [51]:
# Build the regressor: fixed column names and seed, plus the tuned
# hyper-parameters, whose dict keys match the estimator's keyword names.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="RentedBikeCount",
    predictionCol="prediction",
    seed=100,
    **params,
)

# Fit on the training split only.
model = rf.fit(train)

# Report the configuration, then the metrics on both splits.
print(params)
evaluate(model, train, mode="train")
evaluate(model, test, mode="test")
print()

{'numTrees': 120, 'maxDepth': 12, 'subsamplingRate': 1.0, 'featureSubsetStrategy': 'all', 'minInstancesPerNode': 1}
Root Mean Squared Error (RMSE) on train data = 111.468
Root Mean Squared Error (R2) on train data = 0.970266
Root Mean Squared Error (RMSE) on test data = 191.417
Root Mean Squared Error (R2) on test data = 0.910627



#### Feature importance

In [37]:
# Importance scores as an array, paired positionally with feature_col below.
feat_imp_arr = model.featureImportances.toArray()

# Pair each feature name with its score rounded to 4 decimals, ranked from
# most to least important.
feat_imp = sorted(
    ((name, round(score, 4)) for name, score in zip(feature_col, feat_imp_arr)),
    key=lambda pair: pair[1],
    reverse=True,
)

In [38]:
# Print a heading, then let the notebook display the ranked
# (feature, importance) list as the cell's output value.
print("feature importances")
feat_imp

feature importances


[('Hour', 0.3005),
 ('Temperaturee', 0.286),
 ('FunctioningDay', 0.0912),
 ('SolarRadiation', 0.078),
 ('Humidity_pct', 0.0724),
 ('Rainfall_mm', 0.0416),
 ('Weekend', 0.0307),
 ('Winter', 0.0203),
 ('DewPointTemperature', 0.0196),
 ('Month', 0.0191),
 ('Autumn', 0.0143),
 ('WindSpeed_m_per_s', 0.0056),
 ('Visibility_10m', 0.0052),
 ('has_holiday', 0.0045),
 ('Spring', 0.0031),
 ('Monday', 0.0013),
 ('Sunday', 0.0012),
 ('Wednesday', 0.0012),
 ('Saturday', 0.001),
 ('Friday', 0.0007),
 ('Thursday', 0.0007),
 ('Summer', 0.0007),
 ('Snowfall_cm', 0.0005),
 ('Tuesday', 0.0004)]