In [1]:
from pyspark.ml.feature import VectorAssembler

In [2]:
# Read the preprocessed CSV: first row is the header, fields are comma
# separated, and column types are inferred by the reader.
df = spark.read.csv(
    "data/preprocessed_data.csv",
    sep=",",
    header=True,
    inferSchema=True,
)

# Every column except the target "RentedBikeCount" is used as a model input.
feature_col = list(filter(lambda name: name != "RentedBikeCount", df.columns))

# Packs the input columns into a single vector column named "features".
feat_assembler = VectorAssembler(outputCol="features", inputCols=feature_col)

# From here on `df` is the assembler's output rather than the raw CSV frame.
df = feat_assembler.transform(df)

In [3]:
# Peek at the first three assembled vectors without truncating the column.
df.select("features").show(n=3, truncate=False)

+---------------------------------------------------------------------------------------------+
|features                                                                                     |
+---------------------------------------------------------------------------------------------+
|(24,[0,1,2,3,4,5,8,9,10,16,22],[3.0,1.9,91.0,1.4,218.0,0.5,0.9,1.0,12.0,1.0,1.0])            |
|(24,[0,1,2,3,4,5,8,9,10,18,22],[6.0,-8.2,66.0,0.9,1718.0,-13.4,2.0,1.0,12.0,1.0,1.0])        |
|(24,[0,1,2,3,4,5,6,8,9,10,18,22],[14.0,-0.5,44.0,1.0,1793.0,-11.2,0.71,1.7,1.0,12.0,1.0,1.0])|
+---------------------------------------------------------------------------------------------+
only showing top 3 rows



In [4]:
# Keep only the assembled inputs and the target column.
data = df.select(["features", "RentedBikeCount"])
# Seeded random split with weights 0.75 (train) and 0.25 (test).
train, test = data.randomSplit(weights=[0.75, 0.25], seed=100)
# Both splits are reused by later cells (counting, fitting, evaluating).
train.cache()
test.cache()

DataFrame[features: vector, RentedBikeCount: int]

In [5]:
# Report the number of input columns and the row count of each split.
print("number of features {:d}".format(len(feature_col)))
print("number of training data: {:d}".format(train.count()))
print("number of testing data: {:d}".format(test.count()))

number of features 24
number of training data: 6608
number of testing data: 2152


In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [14]:
def evaluate(rf_model, df, mode="test"):
    """Score a fitted model on ``df`` and print its RMSE and R2.

    Parameters
    ----------
    rf_model
        Fitted model exposing ``transform(df)``; its output must contain a
        "prediction" column.
    df
        DataFrame to score. It must contain the label column
        "RentedBikeCount", which is hard-coded here.
    mode : str, default "test"
        Name of the split, used only in the printed messages.

    Returns
    -------
    The DataFrame returned by ``rf_model.transform(df)``.
    """
    pred = rf_model.transform(df)
    evaluator = RegressionEvaluator(labelCol="RentedBikeCount", predictionCol="prediction", metricName="rmse")
    # The same evaluator is reused for both metrics; the param map passed to
    # evaluate() overrides metricName for that single call.
    rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})

    print("Root Mean Squared Error (RMSE) on %s data = %g" % (mode, rmse))
    # Label fixed: this value is the coefficient of determination, not an RMSE.
    print("R-squared (R2) on %s data = %g" % (mode, r2))

    return pred

In [8]:
# Candidate values per GBT hyper-parameter. No cell visible here consumes this
# grid; presumably it is the search space params_GBT was picked from (TODO confirm).
params_grid_GBT = dict(
    maxDepth=list(range(4, 17, 2)),      # 4, 6, ..., 16
    maxBins=list(range(5, 126, 20)),     # 5, 25, ..., 125
    subsamplingRate=[0.4, 0.6, 0.8, 1.0],
    featureSubsetStrategy=["sqrt", "all", "log2", "onethird"],
    minInstancesPerNode=[1, 2, 3, 5, 10],
)

In [10]:
# Hyper-parameters used to build the GBT regressor; the keys match the
# regressor's keyword-argument names.
params_GBT = dict(
    maxDepth=12,
    maxBins=45,
    subsamplingRate=1.0,
    featureSubsetStrategy="all",
    minInstancesPerNode=1,
)

In [12]:
# Unfitted GBT regressor. Column names and the seed are fixed here; the five
# tunable settings (maxDepth, maxBins, minInstancesPerNode, subsamplingRate,
# featureSubsetStrategy) are taken from params_GBT, whose keys are exactly
# those keyword-argument names.
rf_GBT = GBTRegressor(
    featuresCol="features",
    labelCol="RentedBikeCount",
    predictionCol="prediction",
    seed=100,
    **params_GBT,
)

In [15]:
# Fit the regressor on the training split.
model = rf_GBT.fit(train)
# Echo the hyper-parameters so the metrics below are attributable to them.
print(params_GBT)
# Score both splits; evaluate() prints the metrics and the prediction
# DataFrames it returns are discarded here.
evaluate(model, train, mode="train")
evaluate(model, test, mode="test")
# Trailing blank line in the cell output.
print()

{'maxDepth': 12, 'maxBins': 45, 'subsamplingRate': 1.0, 'featureSubsetStrategy': 'all', 'minInstancesPerNode': 1}
Root Mean Squared Error (RMSE) on train data = 24.9557
Root Mean Squared Error (R2) on train data = 0.99851
Root Mean Squared Error (RMSE) on test data = 229.352
Root Mean Squared Error (R2) on test data = 0.871694



In [16]:
# Importance scores as an array, paired positionally with feature_col below.
feat_imp_arr = model.featureImportances.toArray()
# (feature name, importance rounded to 4 decimals), highest first. The sort
# key is the rounded value, so equal rounded scores keep feature_col order.
feat_imp = sorted(
    ((name, round(score, 4)) for name, score in zip(feature_col, feat_imp_arr)),
    key=lambda pair: pair[1],
    reverse=True,
)

In [17]:
# Print a heading, then leave the sorted (feature, importance) list as the
# cell's last expression so the notebook displays it.
print("feature importances")
feat_imp

feature importances


[('Temperaturee', 0.254),
 ('Hour', 0.2459),
 ('SolarRadiation', 0.095),
 ('Humidity_pct', 0.0947),
 ('FunctioningDay', 0.0623),
 ('WindSpeed_m_per_s', 0.0428),
 ('DewPointTemperature', 0.0401),
 ('Weekend', 0.0305),
 ('Visibility_10m', 0.0282),
 ('Month', 0.0264),
 ('Autumn', 0.0186),
 ('Rainfall_mm', 0.0122),
 ('has_holiday', 0.009),
 ('Wednesday', 0.007),
 ('Spring', 0.006),
 ('Monday', 0.0056),
 ('Thursday', 0.0044),
 ('Friday', 0.0039),
 ('Tuesday', 0.0039),
 ('Sunday', 0.0036),
 ('Saturday', 0.0027),
 ('Summer', 0.0024),
 ('Snowfall_cm', 0.0007),
 ('Winter', 0.0002)]