# Bike Rental DataSet from UCI Machine Learning Repository
## Citations
Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelber
## Attributes on original data


- season : season (1:spring, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit : 
 - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
 - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
 - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
 - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)


## URL:
https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset

In [0]:
rowData = spark.read.csv("/FileStore/tables/Bike_Rental_UCI_dataset.csv", inferSchema=True, header = True)

In [0]:
rowData.show(n=5)

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|    13|
|     1|  0|   1|  4|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|     1|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
only showing top 5 rows



In [0]:
rowData.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- dayOfWeek: string (nullable = true)
 |-- days: integer (nullable = true)
 |-- demand: integer (nullable = true)



In [0]:
rowData.groupBy('dayOfWeek').count().show()

+---------+-----+
|dayOfWeek|count|
+---------+-----+
|      Sun| 2502|
|      Mon| 2479|
|      Sat| 2512|
|      Wed| 2475|
|      Tue| 2453|
|      Fri| 2487|
|      Thr| 2471|
+---------+-----+



In [0]:
rowData.groupBy('mnth').count().show()

+----+-----+
|mnth|count|
+----+-----+
|  12| 1483|
|   1| 1429|
|   6| 1440|
|   3| 1473|
|   5| 1488|
|   9| 1437|
|   4| 1437|
|   8| 1475|
|   7| 1488|
|  10| 1451|
|  11| 1437|
|   2| 1341|
+----+-----+



In [0]:
rowData.select("days").distinct().count()

Out[6]: 725

In [0]:
rowData.groupBy('yr').count().show()

+---+-----+
| yr|count|
+---+-----+
|  1| 8734|
|  0| 8645|
+---+-----+



In [0]:
rowData.groupBy('season').count().show()

+------+-----+
|season|count|
+------+-----+
|     1| 4242|
|     3| 4496|
|     4| 4232|
|     2| 4409|
+------+-----+



In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexer = StringIndexer(inputCol='dayOfWeek', outputCol='day_cat')

In [0]:
indexed_data =indexer.fit(rowData).transform(rowData)

In [0]:
indexed_data.show()

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|day_cat|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|    0.0|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|    0.0|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|    0.0|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|    13|    0.0|
|     1|  0|   1|  4|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|     1|    0.0|
|     1|  0|   1|  5|      0|         0|         2|0.24|0.75|   0.0896|      Sat|   0|     1|    0.0|
|     1|  0|   1|  6|      0|         0|         1|0.22| 0.8|      0.0|      Sat| 

In [0]:
indexed_data.select('day_cat').distinct().orderBy('day_cat').show()

+-------+
|day_cat|
+-------+
|    0.0|
|    1.0|
|    2.0|
|    3.0|
|    4.0|
|    5.0|
|    6.0|
+-------+



In [0]:
indexed_data.groupBy('day_cat').count().orderBy('day_cat').show()

+-------+-----+
|day_cat|count|
+-------+-----+
|    0.0| 2512|
|    1.0| 2502|
|    2.0| 2487|
|    3.0| 2479|
|    4.0| 2475|
|    5.0| 2471|
|    6.0| 2453|
+-------+-----+



In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [0]:
indexed_data.columns


Out[16]: ['season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'workingday',
 'weathersit',
 'temp',
 'hum',
 'windspeed',
 'dayOfWeek',
 'days',
 'demand',
 'day_cat']

In [0]:
vec = VectorAssembler(
  inputCols= [
    'season',
    'yr',
    'mnth',
    'hr',
    'holiday',
    'workingday',
    'weathersit',
    'temp',
    'hum',
    'windspeed',
    'day_cat'
    ],
   outputCol = 'features'                  
 )

In [0]:
data = vec.transform(indexed_data)

In [0]:
data.show(truncate=False)

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+---------------------------------------------------+
|season|yr |mnth|hr |holiday|workingday|weathersit|temp|hum |windspeed|dayOfWeek|days|demand|day_cat|features                                           |
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+---------------------------------------------------+
|1     |0  |1   |0  |0      |0         |1         |0.24|0.81|0.0      |Sat      |0   |16    |0.0    |(11,[0,2,6,7,8],[1.0,1.0,1.0,0.24,0.81])           |
|1     |0  |1   |1  |0      |0         |1         |0.22|0.8 |0.0      |Sat      |0   |40    |0.0    |(11,[0,2,3,6,7,8],[1.0,1.0,1.0,1.0,0.22,0.8])      |
|1     |0  |1   |2  |0      |0         |1         |0.22|0.8 |0.0      |Sat      |0   |32    |0.0    |(11,[0,2,3,6,7,8],[1.0,1.0,2.0,1.0,0.22,0.8])      |
|1     |0  |1   |3  |0      |0         |1         |0.24|0.75|0.0      |Sat  

In [0]:
data.take(1)

Out[20]: [Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}))]

In [0]:
for item in data.take(1)[0]:
    print (item)

1
0
1
0
0
0
1
0.24
0.81
0.0
Sat
0
16
0.0
(11,[0,2,6,7,8],[1.0,1.0,1.0,0.24,0.81])


In [0]:
for item in data.take(3):
  print (item)
  print('\n')

Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}))


Row(season=1, yr=0, mnth=1, hr=1, holiday=0, workingday=0, weathersit=1, temp=0.22, hum=0.8, windspeed=0.0, dayOfWeek='Sat', days=0, demand=40, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 3: 1.0, 6: 1.0, 7: 0.22, 8: 0.8}))


Row(season=1, yr=0, mnth=1, hr=2, holiday=0, workingday=0, weathersit=1, temp=0.22, hum=0.8, windspeed=0.0, dayOfWeek='Sat', days=0, demand=32, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 3: 2.0, 6: 1.0, 7: 0.22, 8: 0.8}))




In [0]:
modelData = data.select('features', 'demand') 

In [0]:
modelData.show(truncate =False)

+---------------------------------------------------+------+
|features                                           |demand|
+---------------------------------------------------+------+
|(11,[0,2,6,7,8],[1.0,1.0,1.0,0.24,0.81])           |16    |
|(11,[0,2,3,6,7,8],[1.0,1.0,1.0,1.0,0.22,0.8])      |40    |
|(11,[0,2,3,6,7,8],[1.0,1.0,2.0,1.0,0.22,0.8])      |32    |
|(11,[0,2,3,6,7,8],[1.0,1.0,3.0,1.0,0.24,0.75])     |13    |
|(11,[0,2,3,6,7,8],[1.0,1.0,4.0,1.0,0.24,0.75])     |1     |
|[1.0,0.0,1.0,5.0,0.0,0.0,2.0,0.24,0.75,0.0896,0.0] |1     |
|(11,[0,2,3,6,7,8],[1.0,1.0,6.0,1.0,0.22,0.8])      |2     |
|(11,[0,2,3,6,7,8],[1.0,1.0,7.0,1.0,0.2,0.86])      |3     |
|(11,[0,2,3,6,7,8],[1.0,1.0,8.0,1.0,0.24,0.75])     |8     |
|(11,[0,2,3,6,7,8],[1.0,1.0,9.0,1.0,0.32,0.76])     |14    |
|[1.0,0.0,1.0,10.0,0.0,0.0,1.0,0.38,0.76,0.2537,0.0]|36    |
|[1.0,0.0,1.0,11.0,0.0,0.0,1.0,0.36,0.81,0.2836,0.0]|56    |
|[1.0,0.0,1.0,12.0,0.0,0.0,1.0,0.42,0.77,0.2836,0.0]|84    |
|[1.0,0.0,1.0,13.0,0.0,0

In [0]:
trainData, testData = modelData.randomSplit([0.7, 0.3])

In [0]:
modelData.describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|             17379|
|   mean|189.46308763450142|
| stddev| 181.3875990918646|
|    min|                 1|
|    max|               977|
+-------+------------------+



In [0]:
trainData.describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|             12143|
|   mean|189.95882401383514|
| stddev|182.09510526251742|
|    min|                 1|
|    max|               977|
+-------+------------------+



In [0]:
testData.describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|              5236|
|   mean|188.31340718105423|
| stddev|179.74811660987626|
|    min|                 1|
|    max|               976|
+-------+------------------+



In [0]:
help(LinearRegression)

Help on class LinearRegression in module pyspark.ml.regression:

class LinearRegression(_JavaRegressor, _LinearRegressionParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  LinearRegression(*, featuresCol: str = 'features', labelCol: str = 'label', predictionCol: str = 'prediction', maxIter: int = 100, regParam: float = 0.0, elasticNetParam: float = 0.0, tol: float = 1e-06, fitIntercept: bool = True, standardization: bool = True, solver: str = 'auto', weightCol: Optional[str] = None, aggregationDepth: int = 2, loss: str = 'squaredError', epsilon: float = 1.35, maxBlockSizeInMB: float = 0.0)
 |  
 |  Linear regression.
 |  
 |  The learning objective is to minimize the specified loss function, with regularization.
 |  This supports two kinds of loss:
 |  
 |  * squaredError (a.k.a squared loss)
 |  * huber (a hybrid of squared error for relatively small errors and absolute error for     relatively large ones, and we estimate the scale parameter from training data

In [0]:
lr = LinearRegression(labelCol='demand')

In [0]:
lr.explainParam("elasticNetParam")

Out[31]: 'elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)'

In [0]:
lr.explainParams()

Out[32]: 'aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\nepsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)\nfeaturesCol: features column name. (default: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label, current: demand)\nloss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)\nmaxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)\nmaxIter: max number of itera

In [0]:
trainData.cache()

Out[33]: DataFrame[features: vector, demand: int]

In [0]:
testData.cache()

Out[34]: DataFrame[features: vector, demand: int]

In [0]:
lrModel = lr.fit(trainData)

In [0]:
summary = lrModel.summary

In [0]:
summary.explainedVariance

Out[37]: 13051.816703181148

In [0]:
summary.meanAbsoluteError

Out[38]: 105.83726673347506

In [0]:
data.select('demand').describe().show()


+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|             17379|
|   mean|189.46308763450142|
| stddev| 181.3875990918646|
|    min|                 1|
|    max|               977|
+-------+------------------+



In [0]:
summary.r2

Out[40]: 0.3936499389017416

In [0]:
summary.predictions.show(n=20, truncate = False)

+-----------------------------------------------+------+-------------------+
|features                                       |demand|prediction         |
+-----------------------------------------------+------+-------------------+
|(11,[0,1,2,6,7,8],[1.0,1.0,12.0,2.0,0.24,0.7]) |26.0  |7.767053255798576  |
|(11,[0,1,2,6,7,8],[2.0,1.0,3.0,1.0,0.58,0.68]) |156.0 |141.2224272738397  |
|(11,[0,1,2,6,7,8],[2.0,1.0,5.0,1.0,0.6,0.83])  |153.0 |116.46015625278602 |
|(11,[0,1,2,6,7,8],[3.0,1.0,6.0,1.0,0.64,0.83]) |116.0 |148.66718942759502 |
|(11,[0,1,2,6,7,8],[4.0,1.0,12.0,1.0,0.26,0.81])|108.0 |61.38349984462701  |
|(11,[0,1,2,6,7,8],[4.0,1.0,12.0,1.0,0.3,0.7])  |94.0  |94.32977790713895  |
|(11,[0,2,3,6,7,8],[1.0,1.0,1.0,1.0,0.22,0.8])  |40.0  |-79.12374036311121 |
|(11,[0,2,3,6,7,8],[1.0,1.0,2.0,1.0,0.22,0.8])  |32.0  |-71.4037726808723  |
|(11,[0,2,3,6,7,8],[1.0,1.0,2.0,2.0,0.18,0.55]) |16.0  |-39.38474260181509 |
|(11,[0,2,3,6,7,8],[1.0,1.0,3.0,1.0,0.24,0.75]) |13.0  |-48.18706218079777 |

In [0]:
print ("explainedVariance={}".format(summary.explainedVariance))
print ("meanAbsoluteError=%g" %summary.meanAbsoluteError)

explainedVariance=13051.816703181148
meanAbsoluteError=105.837


In [0]:
testResults = lrModel.evaluate(testData)

In [0]:
testResults.residuals.show(n=10)

+------------------+
|         residuals|
+------------------+
|  -67.288580342215|
| 41.46709449855885|
| 42.52390195191663|
|17.587223769603153|
|22.737933753835335|
| 1.162890034208516|
|138.37320030972083|
|109.93647806643759|
| 9.069636066139505|
| 84.09203013125111|
+------------------+
only showing top 10 rows



In [0]:
testResults.residuals.groupBy().avg().show() 

+-------------------+
|     avg(residuals)|
+-------------------+
|0.08950312008645038|
+-------------------+



- The average of the residuals does not reflect the reality as the residuals can be negative
- The mean absolute error is the average of the absolute values of the residuals

In [0]:
from pyspark.sql.functions import abs
df= testResults.residuals
df.select(abs(df.residuals)).groupBy().avg().show()

+-------------------+
|avg(abs(residuals))|
+-------------------+
| 106.67420837950966|
+-------------------+



In [0]:
print ("r2=%g"%testResults.r2)   # my model explains x % of the variance of the data
print ("rootMeanSquaredError=%g"%testResults.rootMeanSquaredError)

r2=0.372493
rootMeanSquaredError=142.375


In [0]:
print ("meanAbsoluteError=%g"%testResults.meanAbsoluteError)

meanAbsoluteError=106.674


## Underfitting !
- decrease regularization parameter? 
- Add more features? Feature Engineering?
- Polynomial Regression? other algorithms? Trees? 

### Anyway let's get some insights from our data !

In [0]:
data.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- dayOfWeek: string (nullable = true)
 |-- days: integer (nullable = true)
 |-- demand: integer (nullable = true)
 |-- day_cat: double (nullable = false)
 |-- features: vector (nullable = true)



In [0]:
insights = lrModel.evaluate(data)

In [0]:
pred = insights.predictions

In [0]:
pred.take(1)

Out[52]: [Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}), prediction=-83.06371978855887)]

In [0]:
pred_res = pred.withColumn('res_abs', abs(pred.prediction-pred.demand))

In [0]:
#for item in pred_res.take(1)[0]: 
#  print (item)


In [0]:
pred_res.take(1)

Out[55]: [Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}), prediction=-83.06371978855887, res_abs=99.06371978855887)]

In [0]:

from pyspark.sql.functions import avg, stddev, format_number

In [0]:
from pyspark.sql.functions import format_number
pred_res.groupBy('hr').agg(format_number(avg('res_abs'), 2).alias('avg_abs_residual'), 
                           format_number(avg('demand'), 2).alias('avg_demand'), 
                           format_number(stddev('prediction'), 2).alias('stddev_prediction'), 
                           format_number(stddev('demand'), 2).alias('stddev_demand')
                          ).sort('hr').show()

+---+----------------+----------+-----------------+-------------+
| hr|avg_abs_residual|avg_demand|stddev_prediction|stddev_demand|
+---+----------------+----------+-----------------+-------------+
|  0|           60.43|     53.90|            78.42|        42.31|
|  1|           71.89|     33.38|            77.35|        33.54|
|  2|           79.65|     22.87|            75.96|        26.58|
|  3|           89.61|     11.73|            73.66|        13.24|
|  4|           96.52|      6.35|            72.70|         4.14|
|  5|           87.33|     19.89|            73.64|        13.20|
|  6|           53.73|     76.04|            74.43|        55.08|
|  7|          143.15|    212.06|            77.61|       161.44|
|  8|          249.31|    359.01|            82.39|       235.19|
|  9|           77.27|    219.31|            85.06|        93.70|
| 10|           69.87|    173.67|            88.53|       102.21|
| 11|           80.61|    208.14|            90.11|       127.50|
| 12|     