# Bike Rental DataSet from UCI Machine Learning Repository
## Citations
Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg.
## Attributes on original data


- season : season (1:spring, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : whether the day is a holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : 1 if the day is neither a weekend nor a holiday, otherwise 0.
- weathersit : 
 - 1: Clear, Few clouds, Partly cloudy
 - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
 - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
 - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- hum: Normalized humidity. The values are divided by 100 (max)
- windspeed: Normalized wind speed. The values are divided by 67 (max)


## URL:
https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset

In [0]:
# Load the bike-rental CSV; the first line is treated as the header and
# column types are inferred from the data.
rowData = spark.read.csv(
    "/FileStore/tables/ml/Bike_Rental_UCI_dataset-bb6c6.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first five rows of the loaded data.
rowData.show(n=5)

In [0]:
# Print the column names and the types inferred when the CSV was read.
rowData.printSchema()

In [0]:
# Row count for each distinct dayOfWeek value.
rowData.groupBy('dayOfWeek').count().show()

In [0]:
# Row count for each month.
rowData.groupBy('mnth').count().show()

In [0]:
# Number of distinct values in the days column.
rowData.select("days").distinct().count()

In [0]:
# Row count for each year code.
rowData.groupBy('yr').count().show()

In [0]:
# Row count for each season code.
rowData.groupBy('season').count().show()

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Indexer that maps each dayOfWeek value to a numeric category index,
# written to a new day_cat column.
indexer = StringIndexer(
    inputCol='dayOfWeek',
    outputCol='day_cat',
)

In [0]:
# Fit the indexer on the raw data, then use the fitted model to append
# the day_cat column to that same data.
indexer_model = indexer.fit(rowData)
indexed_data = indexer_model.transform(rowData)

In [0]:
# Preview the data with the added day_cat column.
indexed_data.show()

In [0]:
# List the distinct day_cat index values in ascending order.
indexed_data.select('day_cat').distinct().orderBy('day_cat').show()

In [0]:
# Row count per day_cat index, ordered by index.
indexed_data.groupBy('day_cat').count().orderBy('day_cat').show()

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the available column names before choosing the model features.
indexed_data.columns


In [0]:
# Input columns that are packed into the single 'features' vector column
# consumed by the regression model.
feature_columns = [
    'season',
    'yr',
    'mnth',
    'hr',
    'holiday',
    'workingday',
    'weathersit',
    'temp',
    'hum',
    'windspeed',
    'day_cat',
]
vec = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
# Append the assembled 'features' vector column to the indexed data.
data = vec.transform(indexed_data)

In [0]:
# Display the assembled rows without truncating long cell values.
data.show(truncate=False)

In [0]:
# Fetch the first row to inspect its raw contents.
data.take(1)

In [0]:
# Print every field of the first row on its own line.
first_row = data.take(1)[0]
for field_value in first_row:
    print(field_value)

In [0]:
# Print the first three rows, each followed by two blank lines.
for row in data.take(3):
    print(row, end="\n\n\n")

In [0]:
# Keep only the feature vector and the demand column (used as the label below).
modelData = data.select('features', 'demand') 

In [0]:
# Display the modelling data without truncating the feature vectors.
modelData.show(truncate =False)

In [0]:
# Split the modelling data with weights 0.7 (train) and 0.3 (test).
# A fixed seed keeps the split, and every metric computed from it,
# reproducible across notebook runs.
trainData, testData = modelData.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Summary statistics of the full modelling data.
modelData.describe().show()

In [0]:
# Summary statistics of the training split, for comparison with the full data.
trainData.describe().show()

In [0]:
# Summary statistics of the test split, for comparison with the training split.
testData.describe().show()

In [0]:
# Show the built-in help text for the LinearRegression estimator.
help(LinearRegression)

In [0]:
# Linear regression estimator with demand as the label column; every other
# parameter is left at its default.
lr = LinearRegression(labelCol='demand')

In [0]:
# Show the description of the elasticNetParam parameter.
lr.explainParam("elasticNetParam")

In [0]:
# Show the description of every parameter of the estimator.
lr.explainParams()

In [0]:
# Mark the training split for caching before it is used to fit the model.
trainData.cache()

In [0]:
# Mark the test split for caching before it is used for evaluation.
testData.cache()

In [0]:
# Fit the linear regression on the training split.
lrModel = lr.fit(trainData)

In [0]:
# Summary object of the fitted model; its metrics are read in the cells below.
summary = lrModel.summary

In [0]:
# Explained variance reported by the model summary.
summary.explainedVariance

In [0]:
# Mean absolute error reported by the model summary.
summary.meanAbsoluteError

In [0]:
# Summary statistics of demand over the full dataset, as a scale reference
# for the error metrics.
data.select('demand').describe().show()


In [0]:
# R-squared reported by the model summary.
summary.r2

In [0]:
# Display the first 20 rows of the summary's predictions, untruncated.
summary.predictions.show(n=20, truncate = False)

In [0]:
# Report the explained variance and mean absolute error from the model summary.
print(f"explainedVariance={summary.explainedVariance}")
print(f"meanAbsoluteError={summary.meanAbsoluteError:g}")

In [0]:
# Evaluate the fitted model on the held-out test split.
testResults = lrModel.evaluate(testData)

In [0]:
# Display the first ten residuals from the test evaluation.
testResults.residuals.show(n=10)

In [0]:
# Average of the raw (signed) test residuals.
testResults.residuals.groupBy().avg().show() 

- The average of the raw residuals is not a meaningful error measure: positive and negative residuals cancel each other out.
- The mean absolute error is the average of the absolute values of the residuals, so it avoids this cancellation.

In [0]:
# Compute the mean absolute error by hand: take the absolute value of each
# test residual, then average over all rows.
# NOTE: this import shadows Python's builtin abs for the rest of the notebook;
# a later cell relies on that.
from pyspark.sql.functions import abs
df = testResults.residuals
abs_residuals = df.select(abs(df.residuals))
abs_residuals.groupBy().avg().show()

In [0]:
# r2: the share of the variance of the data that the model explains.
print(f"r2={testResults.r2:g}")
print(f"rootMeanSquaredError={testResults.rootMeanSquaredError:g}")

In [0]:
# Mean absolute error on the test split.
print(f"meanAbsoluteError={testResults.meanAbsoluteError:g}")

## Underfitting !
- decrease regularization parameter? 
- Add more features? Feature Engineering?
- Polynomial Regression? other algorithms? Trees? 

### Anyway let's get some insights from our data !

In [0]:
# Print the schema of the full assembled dataset.
data.printSchema()

In [0]:
# Evaluate the model on the full assembled dataset so predictions can be
# analysed next to the original columns. This data includes the rows the
# model was trained on.
insights = lrModel.evaluate(data)

In [0]:
# Predictions DataFrame produced by the evaluation above.
pred = insights.predictions

In [0]:
# Inspect the first prediction row.
pred.take(1)

In [0]:
# Add the absolute residual |prediction - demand| for every row.
# Spark's abs is reached through the functions module so this cell works
# even when the earlier `from pyspark.sql.functions import abs` cell has not
# been run, instead of relying on the builtin abs being shadowed.
from pyspark.sql import functions as F
pred_res = pred.withColumn('res_abs', F.abs(pred.prediction - pred.demand))

In [0]:
#for item in pred_res.take(1)[0]: 
#  print (item)


In [0]:
# Inspect the first row, now including the res_abs column.
pred_res.take(1)

In [0]:

from pyspark.sql.functions import avg, stddev, format_number

In [0]:
from pyspark.sql.functions import format_number

# Per-hour summary, each value formatted to two decimals: mean absolute
# residual, mean demand, and the standard deviations of the predictions
# and of the demand.
hourly_stats = [
    format_number(avg('res_abs'), 2).alias('avg_abs_residual'),
    format_number(avg('demand'), 2).alias('avg_demand'),
    format_number(stddev('prediction'), 2).alias('stddev_prediction'),
    format_number(stddev('demand'), 2).alias('stddev_demand'),
]
pred_res.groupBy('hr').agg(*hourly_stats).orderBy('hr').show()