## Bike Rental DataSet from UCI Machine Learning Repository

### Model improovement

#### By Matthieu Hanania & Karis Gwet

### Data Visualization

In this part, I try to understant what is the dataset

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler,StringIndexer,MinMaxScaler

In [0]:
rowData = spark.read.csv("/FileStore/tables/Bike_Rental_UCI_dataset.csv", inferSchema=True, header = True)

In [0]:
# I transform all the data into numericals, and I drop the non numericals
indexer = StringIndexer(inputCol='dayOfWeek', outputCol='day_cat')

indexed_data =indexer.fit(rowData).transform(rowData)

indexed_data = indexed_data.drop('dayOfWeek','days')

indexed_data.show(5)

+------+---+----+---+-------+----------+----------+----+----+---------+------+-------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|demand|day_cat|
+------+---+----+---+-------+----------+----------+----+----+---------+------+-------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|    16|    0.0|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|    40|    0.0|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|    32|    0.0|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|    13|    0.0|
|     1|  0|   1|  4|      0|         0|         1|0.24|0.75|      0.0|     1|    0.0|
+------+---+----+---+-------+----------+----------+----+----+---------+------+-------+
only showing top 5 rows



In [0]:
# I put all the features in one column
vector= VectorAssembler(inputCols=indexed_data.columns, outputCol = 'features')

data= vector.transform(indexed_data)

data.show(5,truncate=False)

+------+---+----+---+-------+----------+----------+----+----+---------+------+-------+----------------------------------------------------+
|season|yr |mnth|hr |holiday|workingday|weathersit|temp|hum |windspeed|demand|day_cat|features                                            |
+------+---+----+---+-------+----------+----------+----+----+---------+------+-------+----------------------------------------------------+
|1     |0  |1   |0  |0      |0         |1         |0.24|0.81|0.0      |16    |0.0    |(12,[0,2,6,7,8,10],[1.0,1.0,1.0,0.24,0.81,16.0])    |
|1     |0  |1   |1  |0      |0         |1         |0.22|0.8 |0.0      |40    |0.0    |[1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.22,0.8,0.0,40.0,0.0] |
|1     |0  |1   |2  |0      |0         |1         |0.22|0.8 |0.0      |32    |0.0    |[1.0,0.0,1.0,2.0,0.0,0.0,1.0,0.22,0.8,0.0,32.0,0.0] |
|1     |0  |1   |3  |0      |0         |1         |0.24|0.75|0.0      |13    |0.0    |[1.0,0.0,1.0,3.0,0.0,0.0,1.0,0.24,0.75,0.0,13.0,0.0]|
|1     |0  |1   |4  

### Data Preparation

In this part, I normalize the data. It will help to have a good model

In [0]:
standarScaler = StandardScaler(inputCol="features",outputCol="STfeatures")

STdata = standarScaler.fit(data).transform(data)

In [0]:
mmScaler = MinMaxScaler(inputCol="STfeatures",outputCol="NRfeatures")

NRdata = mmScaler.fit(STdata).transform(STdata)

NRdata.select("NRfeatures").show(2,truncate=False)

+---------------------------------------------------------------------------------+
|NRfeatures                                                                       |
+---------------------------------------------------------------------------------+
|(12,[7,8,10],[0.22448979591836735,0.8100000000000002,0.015368852459016391])      |
|(12,[3,7,8,10],[0.04347826086956522,0.2040816326530612,0.8,0.039959016393442626])|
+---------------------------------------------------------------------------------+
only showing top 2 rows



### Model creation

here, I create a linear regression model

In [0]:
# create the data and split the dataset
modelData = NRdata.select('NRfeatures', 'demand') 
trainData, testData = modelData.randomSplit([0.8, 0.2])

In [0]:
#create the linear regression 
lr = LinearRegression(featuresCol="NRfeatures",labelCol='demand')

lrModel = lr.fit(trainData)


# we can see that the model is good !
print(lrModel.summary.r2)
lrModel.summary.meanAbsoluteError

1.0
Out[56]: 1.1240905133292503e-11

In [0]:
#the model has good predictions !
lrModel.summary.predictions.show(n=10, truncate = False)

+-------------------------------------------------------------------------------------------------------------------------------------------+------+------------------+
|NRfeatures                                                                                                                                 |demand|prediction        |
+-------------------------------------------------------------------------------------------------------------------------------------------+------+------------------+
|(12,[0,1,2,7,8,10],[0.33333333333333337,1.0,0.18181818181818185,0.5714285714285714,0.68,0.15881147540983606])                              |156.0 |156.0000000000049 |
|(12,[0,1,2,7,8,10],[0.33333333333333337,1.0,0.3636363636363636,0.5918367346938775,0.8300000000000001,0.1557377049180328])                  |153.0 |153.00000000001134|
|(12,[0,1,2,7,8,10],[1.0,1.0,1.0,0.2857142857142857,0.7,0.09528688524590163])                                                               |94.0  |93.999999999

In [0]:
print ("explainedVariance={}".format(lrModel.summary.explainedVariance))
print ("meanAbsoluteError=%g" %lrModel.summary.meanAbsoluteError)

explainedVariance=33217.17626143447
meanAbsoluteError=1.12409e-11


In [0]:
# When we evaluate it, the resitduals are very slow
testResults = lrModel.evaluate(testData)

testResults.residuals.show(n=10,truncate=False)


from pyspark.sql.functions import abs
df= testResults.residuals
df.select(abs(df.residuals)).groupBy().avg().show(truncate=False)


+----------------------+
|residuals             |
+----------------------+
|-8.910205906431656E-12|
|-6.110667527536862E-13|
|1.4097167877480388E-11|
|2.6147972675971687E-12|
|-3.637978807091713E-12|
|5.6274984672199935E-12|
|1.3241852059309167E-11|
|1.4779288903810084E-11|
|-5.229594535194337E-12|
|1.092459456231154E-11 |
+----------------------+
only showing top 10 rows

+----------------------+
|avg(abs(residuals))   |
+----------------------+
|1.0954552648854687E-11|
+----------------------+



In [0]:
print ("r2=%g"%testResults.r2)   # my model explains x % of the variance of the data

r2=1
