In [1]:
from pyspark.sql import SparkSession 
# Start (or reuse) a Spark session running locally on all available cores.
# Note: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)


In [2]:
# Load the bike-rental CSV; the first row is the header and column types
# are inferred from the data.
data = sc.read.csv(
    path="/FileStore/tables/Bike_Rental_UCI_dataset-bb6c6.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the raw data (the defaults spelled out: 20 rows, long cells truncated).
data.show(n=20, truncate=True)

In [4]:
from pyspark.ml.classification import (LogisticRegression,DecisionTreeClassifier,RandomForestClassifier)
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

In [5]:
# Encode the `dayOfWeek` column as a numeric category index in `day_cat`.
indexer = StringIndexer(
    inputCol="dayOfWeek",
    outputCol="day_cat",
)

In [6]:
# Fit the indexer on the full dataset, then add the `day_cat` column to it.
indexer_model = indexer.fit(data)
indexed_data = indexer_model.transform(data)
indexed_data.show(n=20, truncate=True)

In [7]:
# List the distinct category indices produced by the indexer, in ascending order.
(
    indexed_data
    .select('day_cat')
    .distinct()
    .orderBy('day_cat')
    .show()
)

In [8]:
from pyspark.ml.feature import VectorAssembler

# Columns packed, in this order, into the single `features` vector column.
feature_columns = [
    'season',
    'yr',
    'mnth',
    'hr',
    'holiday',
    'workingday',
    'weathersit',
    'days',
    'day_cat',
]
vec = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [9]:
# Append the assembled `features` vector column to the indexed data.
data1 = vec.transform(dataset=indexed_data)

In [10]:
# Preview the data with the new `features` column (defaults spelled out).
data1.show(n=20, truncate=True)

In [11]:
# Keep only what the models need: the feature vector and the `demand` target.
modelData = data1.select(['features', 'demand'])

In [12]:
# Show summary statistics for the modelling DataFrame.
model_stats = modelData.describe()
model_stats.show()

In [13]:
# Print the first 20 rows without truncation so the full feature vectors are visible.
modelData.show(n=20, truncate=False)

In [14]:
# 70/30 train/test split. A fixed seed makes the split (and every metric
# computed downstream) reproducible across kernel restarts; without it
# randomSplit draws a new random seed on every run.
trainData, testData = modelData.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Rename the target to `label`, the column name the estimators below read.
trainData = trainData.withColumnRenamed(existing='demand', new='label')

In [16]:
# Apply the same target rename to the test split.
testData = testData.withColumnRenamed(existing='demand', new='label')

In [17]:
# Inspect the training split with untruncated feature vectors.
trainData.show(n=20, truncate=False)

In [18]:
# Inspect the test split with untruncated feature vectors.
testData.show(n=20, truncate=False)

In [19]:
from pyspark.ml.regression import LinearRegression
# Linear regression with a pure L1 penalty (elasticNetParam=1), fitted on
# the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=1000,
    regParam=0.8,
    elasticNetParam=1,
)
lr_model = lr.fit(trainData)

# Report the fitted parameters.
for caption, value in (
    ("Coefficients: ", lr_model.coefficients),
    ("Intercept: ", lr_model.intercept),
):
    print(caption + str(value))

In [20]:
# Report goodness of fit on the training data, taken from the model summary.
trainingSummary = lr_model.summary
rmse_train = trainingSummary.rootMeanSquaredError
r2_train = trainingSummary.r2
print("RMSE: %f" % rmse_train)
print("r2: %f" % r2_train)

In [21]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

In [22]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [23]:
# Network topology: [input, hidden..., output].
# The input layer must match the length of the `features` vector, which the
# VectorAssembler builds from 9 columns; the original value of 4 does not
# match the data and makes training fail.
# NOTE(review): the output size of 3 assumes `label` has exactly 3 classes
# (values 0, 1, 2) - confirm against the `demand` column.
layers = [9, 5, 4, 3]

In [24]:
# Multilayer perceptron using the topology in `layers`, trained for up to
# 1000 iterations.
classifier = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=1000,
)

In [25]:
# Hyper-parameter grid: 3 candidate topologies x 2 step sizes.
# Every topology starts with 9 input units to match the 9-column `features`
# vector; the original grids used 4, which does not match the data.
# NOTE(review): the output size of 3 assumes 3 label classes - confirm.
gridBuilder = (
    ParamGridBuilder()
    .addGrid(classifier.layers, [[9, 2, 3, 3], [9, 2, 2, 3], [9, 10, 3]])
    .addGrid(classifier.stepSize, [0.03, 0.01])
    .build()
)

In [26]:
# Single-stage pipeline wrapping the MLP classifier.
# Stages must be a list: `(classifier)` is just a parenthesised expression,
# not a tuple, so the original passed a bare estimator that Pipeline.fit
# cannot iterate over.
pipeLine = Pipeline()
pipeLine.setStages([classifier])

In [27]:
# Score candidate models by F1.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
)

In [28]:
# Cross-validator that searches the parameter grid over the pipeline and
# ranks candidates with `evaluator` (default number of folds).
cv = CrossValidator(
    estimator=pipeLine,
    estimatorParamMaps=gridBuilder,
    evaluator=evaluator,
)

In [29]:
# Fit the pipeline on the training split.
# NOTE(review): this fits `pipeLine` directly, so the CrossValidator `cv`
# and its parameter grid are never used - confirm whether
# `cv.fit(trainData)` was intended here.
model = pipeLine.fit(dataset=trainData)



In [30]:
# Fit the bare classifier (outside the pipeline) on the training split.
# NOTE(review): despite its name, `preds` holds the fitted model, not
# predictions; predictions would come from calling `.transform(...)` on it.
preds = classifier.fit(dataset=trainData)

In [31]:
# Instantiate three more classifiers (not yet fitted).
# NOTE(review): rebinding `lr` replaces the LinearRegression estimator that
# an earlier cell assigned to the same name.
lr, dt, rf = (
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(maxBins=10),
)