### Initialize Spark Session 

In [1]:
# initialize spark session
import os
import findspark
# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine; fall back to the original install location when it is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'))
import pyspark
from pyspark.sql import SparkSession
# getOrCreate() reuses an already running session when the cell is re-run
spark = SparkSession.builder.appName('iteration4').getOrCreate()

### Data Import and Exploration

In [None]:
# Load the CSV with a header row and no schema: Spark reads every column as a string
df = spark.read.option('header', True).csv('dataset.csv')

In [None]:
# data format
from pyspark.sql.types import (StructField, StructType,
                               TimestampType, IntegerType, FloatType)
# (column name, Spark type) pairs, listed in the order the schema declares them
# (alternative: pass inferSchema=True when loading the dataframe)
column_types = [
    ('instant', IntegerType),
    ('dteday', TimestampType),
    ('season', IntegerType),
    ('yr', IntegerType),
    ('mnth', IntegerType),
    ('hr', IntegerType),
    ('holiday', IntegerType),
    ('weekday', IntegerType),
    ('workingday', IntegerType),
    ('weathersit', IntegerType),
    ('temp', FloatType),
    ('atemp', FloatType),
    ('hum', FloatType),
    ('windspeed', FloatType),
    ('casual', IntegerType),
    ('registered', IntegerType),
    ('cnt', IntegerType),
]
# every field is declared nullable
data_schema = [StructField(column_name, spark_type(), True)
               for column_name, spark_type in column_types]

final_struct = StructType(fields = data_schema)
# load the CSV again, this time with the hand-written schema
df = spark.read.csv('dataset.csv', schema=final_struct, header=True)

In [2]:
# Load the CSV and let Spark infer each column's type from the data (CSV reader option)
df = spark.read.options(header=True, inferSchema=True).csv('dataset.csv')

In [None]:
# Optional peek at the first rows: df.show(5) or df.head()
# Print every column's name, data type and nullability
df.printSchema() # alternatives: df.columns and df.dtypes
# Optional row count (runs a Spark job over the whole file):
#print('Number of Rows: ', df.count())

In [None]:
# Import the relevant Python libraries.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
# Collect the whole Spark DataFrame into a pandas DataFrame for plotting.
# `data` is taken from `df`, so it still holds every original column even after
# later cells drop columns from the Spark-side frames.
data = df.toPandas()

In [None]:
# Box plot of rental count (cnt) for each value of yr.
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally, and the keyword form also works on older releases.
sn.boxplot(x=data['yr'], y=data['cnt'])
plt.title('the influnce of year')

In [None]:
# Point plot of rental count (cnt) against month (mnth).
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally, and the keyword form also works on older releases.
sn.pointplot(x=data['mnth'], y=data['cnt'])
plt.title('the influnce of month')

In [None]:
# Bar plot of rental count (cnt) for each hour of the day (hr).
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally, and the keyword form also works on older releases.
sn.barplot(x=data['hr'], y=data['cnt'])
plt.title('the influnce of hours in a day')

In [None]:
# Bar plot of rental count (cnt) for each weather situation code (weathersit).
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally, and the keyword form also works on older releases.
sn.barplot(x=data['weathersit'], y=data['cnt'])
plt.title('the influnce of weather')

In [3]:
# data audit
# Only the first row of describe() is shown, i.e. the non-null count per column,
# which reveals how many values are missing in each audited column.
audit_columns = ['instant', 'temp', 'atemp', 'hum', 'cnt']
audit_summary = df.describe(*audit_columns)
audit_summary.show(1)

+-------+-------+-----+-----+-----+-----+
|summary|instant| temp|atemp|  hum|  cnt|
+-------+-------+-----+-----+-----+-----+
|  count|  13411|16879|16655|16664|17372|
+-------+-------+-----+-----+-----+-----+
only showing top 1 row



### Data Preparation

In [None]:
# Data Select
# Bar plot of rental count (cnt) for each weekday.
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally, and the keyword form also works on older releases.
sn.barplot(x=data['weekday'], y=data['cnt'])
plt.title('rental counts in each weekday')

In [None]:
# Bar plot of rental count (cnt) for each season.
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally, and the keyword form also works on older releases.
sn.barplot(x=data['season'], y=data['cnt'])
plt.title('the influnce of season')

In [4]:
# Columns excluded from the modelling frame
excluded_columns = ['dteday', 'registered', 'casual', 'season', 'weekday']
df_selected = df.drop(*excluded_columns)

In [None]:
# Confirm which columns remain after the selection step
df_selected.printSchema()

In [5]:
# Data Clean
# Two steps in one chain: discard the 'instant' column, then discard every row
# whose target 'cnt' is null.
df_cleaned = (
    df_selected
    .drop('instant')
    .dropna(subset=['cnt'])
)

In [None]:
# Preview two rows, summarise the target column, then report the number of rows
# left after cleaning (the count is the cell's displayed result)
df_cleaned.show(2)
df_cleaned.describe('cnt').show()
df_cleaned.count()

In [6]:
# fill null values with mean of values in 'temp', 'atemp', 'hum'
from pyspark.sql.functions import mean
impute_columns = ['temp', 'atemp', 'hum']
# Compute the three column means in a single Spark job instead of three.
# As before, the means are taken over the full `df`.
means_row = df.select([mean(name).alias(name) for name in impute_columns]).first()
mean_temp = means_row['temp']
mean_atemp = means_row['atemp']
mean_hum = means_row['hum']
# Named `fill_values` rather than `mean` so the imported mean() function is not
# overwritten by a dict for the rest of the session.
fill_values = {'temp': mean_temp, 'atemp': mean_atemp, 'hum': mean_hum}
# na.fill with a dict replaces nulls per column; re-running the cell is harmless
df_cleaned = df_cleaned.na.fill(fill_values)

In [7]:
# Verify the imputation: show(1) prints only the 'count' row, i.e. the number
# of non-null values in each filled column and in the target
df_cleaned.describe('temp', 'atemp', 'hum', 'cnt').show(1)
# Confirm column types and nullability after cleaning
df_cleaned.printSchema()

+-------+-----+-----+-----+-----+
|summary| temp|atemp|  hum|  cnt|
+-------+-----+-----+-----+-----+
|  count|17372|17372|17372|17372|
+-------+-----+-----+-----+-----+
only showing top 1 row

root
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = false)
 |-- atemp: double (nullable = false)
 |-- hum: double (nullable = false)
 |-- windspeed: double (nullable = true)
 |-- cnt: integer (nullable = true)



In [None]:
# Construct Data
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
# One encoder per categorical integer column; each writes a new vector column
mnthEncoder = OneHotEncoder(outputCol='mnthVec', inputCol='mnth')
hrEncoder = OneHotEncoder(outputCol='hrVec', inputCol='hr')
weatherEncoder = OneHotEncoder(outputCol='weatherVec', inputCol='weathersit')
# Run the three encoders as a single pipeline over the cleaned frame
encoder_stages = [mnthEncoder, hrEncoder, weatherEncoder]
pipeline = Pipeline(stages = encoder_stages)
pipeline_model = pipeline.fit(df_cleaned)
df_constructed = pipeline_model.transform(df_cleaned)

In [None]:
# Check that the three encoded vector columns were added
df_constructed.printSchema()

### Data Transform

In [None]:
# Reduce the data
# Drop the raw integer columns that the one-hot encoders used as their inputs
encoded_sources = ['mnth', 'hr', 'weathersit']
df_reduced = df_constructed.drop(*encoded_sources)

In [None]:
# Confirm the columns that remain after the reduction
df_reduced.printSchema()

In [None]:
# Project the data
from pyspark.ml.feature import VectorAssembler
# Every column except the label 'cnt' becomes a model input, in frame order
featuresCol = [column for column in df_reduced.columns if column != 'cnt']
# Pack the input columns into one vector column named 'features'
assembler = VectorAssembler(outputCol = 'features', inputCols = featuresCol)
df_projected = assembler.transform(df_reduced)

In [None]:
# Preview the projected frame, including the assembled 'features' column
df_projected.show()

In [None]:
# Keep only the label and the assembled feature vector for modelling
final_data = df_projected.select(['cnt', 'features'])
final_data.show(n=5)

### Model Select

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import (DecisionTreeRegressor,
                                   GBTRegressor,
                                   RandomForestRegressor)
# One shared evaluator: R2 between the 'cnt' label and the 'prediction' column
evaluator = RegressionEvaluator(metricName='r2',
                                labelCol='cnt',
                                predictionCol='prediction')

In [None]:
#create a sample for model test
# Roughly 10% of the rows go to `sample`; the remainder `x` is not used here.
# A fixed seed makes the split the same on every run.
sample, x = final_data.randomSplit([0.1, 0.9], seed=42)

In [None]:
# fit models with different parameters and evaluate
# random forest regression model with maxDepth: 3, 6, 9,..., 30
# Scoring a model on the rows it was fitted on rewards memorisation (R2 simply
# rises with depth), so part of the sample is held out and used only for scoring.
rfr_fit_rows, rfr_holdout_rows = sample.randomSplit([0.7, 0.3], seed=42)
# plain Python ints: Spark ML params do not need to convert numpy integers
rfr_depths = list(range(3, 33, 3))
r2_rfr = np.zeros(len(rfr_depths))
for i, depth in enumerate(rfr_depths):
    # fixed seed so the randomised forest is repeatable between runs
    rfr = RandomForestRegressor(labelCol='cnt', maxDepth=depth, seed=42)
    rfrModel = rfr.fit(rfr_fit_rows)
    prediction_rfr = rfrModel.transform(rfr_holdout_rows)
    r2_rfr[i] = evaluator.evaluate(prediction_rfr)
# held-out R2 against maxDepth
plt.plot(rfr_depths, r2_rfr)
r2_rfr

In [None]:
# Gradient Boosted Trees model with maxIter: 10, 20, 30,..., 100
# Scoring a model on the rows it was fitted on rewards memorisation, so part of
# the sample is held out and used only for scoring.
gbt_fit_rows, gbt_holdout_rows = sample.randomSplit([0.7, 0.3], seed=42)
# plain Python ints: Spark ML params do not need to convert numpy integers
gbt_iterations = list(range(10, 101, 10))
r2_gbt = np.zeros(len(gbt_iterations))
for i, n_iter in enumerate(gbt_iterations):
    gbt = GBTRegressor(labelCol='cnt', maxIter = n_iter)
    gbtModel = gbt.fit(gbt_fit_rows)
    prediction_gbt = gbtModel.transform(gbt_holdout_rows)
    r2_gbt[i] = evaluator.evaluate(prediction_gbt)
# held-out R2 against maxIter
plt.plot(gbt_iterations, r2_gbt)
r2_gbt

In [None]:
# Decision Tree Regression model with maxDepth: 3, 6, 9,..., 30
# Scoring a tree on the rows it was fitted on rewards memorisation (R2 simply
# rises with depth), so part of the sample is held out and used only for scoring.
dtr_fit_rows, dtr_holdout_rows = sample.randomSplit([0.7, 0.3], seed=42)
# plain Python ints: Spark ML params do not need to convert numpy integers
dtr_depths = list(range(3, 33, 3))
r2_dtr = np.zeros(len(dtr_depths))
for i, depth in enumerate(dtr_depths):
    dtr = DecisionTreeRegressor(labelCol='cnt', maxDepth=depth)
    dtrModel = dtr.fit(dtr_fit_rows)
    prediction_dtr = dtrModel.transform(dtr_holdout_rows)
    r2_dtr[i] = evaluator.evaluate(prediction_dtr)
# held-out R2 against maxDepth
plt.plot(dtr_depths, r2_dtr)
r2_dtr

### Data Mining

In [None]:
# split data into train and test
# Roughly 70% of the rows for fitting and 30% for evaluation; the fixed seed
# keeps the split, and therefore the reported scores, the same between runs.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Gradient-boosted trees with 80 boosting iterations.
# featuresCol and predictionCol are spelled out with their default values.
GBT = GBTRegressor(featuresCol='features', labelCol='cnt',
                   predictionCol='prediction', maxIter=80)
# fit on the training rows, then predict the test rows
GBTmodel = GBT.fit(train)
prediction_GBT = GBTmodel.transform(test)

In [None]:
# Single decision tree limited to depth 20.
# featuresCol and predictionCol are spelled out with their default values.
DTR = DecisionTreeRegressor(featuresCol='features', labelCol='cnt',
                            predictionCol='prediction', maxDepth=20)
# fit on the training rows, then predict the test rows
DTRmodel = DTR.fit(train)
prediction_DTR = DTRmodel.transform(test)

In [None]:
# Random forest with trees limited to depth 20.
# The forest is randomised, so a fixed seed is set to make the fitted model
# and its score repeatable between runs.
RFR = RandomForestRegressor(labelCol='cnt', maxDepth=20, seed=42)
# fit on the training rows, then predict the test rows
RFRmodel = RFR.fit(train)
prediction_RFR = RFRmodel.transform(test)

In [None]:
# search patterns
axes = plt.subplots(2, 1, figsize = (16,10))
ax1 = plt.subplot(2, 1, 1)
sn.pointplot(data['hr'], data['cnt'], hue=data['workingday'], ax=ax1)
ax1.set_title('the influnce of hour in workingday')
ax2 = plt.subplot(2, 2, 3)
sn.pointplot(data['hr'], data['cnt'], hue=data['holiday'], ax=ax2)
ax2.set_title('the influnce of hour in holiday')
ax3 = plt.subplot(2, 2, 4)
sn.pointplot(data['hr'], data['cnt'], hue=data['weekday'], ax=ax3)
ax3.set_title('the influnce of hour in weekday')

### Interpretation

In [None]:
#virsualize patterns
axes = plt.subplots(2, 1, figsize = (16,10))
ax1 = plt.subplot(2, 1, 1)
sn.pointplot(data['hr'], data['registered'], ax=ax1)
ax1.set_title('registered rental counts in a day')
ax2 = plt.subplot(2, 1, 2)
sn.pointplot(data['hr'], data['casual'], ax=ax2)
ax2.set_title('casual rental counts in a day')

In [None]:
# Score the three fitted models on their test-set predictions with the shared
# R2 evaluator, then report each score.
r2_GBT, r2_DTR, r2_RFR = [
    evaluator.evaluate(predictions)
    for predictions in (prediction_GBT, prediction_DTR, prediction_RFR)
]
score_report = [
    ('R2 Score of GBT Regression: ', r2_GBT),
    ('R2 Score of Decision Tree Regression: ', r2_DTR),
    ('R2 Score of Random Forest Regression: ', r2_RFR),
]
for message, r2_score in score_report:
    print(message, r2_score)