# Linear Regression

In [1]:
import sys

import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

In [2]:
import pyspark.sql.functions as fn

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [None]:
# Local mode: build (or reuse) the SparkSession for this notebook, named "lr".
# No master is set here, so it is taken from the launch environment / Spark defaults.
# getOrCreate() returns an already-running session when one exists; the recorded
# app-name output further down ('PySparkShell') suggests the saved run used the
# session pre-created by the pyspark shell rather than this cell.
spark = (
    SparkSession.builder
    .appName("lr")
    .getOrCreate()
)

In [None]:
# YARN mode: run this cell INSTEAD of the local-mode cell above, not in addition to it —
# getOrCreate() hands back an already-running session rather than building a new one.
# NOTE(review): 99 executors x 4G is a large request for the ~1.2k-row dataset loaded
# below; confirm these values against the cluster before running.
spark = (
    SparkSession.builder
    .master("yarn")
    .config('spark.executor.instances', '99')
    .config('spark.executor.memory', '4G')
    .appName("lr")  # was "iris": aligned with the local-mode cell and this notebook's topic
    .getOrCreate()
)

In [4]:
# Check the application name of the active Spark session
# (requires `spark` to exist already, from one of the session cells or the pyspark shell)
spark.sparkContext.appName

'PySparkShell'

In [5]:
# Enable Arrow-based columnar data transfer for Spark <-> pandas conversions
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [6]:
# Record runtime versions for reproducibility
# Python interpreter version
sys.version

'3.8.5 (default, Jul 28 2020, 12:59:40) \n[GCC 9.3.0]'

In [7]:
# Spark version of the active session
spark.version

'3.0.1'

### Exploring Data

In [8]:
# Load lr_dataset.csv into a Spark DataFrame (first row is the header; column types are inferred)
# Alternative absolute path (looks like a Vagrant shared folder — confirm before use):
#df = spark.read.csv('file:///vagrant/data/lr_dataset.csv', header=True, inferSchema=True)
df = spark.read.csv('data/lr_dataset.csv', header=True, inferSchema=True)

In [9]:
# Validate the size of the data: (number of rows, number of columns)
df.count(), len(df.columns)

(1232, 6)

In [10]:
# First 5 rows of the regression dataset
df.show(5)

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|  734|  688|   81|0.328|0.259| 0.418|
|  700|  600|   94| 0.32|0.247| 0.389|
|  712|  705|   93|0.311|0.247| 0.417|
|  734|  806|   69|0.315| 0.26| 0.415|
|  613|  759|   61|0.302| 0.24| 0.378|
+-----+-----+-----+-----+-----+------+
only showing top 5 rows



In [11]:
# Column names and the types inferred while reading the CSV
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [12]:
# Summary statistics (count, mean, stddev, min, max) for every column
df.describe().show()

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|            var_1|            var_2|             var_3|               var_4|               var_5|             output|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|             1232|             1232|              1232|                1232|                1232|               1232|
|   mean|715.0819805194806|715.0819805194806| 80.90422077922078|  0.3263311688311693| 0.25927272727272715|0.39734172077922014|
| stddev| 91.5342940441652|93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|    min|              463|              472|                40|               0.277|               0.214|              0.301|
|    max|             1009|             1103|               116|               0.373|               0.294|     

In [13]:
# Check the Pearson correlation between a single feature (var_1) and the label (output)
df.select(fn.corr('var_1','output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



### Feature Engineering

In [14]:
# Display all column names (five var_* inputs plus the 'output' label)
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [15]:
# Vectorize all numerical input columns into a single 'features' vector column.
# Features are chosen by excluding the label by name rather than by slicing off the
# last column, so the label cannot leak into the features if the CSV column order changes.
feature_cols = [col_name for col_name in df.columns if col_name != 'output']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# transform() returns a new DataFrame with the extra 'features' column; df is left unchanged
features_df = assembler.transform(df)

In [16]:
# Validate that the assembled 'features' vector column is now part of the schema
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
# Inspect the first 5 assembled feature vectors, untruncated
features_df.select('features').show(n=5, truncate=False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [18]:
# Keep only what the estimator needs: the feature vector and the label
model_df = features_df.select('features', 'output')

In [19]:
# Ready for machine learning: preview the first 10 (features, output) rows, untruncated
model_df.show(n=10, truncate=False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
|[748.0,676.0,85.0,0.318,0.255]|0.422 |
|[669.0,588.0,97.0,0.315,0.251]|0.411 |
|[667.0,845.0,68.0,0.324,0.251]|0.381 |
|[758.0,890.0,64.0,0.33,0.274] |0.436 |
|[726.0,670.0,88.0,0.335,0.268]|0.422 |
+------------------------------+------+
only showing top 10 rows



In [20]:
# Size of the modelling DataFrame: (number of rows, number of columns)
model_df.count(), len(model_df.columns)

(1232, 2)

### Split Data - Train & Test sets

In [21]:
# Randomly split the data into training and test sets using 0.70 / 0.30 weights;
# the fixed seed is there to make the split repeatable across runs
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

In [22]:
# Training split size: (number of rows, number of columns)
train_df.count(), len(train_df.columns)

(913, 2)

In [23]:
# Test split size: (number of rows, number of columns)
test_df.count(), len(test_df.columns)

(319, 2)

### Build Linear Regression Model 

In [24]:
# Configure the Linear Regression estimator: inputs come from the 'features'
# vector column (the estimator's default), the target is the 'output' column
lin_Reg = LinearRegression(featuresCol='features', labelCol='output')

In [25]:
# Train the estimator on the training split to obtain the fitted model
lr_model = lin_Reg.fit(dataset=train_df)

In [26]:
# Intercept term of the fitted model
lr_model.intercept

0.18248726674414367

In [27]:
# Fitted coefficients, one per entry of the 'features' vector (same order as the assembled input columns)
lr_model.coefficients

DenseVector([0.0003, 0.0001, 0.0002, -0.6157, 0.484])

In [28]:
# Evaluate the fitted model on the training split.
# NOTE: despite its name, this variable holds the evaluation summary object whose
# metric attributes are read in the next cells, not a bare predictions DataFrame.
training_predictions=lr_model.evaluate(train_df)

In [29]:
# Mean squared error on the training split
training_predictions.meanSquaredError

0.00014312933805064296

In [30]:
# Coefficient of determination (R^2) on the training split
training_predictions.r2

0.8666409386017969

### Evaluate Model

In [31]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and error metrics inspected in the following cells
test_results=lr_model.evaluate(test_df)

In [32]:
# Inspect the first 10 residuals from the test-set evaluation, untruncated
test_results.residuals.show(n=10, truncate=False)

+----------------------+
|residuals             |
+----------------------+
|0.008804031865834283  |
|-0.012636056005557339 |
|-0.005485406974868601 |
|-0.007478062289684506 |
|0.007308779532406662  |
|0.008710810791833057  |
|-0.0024728583509439073|
|0.00662982108683513   |
|-0.011228796192275292 |
|0.010606878459008873  |
+----------------------+
only showing top 10 rows



In [33]:
# Coefficient of determination (R^2) on the test split
test_results.r2

0.8751967254722466

In [34]:
# Root mean squared error (RMSE) on the test split
test_results.rootMeanSquaredError

0.01222266886976356

In [35]:
# Mean squared error (MSE) on the test split
test_results.meanSquaredError

0.00014939363429988724