# Introduction to SparkML

#### Tabnine Extension for intellisense

In [1]:
# One-time setup for the Tabnine autocomplete extension. The two commands
# below are left commented out; uncomment them on a machine where the package
# and its notebook extension have not been installed yet.
# !pip install jupyter-tabnine
# !jupyter nbextension install --py jupyter_tabnine
# Enable the notebook-side and the server-side halves of the extension.
!jupyter nbextension enable --py jupyter_tabnine
!jupyter serverextension enable --py jupyter_tabnine

Enabling notebook extension jupyter_tabnine/main...
      - Validating: [32mOK[0m
Enabling: jupyter_tabnine
- Writing config: /home/ateeb/.jupyter
    - Validating...
      jupyter_tabnine  [32mOK[0m


### Spark Initialization Script

In [2]:
#!/usr/bin/env python3
import glob
import os
import sys

# Locate the Spark installation; fall back to /opt/spark when SPARK_HOME is unset.
os.environ.setdefault('SPARK_HOME', '/opt/spark')

# python variable to store the root path for later reference
SPARK_HOME = os.environ['SPARK_HOME']

# Make the pyspark and py4j packages bundled with Spark importable.
_spark_python = os.path.join(SPARK_HOME, "python")
_spark_lib = os.path.join(_spark_python, "lib")

# The py4j archive name carries a version number that differs between Spark
# releases, so discover it instead of pinning one. If nothing matches, fall
# back to the previously hard-coded name so behaviour is unchanged.
_py4j_candidates = sorted(glob.glob(os.path.join(_spark_lib, "py4j-*-src.zip")))
_py4j_zip = (
    _py4j_candidates[-1]
    if _py4j_candidates
    else os.path.join(_spark_lib, 'py4j-0.10.9-src.zip')
)

# Each entry is pushed to the front of sys.path, so the last one listed here
# ends up with the highest import priority.
for _entry in (
    _spark_python,
    _spark_lib,
    _py4j_zip,
    os.path.join(_spark_lib, 'pyspark.zip'),
):
    sys.path.insert(0, _entry)

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
# 'local' runs Spark in-process with a single worker thread
# (use 'local[2]' if two threads are actually wanted).
conf.setMaster('local')
# Key must be spelled 'executor'; a misspelled key is not the executor-memory setting.
conf.set('spark.executor.memory', '1g')
conf.set('spark.cores.max', '2')

# give name to your spark application
conf.setAppName("SparkMLApp")

# Create the Spark context. getOrCreate() hands back the already-running
# context when this cell is executed again, instead of raising the
# "cannot run multiple SparkContexts" error.
sc = SparkContext.getOrCreate(conf=conf)

#### Once the script above has run, the Spark web UI for this application is available at http://localhost:4040

## Linear Regression
**Problem Statement**

The input data set contains details of various car models. Based on
this information, the goal is to build a model that predicts the
miles per gallon (MPG) of a given car model.

Techniques Used:

1. Linear Regression ( multi-variate)
2. Data Imputation - replacing missing (non-numeric `?`) values with a numeric default
3. Variable Reduction - picking up only relevant features

### Loading Data

In [50]:
# Read the CSV as an RDD of raw text lines and mark it for caching,
# since it is reused by the cells that follow.
autoData = sc.textFile("data/auto-miles-per-gallon.csv").cache()

# Peek at the first three lines (header + two records).
autoData.take(3)

['MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME',
 '18,8,307,130,3504,12,70,chevrolet chevelle malibu',
 '15,8,350,165,3693,11.5,70,buick skylark 320']

In [51]:
# Remove the header row. Compare each line against the actual header text
# rather than testing for the substring "MPG", which would also discard any
# data row that happens to contain "MPG" (for example in the car name).
header = autoData.first()
dataLines = autoData.filter(lambda line: line != header)
dataLines.count()

398

### Cleaning data

In [52]:
import math
from pyspark.ml.linalg import Vectors

# Placeholder horsepower substituted for missing ("?") entries. This is a
# fixed constant, not a mean computed from the data. It is wrapped in a
# broadcast variable so the mapped function reads it via `.value`.
avgHP = sc.broadcast(80.0)


def CleanupData(inputStr):
    """Parse one CSV record into a dense numeric vector.

    Args:
        inputStr: a comma-separated data line laid out as
            MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME.

    Returns:
        DenseVector of [MPG, CYLINDERS, HORSEPOWER, ACCELERATION, MODELYEAR].
        A horsepower of "?" is replaced by the broadcast default `avgHP`.

    Raises:
        ValueError: if any of the selected fields (other than a "?"
            horsepower) is not parseable as a float.
        IndexError: if the line has fewer than seven fields.
    """
    attList = inputStr.split(",")

    # Replace the "?" marker with the default; otherwise convert explicitly so
    # the vector is built from floats only instead of a str/float mix.
    hpValue = attList[3].strip()
    horsepower = avgHP.value if hpValue == "?" else float(hpValue)

    # Create a row with cleaned up and converted data
    return Vectors.dense([
        float(attList[0]),  # MPG
        float(attList[1]),  # CYLINDERS
        horsepower,         # HORSEPOWER
        float(attList[5]),  # ACCELERATION
        float(attList[6]),  # MODELYEAR
    ])


# Run map for cleanup
autoVectors = dataLines.map(CleanupData)
autoVectors.take(3)

[DenseVector([18.0, 8.0, 130.0, 12.0, 70.0]),
 DenseVector([15.0, 8.0, 165.0, 11.5, 70.0]),
 DenseVector([18.0, 8.0, 150.0, 11.0, 70.0])]

In [None]:
# from pyspark.ml.stat import mean
# from pyspark.ml.stat import Summarizer
# autoStats = Statistics.colStats(autoVectors)
# print("Mean:\n", autoStats.mean())
# print("Variance:\n", autoStats.variance())
# print("Max:\n", autoStats.max())
# print("Min:\n", autoStats.min())

In [62]:
from pyspark.sql import SQLContext

# SQL entry point bound to the running Spark context.
sqlContext = SQLContext(sc)


def transformToLabelPoint(inputStr):
    """Split one cleaned vector into a ``(label, features)`` pair.

    Element 0 (MPG) becomes the label. Elements 1, 2 and 4 (cylinders,
    horsepower and model year in the cleaned vector) are packed into a dense
    feature vector; element 3 (acceleration) is left out.
    """
    label = float(inputStr[0])
    features = Vectors.dense([float(inputStr[pos]) for pos in (1, 2, 4)])
    return (label, features)


# Convert every cleaned vector, then expose the result as a two-column DataFrame.
autoLp = autoVectors.map(transformToLabelPoint)
autoDf = sqlContext.createDataFrame(autoLp, schema=["label", "features"])
autoDf.select("label", "features").show(5)

+-----+----------------+
|label|        features|
+-----+----------------+
| 18.0|[8.0,130.0,70.0]|
| 15.0|[8.0,165.0,70.0]|
| 18.0|[8.0,150.0,70.0]|
| 16.0|[8.0,150.0,70.0]|
| 17.0|[8.0,140.0,70.0]|
+-----+----------------+
only showing top 5 rows



In [63]:
# Pearson correlation between the label and each individual feature.
# Statistics lives in the RDD-based mllib package and must be imported here;
# without this import the cell fails with NameError on a fresh kernel.
from pyspark.mllib.stat import Statistics

numFeatures = autoDf.first().features.size
labelRDD = autoDf.rdd.map(lambda lp: float(lp.label))

for i in range(numFeatures):
    # Bind the current index as a default argument so the lambda does not
    # depend on the loop variable's value at the time Spark evaluates it.
    featuresRDD = autoDf.rdd.map(lambda lp, idx=i: float(lp.features[idx]))
    corr = Statistics.corr(labelRDD, featuresRDD, 'pearson')
    print("%d\t%g" % (i, corr))

0	-0.775396
1	-0.774631
2	0.579267


In [64]:
# 90/10 train/test split. A fixed seed makes the split, and therefore the
# fitted coefficients and the evaluation score below, repeatable across runs.
(trainingData, testData) = autoDf.randomSplit([0.9, 0.1], seed=42)
print(trainingData.count(), testData.count())

351 47


In [65]:
# build a Linear Regression Model
from pyspark.ml.regression import LinearRegression
# Configure the estimator (at most 10 iterations) and fit it on the
# training split; it reads the "label" and "features" columns.
lr = LinearRegression(
    maxIter=10,
)
lrModel = lr.fit(trainingData)

# Show the fitted parameters (print() applies str() to each argument).
print("Intercept: ", lrModel.intercept)
print("Coefficients: ", lrModel.coefficients)

Intercept:  -10.591761761379919
Coefficients:  [-1.9753840213069946,-0.05686563255267235,0.6704506882901058]


In [86]:
# Score the held-out rows with the fitted model, then display the
# predicted value next to the true label and the input features.
pred = lrModel.transform(testData)
pred.select(["prediction", "label", "features"]).show()

+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
|12.312252453860824| 11.0|[8.0,180.0,73.0]|
|14.485083393204333| 13.0|[8.0,130.0,72.0]|
| 16.02957349531131| 13.0|[8.0,150.0,76.0]|
|14.757474653625737| 14.0|[8.0,137.0,73.0]|
|15.472854072126546| 14.0|[8.0,148.0,75.0]|
|14.018221430440997| 14.0|[8.0,150.0,73.0]|
|11.824335565570697| 14.0|[8.0,165.0,71.0]|
|11.153884877280587| 15.0|[8.0,165.0,70.0]|
|15.359122807021203| 16.0|[8.0,150.0,75.0]|
|19.178238211235076| 17.0|[8.0,130.0,79.0]|
|22.925417528322292| 17.5|[6.0,110.0,77.0]|
| 17.93913119741825| 17.5|[8.0,140.0,78.0]|
|19.471369724108392| 18.0|[6.0,100.0,71.0]|
|21.198393626215342| 18.0|[6.0,105.0,74.0]|
|18.902713398581668| 18.0|[6.0,110.0,71.0]|
| 17.99599682997092| 18.1|[8.0,139.0,78.0]|
|23.607805118954367| 18.5| [6.0,98.0,77.0]|
|27.456778798804976| 19.0| [4.0,88.0,76.0]|
|22.721828802795528| 19.0| [6.0,90.0,75.0]|
|19.471369724108392| 19.0|[6.0,1

### Evaluating the accuracy of the model

In [88]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the predictions with the r2 metric, comparing the "prediction"
# column against "label"; the score is printed scaled by 100.
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='label',
    predictionCol='prediction',
)
print("r2 Accuracy: ", 100 * evaluator.evaluate(pred))

r2 Accuracy:  61.25065305605055
