In [2]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("V2Maestros").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [3]:
#loading the autodata file
autodata = sc.textFile(r'C:\Users\jeffnerd\Desktop\spark\auto-miles-per-gallon.csv')

In [4]:
autodata.cache()

C:\Users\jeffnerd\Desktop\spark\auto-miles-per-gallon.csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [5]:
#removing te header
header = autodata.first()
dataLines = autodata.filter(lambda x: x!=header)

In [6]:
dataLines.count()

398

In [16]:
#convert the RDD into a dense vector. As part of these eercise 1. Remove unwanted columns and change non numeric 
#values to numeric values
import math
from pyspark.mllib.linalg import Vectors

In [22]:
from pyspark.ml.linalg import Vectors

In [23]:
#to impute missing values in horsepower we can set an accumulator
avgHP = sc.broadcast(80.0)

In [24]:
def transformToNumeric( inputStr) :
    global avgHP
    attList=inputStr.split(",")
    
    #Replace ? values with a normal value
    hpValue = attList[3]
    if hpValue == "?":
        hpValue=avgHP.value
       
    #Filter out columns not wanted at this stage
    values= Vectors.dense([ float(attList[0]), \
                     float(attList[1]),  \
                     hpValue,    \
                     float(attList[5]),  \
                     float(attList[6])
                     ])
    return values

In [25]:
#keep only MPG,CYLINDERS,HP ACCELERATION AND MODELYEAR
autoVectors = dataLines.map(transformToNumeric)
autoVectors.collect()

[DenseVector([18.0, 8.0, 130.0, 12.0, 70.0]),
 DenseVector([15.0, 8.0, 165.0, 11.5, 70.0]),
 DenseVector([18.0, 8.0, 150.0, 11.0, 70.0]),
 DenseVector([16.0, 8.0, 150.0, 12.0, 70.0]),
 DenseVector([17.0, 8.0, 140.0, 10.5, 70.0]),
 DenseVector([15.0, 8.0, 198.0, 10.0, 70.0]),
 DenseVector([14.0, 8.0, 220.0, 9.0, 70.0]),
 DenseVector([14.0, 8.0, 215.0, 8.5, 70.0]),
 DenseVector([14.0, 8.0, 225.0, 10.0, 70.0]),
 DenseVector([15.0, 8.0, 190.0, 8.5, 70.0]),
 DenseVector([15.0, 8.0, 170.0, 10.0, 70.0]),
 DenseVector([14.0, 8.0, 160.0, 8.0, 70.0]),
 DenseVector([15.0, 8.0, 150.0, 9.5, 70.0]),
 DenseVector([14.0, 8.0, 225.0, 10.0, 70.0]),
 DenseVector([24.0, 4.0, 95.0, 15.0, 70.0]),
 DenseVector([22.0, 6.0, 95.0, 15.5, 70.0]),
 DenseVector([18.0, 6.0, 97.0, 15.5, 70.0]),
 DenseVector([21.0, 6.0, 85.0, 16.0, 70.0]),
 DenseVector([27.0, 4.0, 88.0, 14.5, 70.0]),
 DenseVector([26.0, 4.0, 46.0, 20.5, 70.0]),
 DenseVector([25.0, 4.0, 87.0, 17.5, 70.0]),
 DenseVector([24.0, 4.0, 90.0, 14.5, 70.0]),
 

In [20]:
#perfoming statiscal analysis
from pyspark.mllib.stat import Statistics 

In [21]:
autoStat = Statistics.colStats(autoVectors)

In [25]:
autoStat.mean()
autoStat.variance()
autoStat.min()
autoStat.max()

array([ 46.6,   8. , 230. ,  24.8,  82. ])

In [18]:
Statistics.corr(autoVectors)

array([[ 1.        , -0.77539629, -0.77463084,  0.42028891,  0.57926713],
       [-0.77539629,  1.        ,  0.84275215, -0.50541949, -0.3487458 ],
       [-0.77463084,  0.84275215,  1.        , -0.68829885, -0.41559383],
       [ 0.42028891, -0.50541949, -0.68829885,  1.        ,  0.28813695],
       [ 0.57926713, -0.3487458 , -0.41559383,  0.28813695,  1.        ]])

In [29]:
from pyspark.ml.stat import Correlation

In [26]:
#transforoming this to a dataframe for input machine learning. drop columns that are not required (low correlation
from pyspark.sql import SQLContext
sqlcontext = SQLContext(sc)

In [27]:
def transformLabeledPoint(instr):
    lp = (float(instr[0]), Vectors.dense([instr[1], instr[2], instr[4]]))
    return lp

In [28]:
autoLp = autoVectors.map(transformLabeledPoint)

In [29]:
autoDf = sqlcontext.createDataFrame(autoLp, ["label", "features"])

In [30]:
autoDf.select("label", "features").show(10)

+-----+----------------+
|label|        features|
+-----+----------------+
| 18.0|[8.0,130.0,70.0]|
| 15.0|[8.0,165.0,70.0]|
| 18.0|[8.0,150.0,70.0]|
| 16.0|[8.0,150.0,70.0]|
| 17.0|[8.0,140.0,70.0]|
| 15.0|[8.0,198.0,70.0]|
| 14.0|[8.0,220.0,70.0]|
| 14.0|[8.0,215.0,70.0]|
| 14.0|[8.0,225.0,70.0]|
| 15.0|[8.0,190.0,70.0]|
+-----+----------------+
only showing top 10 rows



In [35]:
#finding correlation another way
numFeatures = autoDf.take(1)[0].features.size


In [37]:
labelRDD = autoDf.map(lambda lp: float(lp.label))
for i in range(numFeatures):
    featureRDD = autoDf.map(lambda lp: lp.features[i])
    corr = Statistics.corr(labelRDD, featureRDD, "pearson")
    print("%d\t%g", %(i,corr))

SyntaxError: invalid syntax (<ipython-input-37-d8a4a583ee66>, line 5)

In [31]:
#splitting the data into training and test data sets
(trainingData, testData) = autoDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()

31

In [32]:
trainingData.count()

367

In [33]:
testData.count()

31

In [34]:
#Build the model on training data
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

In [35]:
print("coefficients" + str(lrModel.coefficients))

coefficients[-1.8509094846779293,-0.06395447226482245,0.6673981728253853]


In [36]:
print("intercept" + str(lrModel.intercept))

intercept-10.410281024102012


In [38]:
#predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction","label").show(10)

+------------------+-----+
|        prediction|label|
+------------------+-----+
|14.521030147475376| 13.0|
|14.229111236328425| 13.0|
|10.975680722732982| 13.0|
|12.310477068383754| 13.0|
|14.740747014447003| 14.0|
|15.372044165184725| 14.0|
| 7.430331297990584| 14.0|
| 21.82390544192795| 16.0|
|15.911533393480465| 16.0|
| 8.792981093818518| 16.0|
+------------------+-----+
only showing top 10 rows



In [39]:
from pyspark.ml.evaluation import RegressionEvaluator

In [40]:
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)

0.7116810118562784

In [13]:
from pyspark.ml.linalg import Vectors

In [15]:
Vectors.dense([1,2,3,4,5])

DenseVector([1.0, 2.0, 3.0, 4.0, 5.0])