In [1]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("V2Maestros").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [2]:
#loading the auto data file
autodata = sc.textFile(r'C:\Users\jeffnerd\Desktop\spark\auto-data.csv')

In [3]:
autodata.cache()

C:\Users\jeffnerd\Desktop\spark\auto-data.csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [5]:
autodata.count()

198

In [6]:
autodata.first()

'MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE'

In [8]:
autodata.take(5)

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [7]:
for line in autodata.collect():
    print (line)

MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE
subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118
chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151
mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195
toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,37,41,5389
honda,gas,std,two,hatchback,fwd,four,60,5500,38,42,5399
nissan,gas,std,two,sedan,fwd,four,69,5200,31,37,5499
dodge,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
plymouth,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
mazda,gas,std,two,hatchback,fwd,four,68,5000,31,38,6095
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,31,38,6189
dodge,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
plymouth,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
chevrolet,gas,std,two,hatchback,fwd,four,70,5400,38,43,6295
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
dodge,gas,std,two,hatchback,fwd,four,68,5500,31,38,6377

In [9]:
#map data
tsvdata = autodata.map(lambda x : x.replace(",", "\t"))
tsvdata.take(5)

['MAKE\tFUELTYPE\tASPIRE\tDOORS\tBODY\tDRIVE\tCYLINDERS\tHP\tRPM\tMPG-CITY\tMPG-HWY\tPRICE',
 'subaru\tgas\tstd\ttwo\thatchback\tfwd\tfour\t69\t4900\t31\t36\t5118',
 'chevrolet\tgas\tstd\ttwo\thatchback\tfwd\tthree\t48\t5100\t47\t53\t5151',
 'mazda\tgas\tstd\ttwo\thatchback\tfwd\tfour\t68\t5000\t30\t31\t5195',
 'toyota\tgas\tstd\ttwo\thatchback\tfwd\tfour\t62\t4800\t35\t39\t5348']

In [10]:
#filter and create a new rdd
toyotadata = autodata.filter(lambda x : "toyota" in x)
toyotadata.count()

32

In [11]:
#filter map
words = toyotadata.flatMap(lambda line:line.split(","))
words.take(20)

['toyota',
 'gas',
 'std',
 'two',
 'hatchback',
 'fwd',
 'four',
 '62',
 '4800',
 '35',
 '39',
 '5348',
 'toyota',
 'gas',
 'std',
 'two',
 'hatchback',
 'fwd',
 'four',
 '62']

In [12]:
#set operations
words1 = sc.parallelize(["hello", "war", "peace", "world"])
words2 = sc.parallelize(["war" "peace", "universe"])

In [14]:
for unions in words1.union(words2).distinct().collect():
    print(unions)

hello
universe
warpeace
peace
world
war


In [16]:
for intersects in words1.intersection(words2).collect():
    print(intersects)

In [17]:
coldata = sc.parallelize([3,4,4,7,4])

In [18]:
coldata.stats()

(count: 5, mean: 4.4, stdev: 1.3564659966250536, max: 7.0, min: 3.0)

In [23]:
#reducing the data through summing
coldata.reduce(lambda x,y: x+y)

22

In [24]:
#autodata reduce
autodata.reduce(lambda x,y: x if len(x) < len(y) else y)

'bmw,gas,std,two,sedan,rwd,six,182,5400,16,22,41315'

In [25]:
#aggregations
seqOp = (lambda x,y: (x+y))
combOp = (lambda x,y: (x+y))
coldata.aggregate((0), seqOp, combOp)

22

In [26]:
#adding and mutiplication at the same time
seqOp = (lambda x,y: (x[0]+y, x[1]*y))
combOp = (lambda x,y: (x[0]+y[0], x[1]*y[1]))
coldata.aggregate((0,1), seqOp, combOp)

(22, 1344)

In [28]:
#cleanse and transform an RDD
def cleanseRDD(autoStr) :
    if isinstance(autoStr, int) :
        return autoStr
    attList=autoStr.split(",")
    #convert doors to a number
    if attList[3] == "two" :
         attList[3]="2"
    else :
         attList[3]="4"
    #Convert Drive to uppercase
    attList[5] = attList[5].upper()
    return ",".join(attList)
    
cleanedData=autodata.map(cleanseRDD)
cleanedData.collect()


['MAKE,FUELTYPE,ASPIRE,4,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,2,hatchback,FWD,four,69,4900,31,36,5118',
 'chevrolet,gas,std,2,hatchback,FWD,three,48,5100,47,53,5151',
 'mazda,gas,std,2,hatchback,FWD,four,68,5000,30,31,5195',
 'toyota,gas,std,2,hatchback,FWD,four,62,4800,35,39,5348',
 'mitsubishi,gas,std,2,hatchback,FWD,four,68,5500,37,41,5389',
 'honda,gas,std,2,hatchback,FWD,four,60,5500,38,42,5399',
 'nissan,gas,std,2,sedan,FWD,four,69,5200,31,37,5499',
 'dodge,gas,std,2,hatchback,FWD,four,68,5500,37,41,5572',
 'plymouth,gas,std,2,hatchback,FWD,four,68,5500,37,41,5572',
 'mazda,gas,std,2,hatchback,FWD,four,68,5000,31,38,6095',
 'mitsubishi,gas,std,2,hatchback,FWD,four,68,5500,31,38,6189',
 'dodge,gas,std,4,hatchback,FWD,four,68,5500,31,38,6229',
 'plymouth,gas,std,4,hatchback,FWD,four,68,5500,31,38,6229',
 'chevrolet,gas,std,2,hatchback,FWD,four,70,5400,38,43,6295',
 'toyota,gas,std,2,hatchback,FWD,four,62,4800,31,38,6338',
 'dodge,gas,std,2,hatchback

In [29]:
#Sue a function to perform reduce 
def getMPG( autoStr) :
    if isinstance(autoStr, int) :
        return autoStr
    attList=autoStr.split(",")
    if attList[9].isdigit() :
        return int(attList[9])
    else:
        return 0

#find average MPG-City for all cars    
autodata.reduce(lambda x,y : getMPG(x) + getMPG(y)) \
    / (autodata.count()-1)

25.15228426395939

In [33]:
#creating key value pairsin our auto data. key is brand and value is
cylData = autodata.map(lambda x: (x.split(",")[0], x.split(",")[7]))

In [34]:
cylData.take(5)

[('MAKE', 'HP'),
 ('subaru', '69'),
 ('chevrolet', '48'),
 ('mazda', '68'),
 ('toyota', '62')]

In [None]:
cylData.keys().collect(), cylData.values().collect()

In [38]:
#removing the header
header = cylData.first()

In [40]:
cylHPData = cylData.filter(lambda line: line != header)

In [41]:
#Add a count 1 to each record and then reduce to find totals of HP and counts
brandValues=cylHPData.mapValues(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: (int(x[0]) + int(y[0]), \
    x[1] + y[1])) 
brandValues.collect()

[('volvo', (1408, 11)),
 ('saab', (760, 6)),
 ('mercedes-benz', (1170, 8)),
 ('alfa-romero', (376, 3)),
 ('plymouth', (607, 7)),
 ('jaguar', (614, 3)),
 ('nissan', (1846, 18)),
 ('dodge', (675, 8)),
 ('mitsubishi', (1353, 13)),
 ('mazda', (1390, 16)),
 ('chevrolet', (188, 3)),
 ('honda', (1043, 13)),
 ('subaru', (1035, 12)),
 ('porsche', (764, 4)),
 ('mercury', ('175', 1)),
 ('toyota', (2969, 32)),
 ('peugot', (1098, 11)),
 ('isuzu', (168, 2)),
 ('bmw', (1111, 8)),
 ('audi', (687, 6)),
 ('volkswagen', (973, 12))]

In [42]:
#find average by dividing HP total by count total
brandValues.mapValues(lambda x: int(x[0])/int(x[1])). \
    collect()


[('volvo', 128.0),
 ('saab', 126.66666666666667),
 ('mercedes-benz', 146.25),
 ('alfa-romero', 125.33333333333333),
 ('plymouth', 86.71428571428571),
 ('jaguar', 204.66666666666666),
 ('nissan', 102.55555555555556),
 ('dodge', 84.375),
 ('mitsubishi', 104.07692307692308),
 ('mazda', 86.875),
 ('chevrolet', 62.666666666666664),
 ('honda', 80.23076923076923),
 ('subaru', 86.25),
 ('porsche', 191.0),
 ('mercury', 175.0),
 ('toyota', 92.78125),
 ('peugot', 99.81818181818181),
 ('isuzu', 84.0),
 ('bmw', 138.875),
 ('audi', 114.5),
 ('volkswagen', 81.08333333333333)]

In [43]:
#............................................................................
##   Advanced Spark : Accumulators & Broadcast Variables
#............................................................................

#function that splits the line as well as counts sedans and hatchbacks
#Speed optimization

    
#Initialize accumulator
sedanCount = sc.accumulator(0)
hatchbackCount =sc.accumulator(0)


In [44]:
#Set Broadcast variable
sedanText=sc.broadcast("sedan")
hatchbackText=sc.broadcast("hatchback")

In [45]:
def splitLines(line) :

    global sedanCount
    global hatchbackCount

    #Use broadcast variable to do comparison and set accumulator
    if sedanText.value in line:
        sedanCount +=1
    if hatchbackText.value in line:
        hatchbackCount +=1
        
    return line.split(",")


In [47]:
#do the map
splitData=autodata.map(splitLines)

In [48]:
#Make it execute the map (lazy execution)
splitData.count()

198

In [49]:
print (sedanCount, hatchbackCount)

92 67


In [50]:
coldata.getNumPartitions()

2

In [2]:
#sqlcontext
from pyspark.sql import SQLContext
sqlcontext = SQLContext(sc)

In [53]:
empDF = sqlcontext.read.json(r'C:\Users\jeffnerd\Desktop\spark\customerData.json')

In [54]:
empDF.show()

+---+------+------+-----------------+------+
|age|deptid|gender|             name|salary|
+---+------+------+-----------------+------+
| 32|   100|  male|Benjamin Garrison|  3000|
| 40|   200|  male|    Holland Drake|  4500|
| 26|   100|  male|  Burks Velasquez|  2700|
| 51|   100|female|    June Rutledge|  4300|
| 44|   200|  male|    Nielsen Knapp|  6500|
+---+------+------+-----------------+------+



In [55]:
empDF.printSchema()

root
 |-- age: string (nullable = true)
 |-- deptid: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)



In [56]:
empDF.filter(empDF["age"] == 40).show()

+---+------+------+-------------+------+
|age|deptid|gender|         name|salary|
+---+------+------+-------------+------+
| 40|   200|  male|Holland Drake|  4500|
+---+------+------+-------------+------+



In [57]:
empDF.select("name").show()

+-----------------+
|             name|
+-----------------+
|Benjamin Garrison|
|    Holland Drake|
|  Burks Velasquez|
|    June Rutledge|
|    Nielsen Knapp|
+-----------------+



In [58]:
empDF.groupby("gender").count().show()

+------+-----+
|gender|count|
+------+-----+
|female|    1|
|  male|    4|
+------+-----+



In [59]:
empDF.groupby("deptid").agg({"salary":"avg", "age":"max"}).show()

+------+------------------+--------+
|deptid|       avg(salary)|max(age)|
+------+------------------+--------+
|   200|            5500.0|      44|
|   100|3333.3333333333335|      51|
+------+------------------+--------+



In [2]:
y = sc.parallelize([0,1,2,3,4,5,6,7,9,10])
x = sc.parallelize([0,1,2,3,4,5,6,7,8,9,10])

In [3]:
y.count()
y.max()
y.min()
y.stats()

(count: 10, mean: 4.7, stdev: 3.1638584039112754, max: 10.0, min: 0.0)