# Pyspark Exploration and Testing
run 'pyspark' from command line to get here

In [1]:
sc = SparkContext.getOrCreate()

In [2]:
sc

In [3]:
sc.parallelize(range(1,10))

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:489

In [4]:
sc.parallelize(range(1,10)).first()

1

In [7]:
list2 = sc.parallelize(range(1,1000)).map(lambda x: x*10)

In [8]:
list2.first()

10

In [9]:
list2.reduce(lambda x,y:x+y)

4995000

In [13]:
# .collect() will make a list of all results
# .take(#) will give you the first # results
# .first() will give the first result
list2.filter(lambda x: x%100==0).take(5)

[100, 200, 300, 400, 500]

In [14]:
rdd1=sc.parallelize(range(1,100))

In [15]:
rdd1.map(lambda x:x*x).sum()

328350

In [16]:
rdd1.map(lambda x:x*x).max()

9801

In [17]:
rdd1.map(lambda x:x*x).min()

1

In [18]:
rdd1.map(lambda x:x*x).mean()

3316.6666666666665

In [19]:
rdd2 = sc.parallelize(range(1,10))

In [20]:
rdd2.filter(lambda x: x%3==0).collect()

[3, 6, 9]

In [21]:
rdd2.filter(lambda x: x%3==0).reduce(lambda x,y: x*y)

162

In [30]:
people=sc.textFile("D:\Springboard\DATA SCIENCE AT SCALE\spark-pycon15\data\people.txt")

In [24]:
people.first()

u'Orlando\tM\t40\tPython'

In [25]:
# split on tab
people.map(lambda x: x.split('\t')).first()

[u'Orlando', u'M', u'40', u'Python']

In [35]:
people=sc.textFile("D:\Springboard\DATA SCIENCE AT SCALE\spark-pycon15\data\people.txt").map(lambda x: x.split('\t'))
people.collect()

[[u'Orlando', u'M', u'40', u'Python'],
 [u'Lina', u'F', u'39', u'C#'],
 [u'John', u'M', u'30', u'Python'],
 [u'Jane', u'F', u'32', u'Python'],
 [u'Michelle', u'F', u'18', u'Python'],
 [u'Daniel', u'M', u'20', u'C#']]

In [33]:
people.map(lambda t: (t[1],1)).first()

(u'M', 1)

In [34]:
# count of genders
people.map(lambda t: (t[1],1)).reduceByKey(lambda x,y:x+y).collect()

[(u'M', 3), (u'F', 3)]

In [36]:
# count of favorite programming language
people.map(lambda t: (t[3],1)).reduceByKey(lambda x,y:x+y).collect()

[(u'Python', 4), (u'C#', 2)]

In [37]:
# look at gender and age for first row
people.map(lambda t: (t[1],int(t[2]))).first()

(u'M', 40)

In [38]:
# look at gender and age for all rows
people.map(lambda t: (t[1],int(t[2]))).collect()

[(u'M', 40), (u'F', 39), (u'M', 30), (u'F', 32), (u'F', 18), (u'M', 20)]

In [None]:
people.map(lambda t: (t[1],int(t[2]))).collect()

In [39]:
import sys
sys.path.insert(0,'D:\Springboard\DATA SCIENCE AT SCALE\spark-pycon15\code\simple')

import person

In [41]:
people=sc.textFile("D:\Springboard\DATA SCIENCE AT SCALE\spark-pycon15\data\people.txt")
people.first()

u'Orlando\tM\t40\tPython'

In [63]:
people=sc.textFile("D:\Springboard\DATA SCIENCE AT SCALE\spark-pycon15\data\people.txt")

In [64]:
people.map(lambda x: person.Person().parse(x))

PythonRDD[93] at RDD at PythonRDD.scala:48

In [65]:
people.first()

u'Orlando\tM\t40\tPython'

In [66]:
sales = sc.textFile("D:\Springboard\DATA SCIENCE AT SCALE\spark-pycon15\data\sales\sales_*.txt")

In [67]:
sales = sales.map(lambda x: x.split('\t'))

In [68]:
sales.map(lambda x: int(x[3])).sum()

569

In [3]:
states = [("AL", "Alabama"),
         ('AK', "Alaska"),
         ("AR", "Arizona")];
populations = [("AL", 4779736),
      ("AK", 710231),
      ("AR", 6392017)];

In [4]:
states_rdd = sc.parallelize(states)
pop_rdd = sc.parallelize(populations)

In [6]:
states_rdd.join(pop_rdd).collect()

[('AK', ('Alaska', 710231)),
 ('AR', ('Arizona', 6392017)),
 ('AL', ('Alabama', 4779736))]

# Data Tables

In [5]:
conf = SparkConf().setAppName()

In [None]:
# data tables have explicit schemas, and a potentially better query optimizer
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

sc = SparkContext()
sqlCtx = SQLContext(sc)

# you can read from json files (one object in each line, need quotes around field names)
# you get a datatable which is a kind of RDD, with schema info, and each tuple is of class Row
people=sqlCtx.jsonFile("../../data/people1.json")

# the show method is similar to collect, but displays nicely
people.show()

# the select method is almost-equivalent to map
# can use strings, or columns from datatable in expressions
people.select("name",people.age+1).show()

# can use filter (same as in rdd)
people.filter(people.age>30)

# can use [] instead of filter, like in pandas
people[people.gender=='F']

# can use groupBy, and then count() or a few other operators
people.groupBy(people.gender).count()

# can use sql ! 

#first register as temp table
people.registerTempTable("people")

sqlCtx.sql("select name, age FROM people").show()

sqlCtx.sql("select gender,avg(age) AS AvgAge FROM people GROUP BY gender").show()
