In [1]:
import os
# Python interpreter PySpark should launch for its worker processes.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.environ['PYSPARK_PYTHON'] = '/nfshome/lj1230/.conda/envs/myEnv/bin/python3.5'

from pyspark import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() makes this cell safe to re-run: constructing SparkContext(...)
# directly fails if a context already exists in the kernel, whereas the
# builder reuses the running session (or starts a local one named 'pyspark').
spark = (
    SparkSession.builder
    .master('local')
    .appName('pyspark')
    .getOrCreate()
)
sc = spark.sparkContext

# Trivials

In [2]:
# Read the book as an RDD of text lines (one element per line).
rdd = sc.textFile('book.txt')

In [3]:
# Peek at the first line to confirm the file was read.
rdd.first()

'The Project Gutenberg EBook of English Coins and Tokens, by '

In [4]:
# Build a small RDD from an in-memory Python list.
rdd0 = sc.parallelize([1, 2, 3, 4, 5])

In [5]:
# first() returns the first element of the RDD.
rdd0.first()

1

In [6]:
# Split every line on whitespace; map() keeps one word list per input line.
# Show the result for the first three lines.
(
    rdd
    .map(lambda text: text.split())
    .take(3)
)

[['The',
  'Project',
  'Gutenberg',
  'EBook',
  'of',
  'English',
  'Coins',
  'and',
  'Tokens,',
  'by'],
 ['Llewellynn', 'Jewitt', 'and', 'Barclay', 'V.', 'Head'],
 []]

In [7]:
# flatMap() flattens the per-line word lists into one RDD of words.
(
    rdd
    .flatMap(lambda text: text.split())
    .take(3)
)

['The', 'Project', 'Gutenberg']

In [9]:
# Turn every word into a (word, 1) pair, ready for counting by key.
(
    rdd
    .flatMap(lambda text: text.split())
    .map(lambda word: (word, 1))
    .take(3)
)

[('The', 1), ('Project', 1), ('Gutenberg', 1)]

In [10]:
# groupByKey() collects all the 1s of each word into an iterable per key.
(
    rdd
    .flatMap(lambda text: text.split())
    .map(lambda word: (word, 1))
    .groupByKey()
    .take(3)
)

[('CÆSAR', <pyspark.resultiterable.ResultIterable at 0x7f391ac337b8>),
 ('fellows', <pyspark.resultiterable.ResultIterable at 0x7f391ac33400>),
 ('die-sinker’s', <pyspark.resultiterable.ResultIterable at 0x7f391ac337f0>)]

In [11]:
# reduceByKey() sums the 1s of each word, giving (word, count) pairs.
(
    rdd
    .flatMap(lambda text: text.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .take(3)
)

[('CÆSAR', 1), ('fellows', 1), ('die-sinker’s', 1)]

In [17]:
# Word count: (word, number of occurrences) for every distinct token.
rdd1 = (
    rdd
    .flatMap(lambda text: text.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)
# Five most frequent words, ranked by their count.
rdd1.top(5, key=lambda pair: pair[1])

[('the', 2104), ('of', 1631), ('and', 1277), ('a', 875), ('or', 710)]

# SAT - Task1: by RDD

In [21]:
# File names of the two input CSV datasets used by the SAT tasks below.
SAT_FN = "SAT_Results.csv"  # per-school SAT results
HSD_FN = "DOE_High_School_Directory_2014-2015.csv"  # high school directory

In [22]:
# Raw SAT results as an RDD of CSV lines; the first line is the header.
sat = sc.textFile(SAT_FN)
sat.first()

'DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score'

In [23]:
# Number the header columns so fields can be addressed by index later on.
[(position, column) for position, column in enumerate(sat.first().split(","))]

[(0, 'DBN'),
 (1, 'SCHOOL NAME'),
 (2, 'Num of SAT Test Takers'),
 (3, 'SAT Critical Reading Avg. Score'),
 (4, 'SAT Math Avg. Score'),
 (5, 'SAT Writing Avg. Score')]

In [24]:
def extractScore(partitionID, rows):
    """Parse SAT result lines into ``(DBN, (math_avg, num_takers))`` pairs.

    Written for ``RDD.mapPartitionsWithIndex``.

    Args:
        partitionID: Index of the partition being processed. Partition 0 is
            expected to begin with the CSV header line, which is discarded.
        rows: Iterator over the raw CSV lines of this partition.

    Yields:
        ``(fields[0], (int(fields[4]), int(fields[2])))`` for every row whose
        test-taker count (column 2) and math average (column 4) are integers.
        Rows with too few columns or non-numeric values are skipped.
    """
    import csv  # kept local so the function is self-contained

    if partitionID == 0:
        # Drop the header row. The default value keeps an empty first
        # partition from raising StopIteration inside this generator, which
        # Python 3.7+ turns into a RuntimeError (PEP 479).
        next(rows, None)

    for fields in csv.reader(rows):
        if len(fields) < 5:
            # Blank or truncated line: nothing to extract.
            continue
        try:
            takers = int(fields[2])
            math_avg = int(fields[4])
        except ValueError:
            # Covers the "s" placeholder (presumably a suppressed value) as
            # well as any other non-numeric entry.
            continue
        yield (fields[0], (math_avg, takers))

# Pair RDD keyed by DBN: (DBN, (math average, number of test takers)).
satScores = sat.mapPartitionsWithIndex(extractScore)
satScores.take(5)
# DBN, Math Avg., number of test takers

[('02M047', (400, 16)),
 ('21K410', (437, 475)),
 ('30Q301', (440, 98)),
 ('17K382', (374, 59)),
 ('18K637', (381, 35))]

In [25]:
# Raw high school directory as an RDD of CSV lines; cache() marks it for reuse.
schools = sc.textFile(HSD_FN).cache()
# Uncomment to list the column indexes of the header row:
# list(enumerate(schools.first().split(",")))

In [26]:
def extractSchool(partitionID, rows):
    """Select large schools from directory lines as ``(DBN, borough)`` pairs.

    Written for ``RDD.mapPartitionsWithIndex``.

    Args:
        partitionID: Index of the partition being processed. Partition 0 is
            expected to begin with the CSV header line, which is discarded.
        rows: Iterator over the raw CSV lines of this partition.

    Yields:
        ``(fields[0], fields[2])`` for every row that has exactly 58 columns
        and whose column 17 is an all-digit value greater than 500.
    """
    import csv  # kept local so the function is self-contained

    if partitionID == 0:
        # Drop the header row. The default value keeps an empty first
        # partition from raising StopIteration inside this generator, which
        # Python 3.7+ turns into a RuntimeError (PEP 479).
        next(rows, None)

    for fields in csv.reader(rows):
        # Only complete 58-column rows are trusted; presumably shorter rows
        # are fragments of records whose quoted text spans several lines.
        if len(fields) != 58:
            continue
        # Column 17 is presumably the total student count — TODO confirm
        # against the directory header.
        enrolment = fields[17]
        if enrolment.isdigit() and int(enrolment) > 500:
            yield (fields[0], fields[2])

# Pair RDD keyed by DBN: (DBN, borough) for the schools kept by extractSchool.
largeSchools = schools.mapPartitionsWithIndex(extractSchool)
largeSchools.take(5)
# DBN, borough

[('01M450', 'Manhattan'),
 ('01M539', 'Manhattan'),
 ('01M696', 'Manhattan'),
 ('02M374', 'Manhattan'),
 ('02M400', 'Manhattan')]

In [27]:
# Join on DBN: (DBN, (borough, (math average, number of test takers))).
br_sat = largeSchools.join(satScores)

In [28]:
# Drop the DBN key and show a sorted sample of (borough, (math average, test takers)).
sorted(br_sat.values().take(5))

[('Brooklyn', (374, 59)),
 ('Brooklyn', (409, 88)),
 ('Queens', (445, 68)),
 ('Queens', (449, 395)),
 ('Queens', (492, 135))]

In [30]:
# Sample of the joined records, still keyed by DBN.
br_sat.take(5)

[('17K382', ('Brooklyn', (374, 59))),
 ('28Q310', ('Queens', (445, 68))),
 ('32K545', ('Brooklyn', (409, 88))),
 ('30Q445', ('Queens', (449, 395))),
 ('30Q575', ('Queens', (492, 135)))]

In [31]:
# Re-key by borough and turn (average, takers) into (average * takers, takers)
# so that scores can be summed with the correct weight.
(
    br_sat
    .values()
    .mapValues(lambda stat: (stat[0] * stat[1], stat[1]))
    .take(5)
)

[('Brooklyn', (22066, 59)),
 ('Queens', (30260, 68)),
 ('Brooklyn', (35992, 88)),
 ('Queens', (177355, 395)),
 ('Queens', (66420, 135))]

In [32]:
# Per borough: (sum of weighted scores, sum of test takers).
(
    br_sat
    .values()
    .mapValues(lambda stat: (stat[0] * stat[1], stat[1]))
    .reduceByKey(lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1]))
    .take(5)
)

[('Brooklyn', (4544126, 9322)),
 ('Manhattan', (3206992, 6228)),
 ('Queens', (5190534, 10942)),
 ('Staten Island', (1406967, 2944)),
 ('Bronx', (1619364, 3444))]

In [103]:
# Full RDD pipeline: test-taker-weighted average math score per borough.
(
    largeSchools
    .join(satScores)
    .values()
    .mapValues(lambda stat: (stat[0] * stat[1], stat[1]))
    .reduceByKey(lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1]))
    .mapValues(lambda sums: sums[0] / sums[1])
    .take(5)
)

[('Brooklyn', 487.46256168204246),
 ('Manhattan', 514.9312780989081),
 ('Queens', 474.3679400475233),
 ('Staten Island', 477.9099864130435),
 ('Bronx', 470.198606271777)]

# SAT - Task1: by dataframe

In [114]:
# Header line again, as a reminder of the column names used in the DataFrame version.
sat.first()

'DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score'

In [122]:
# Load the SAT results; the header row supplies the column names.
dfScores = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load(SAT_FN)
)

# Keep DBN plus the integer-cast math average and taker count. The backticks
# are needed because the column name contains a dot. na.drop() then removes
# rows with a null in any selected column.
mathScore = dfScores['`SAT Math Avg. Score`'].cast('int').alias('score')
numTakers = dfScores['Num of SAT Test Takers'].cast('int').alias('ntakers')
dfScores = dfScores.select("DBN", mathScore, numTakers).na.drop()

# total = score * ntakers, i.e. the school's summed math score.
dfScores = (
    dfScores
    .withColumn("total", dfScores.score * dfScores.ntakers)
    .select("DBN", "total", "ntakers")
)
dfScores.head()

Row(DBN='02M047', total=6400, ntakers=16)

In [124]:
# Load the school directory, then keep (dbn, boro) for schools with more than
# 500 students.
directory = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(HSD_FN)
)
dfSchools = (
    directory
    .where(directory.total_students > 500)
    .select("dbn", "boro")
)
dfSchools.head()

Row(dbn='01M450', boro='Manhattan')

In [125]:
# Inner-join large schools with their scores, sum per borough, then divide the
# summed score by the summed number of test takers.
joined = dfSchools.join(dfScores, on=dfSchools.dbn == dfScores.DBN, how="inner")
totals = joined.groupBy("boro").sum("total", "ntakers")
dfResults = totals.select(
    "boro",
    (totals["sum(total)"] / totals["sum(ntakers)"]).alias("avg"),
)
dfResults.show()

+-------------+------------------+
|         boro|               avg|
+-------------+------------------+
|       Queens| 474.3679400475233|
|     Brooklyn|487.46256168204246|
|Staten Island| 477.9099864130435|
|    Manhattan| 514.9312780989081|
|        Bronx|  470.198606271777|
+-------------+------------------+



# SAT - Task 2