#### The input is a CSV file with one person per row: name, age, and number of friends. We want to compute the average number of friends for each age.

In [1]:
from pyspark import SparkConf, SparkContext
import os

In [2]:
# Master URL: "local[*]" uses every available core, "local" uses a single core,
# and "local[N]" uses exactly N cores.
# The app name is what shows up in the Spark UI.
conf = SparkConf().setMaster("local[*]").setAppName("FriendByAge")
# getOrCreate lets this cell be re-run in the same kernel: constructing a second
# SparkContext directly raises an error while one is already active.
# NOTE: if a context already exists it is returned as-is and `conf` is not applied.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# The data directory sits next to the current working directory: <cwd>/../data.
# os.path.join (instead of string concatenation) avoids a doubled separator
# when the working directory is the filesystem root.
datadir = os.path.join(os.path.normpath(os.path.join(os.getcwd(), os.pardir)), 'data')
datafile = os.path.join(datadir, 'fakefriends.csv')

Sample data (columns: row ID, name, age, number of friends):

```
0,Will,33,385
1,Jean-Luc,26,2
2,Hugh,55,221
3,Deanna,40,465
```

In [4]:
def parse_line(line):
    """Extract the (age, number of friends) pair from one CSV record.

    The record is split on commas; the third and fourth columns are
    converted with int() and returned as a 2-tuple. The other columns
    are ignored.
    """
    columns = line.split(',')
    return (int(columns[2]), int(columns[3]))

In [5]:
# Read the CSV as an RDD with one element per line of text.
lines = sc.textFile(datafile)
# Turn each line into an (age, number of friends) pair.
rdd = lines.map(parse_line)

In [6]:
# Show the first five parsed (age, number of friends) pairs as a sanity check.
rdd.take(5)

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [7]:
# Pair every friend count with a 1, so that friend totals and row counts
# can be summed together in a single pass per age.
rdd_mapvalues = rdd.mapValues(lambda friends: (friends, 1))
rdd_mapvalues.take(5)

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]

In [9]:
# For each age, add the (friends, count) pairs element-wise, giving
# (total friends, number of rows) per age.
totalByAge = rdd_mapvalues.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
totalByAge.take(5)

[(26, (4115, 17)),
 (40, (4264, 17)),
 (68, (2696, 10)),
 (54, (3615, 13)),
 (38, (2903, 15))]

In [11]:
# Average per age = total friends / number of rows for that age.
averagebyAge = totalByAge.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
# Collect the per-age averages, ordered by ascending age.
results = averagebyAge.sortByKey(ascending=True).collect()
for result in results:
    print(result)

(18, 343.375)
(19, 213.27272727272728)
(20, 165.0)
(21, 350.875)
(22, 206.42857142857142)
(23, 246.3)
(24, 233.8)
(25, 197.45454545454547)
(26, 242.05882352941177)
(27, 228.125)
(28, 209.1)
(29, 215.91666666666666)
(30, 235.8181818181818)
(31, 267.25)
(32, 207.9090909090909)
(33, 325.3333333333333)
(34, 245.5)
(35, 211.625)
(36, 246.6)
(37, 249.33333333333334)
(38, 193.53333333333333)
(39, 169.28571428571428)
(40, 250.8235294117647)
(41, 268.55555555555554)
(42, 303.5)
(43, 230.57142857142858)
(44, 282.1666666666667)
(45, 309.53846153846155)
(46, 223.69230769230768)
(47, 233.22222222222223)
(48, 281.4)
(49, 184.66666666666666)
(50, 254.6)
(51, 302.14285714285717)
(52, 340.6363636363636)
(53, 222.85714285714286)
(54, 278.0769230769231)
(55, 295.53846153846155)
(56, 306.6666666666667)
(57, 258.8333333333333)
(58, 116.54545454545455)
(59, 220.0)
(60, 202.71428571428572)
(61, 256.22222222222223)
(62, 220.76923076923077)
(63, 384.0)
(64, 281.3333333333333)
(65, 298.2)
(66, 276.4444444444444