In [1]:
import pandas as pd
from pyspark import SparkContext, SparkConf

In [2]:
# Configure Spark to run in local mode under a descriptive application name.
conf = SparkConf().setMaster('local').setAppName('AverageFriendsByAge')
# getOrCreate reuses an already-running context instead of constructing a second
# one, so re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". Note that `conf` is only applied
# when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

In [8]:
# Load the friends CSV with pandas. Column names are supplied here rather than
# read from the file; the leading column is used as the index and the other
# three are labelled name / age / n_friends.
df = pd.read_csv(
    '../datasets/fakefriends.csv',
    names=['name', 'age', 'n_friends'],
    index_col=0,
)
# Preview the first few rows.
df.head()

Unnamed: 0,name,age,n_friends
0,Will,33,385
1,Jean-Luc,26,2
2,Hugh,55,221
3,Deanna,40,465
4,Quark,68,21


In [48]:
# Average number of friends per age (pandas reference result for the Spark job).
# n_friends is selected explicitly: the string `name` column cannot be averaged,
# and pandas >= 2.0 raises a TypeError for it instead of silently dropping it.
df.groupby(by='age')[['n_friends']].mean().head()

Unnamed: 0_level_0,n_friends
age,Unnamed: 1_level_1
18,343.375
19,213.272727
20,165.0
21,350.875
22,206.428571


In [46]:
def parse_line(line):
    """Turn one comma-separated record into an ``(age, n_friends)`` pair.

    The third and fourth comma-separated fields are converted to ``int``;
    all other fields are ignored.
    """
    columns = line.split(',')
    return int(columns[2]), int(columns[3])

# Read the CSV as an RDD of text lines and reduce each line to (age, n_friends).
lines = sc.textFile('../datasets/fakefriends.csv')
rdd = lines.map(parse_line)

# For every age, accumulate a (sum of friends, number of people) pair.
# The chain is wrapped in parentheses rather than backslash continuations,
# so there is no trailing continuation character left dangling at its end.
totals_by_age = (
    rdd.mapValues(lambda n_friends: (n_friends, 1))
       .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)

# Mean friends per age = sum / count.
average_by_age = totals_by_age.mapValues(lambda total: total[0] / total[1])

# Collect the per-age averages to the driver and show them, highest average first.
sorted(average_by_age.collect(), key=lambda pair: pair[1], reverse=True)

[(63, 384.0),
 (21, 350.875),
 (18, 343.375),
 (52, 340.6363636363636),
 (33, 325.3333333333333),
 (45, 309.53846153846155),
 (56, 306.6666666666667),
 (42, 303.5),
 (51, 302.14285714285717),
 (65, 298.2),
 (55, 295.53846153846155),
 (44, 282.1666666666667),
 (48, 281.4),
 (64, 281.3333333333333),
 (54, 278.0769230769231),
 (66, 276.44444444444446),
 (68, 269.6),
 (41, 268.55555555555554),
 (31, 267.25),
 (57, 258.8333333333333),
 (61, 256.22222222222223),
 (50, 254.6),
 (40, 250.8235294117647),
 (37, 249.33333333333334),
 (36, 246.6),
 (23, 246.3),
 (34, 245.5),
 (26, 242.05882352941177),
 (30, 235.8181818181818),
 (69, 235.2),
 (24, 233.8),
 (47, 233.22222222222223),
 (43, 230.57142857142858),
 (27, 228.125),
 (46, 223.69230769230768),
 (53, 222.85714285714286),
 (62, 220.76923076923077),
 (59, 220.0),
 (29, 215.91666666666666),
 (67, 214.625),
 (19, 213.27272727272728),
 (35, 211.625),
 (28, 209.1),
 (32, 207.9090909090909),
 (22, 206.42857142857142),
 (60, 202.71428571428572),
 (25