#### Input is a list of names, age, and number of friends. We want to determine the average number of friends by age.

In [1]:
from pyspark import SparkConf, SparkContext
import os, collections

In [2]:
# Set spark to run on all cores "local[*]", set to "local" for 1 core, or replace with a number to specify the exact number
# app name will show up in UI
conf = SparkConf().setMaster("local[*]").setAppName("FriendByAge")
sc = SparkContext(conf = conf)

In [3]:
datadir = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir), 'data')
datafile = os.path.join(datadir,'fakefriends.csv')

sample data:

row, name, age, number of friends

`0,Will,33,385
1,Jean-Luc,26,2
2,Hugh,55,221
3,Deanna,40,465`

In [9]:
def parse_line(line):
    """Output tuple of age, number of friends"""
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [10]:
lines = sc.textFile(datafile)
rdd = lines.map(parse_line)

In [11]:
rdd.take(5)

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [13]:
rdd_mapvalues = rdd.mapValues(lambda x: (x, 1))
rdd_mapvalues.take(5)

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]

In [16]:
totalFriends = rdd_mapvalues.reduceByKey(lambda x, y: x +y)
totalFriends.take(5)

[(26,
  (2,
   1,
   281,
   1,
   84,
   1,
   282,
   1,
   381,
   1,
   145,
   1,
   345,
   1,
   293,
   1,
   298,
   1,
   492,
   1,
   269,
   1,
   254,
   1,
   7,
   1,
   383,
   1,
   124,
   1,
   391,
   1,
   84,
   1)),
 (40,
  (465,
   1,
   254,
   1,
   459,
   1,
   407,
   1,
   18,
   1,
   284,
   1,
   389,
   1,
   349,
   1,
   406,
   1,
   198,
   1,
   172,
   1,
   33,
   1,
   56,
   1,
   7,
   1,
   261,
   1,
   286,
   1,
   220,
   1)),
 (68,
  (21,
   1,
   264,
   1,
   112,
   1,
   490,
   1,
   481,
   1,
   217,
   1,
   189,
   1,
   206,
   1,
   293,
   1,
   423,
   1)),
 (54,
  (307,
   1,
   253,
   1,
   75,
   1,
   440,
   1,
   7,
   1,
   441,
   1,
   235,
   1,
   369,
   1,
   397,
   1,
   462,
   1,
   72,
   1,
   442,
   1,
   115,
   1)),
 (38,
  (380,
   1,
   459,
   1,
   2,
   1,
   173,
   1,
   76,
   1,
   180,
   1,
   96,
   1,
   410,
   1,
   454,
   1,
   95,
   1,
   38,
   1,
   203,
   1,
   143,
   1,
   3