# Tutorial: Taming Big Data With Apache Spark and Python - Hands On!
## Exercise 2.0 - Average Friends By Age

### Setup

FindSpark

Using `findspark` circumvents many issues with your system finding Spark.

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): no visible cell reads from /content/drive (the data comes from the
# cloned resource-spark repo) — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!wget https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.5-bin-hadoop2.7.tgz
!mv spark-2.4.5-bin-hadoop2.7 spark-2.4.5

In [None]:
import os
# Install java
!apt-get update -qq
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null 

!pip install -q findspark
 
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-2.4.5"
!java -version

openjdk version "1.8.0_342"
OpenJDK Runtime Environment (build 1.8.0_342-8u342-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.342-b07, mixed mode)


In [None]:
!git clone https://github.com/bangkit-pambudi/resource-spark.git

Cloning into 'resource-spark'...
remote: Enumerating objects: 38, done.
remote: Counting objects: 100% (38/38), done.
remote: Compressing objects: 100% (36/36), done.
remote: Total 38 (delta 7), reused 0 (delta 0), pack-reused 0
Unpacking objects: 100% (38/38), done.


In [None]:
# Initialise findspark before importing pyspark.
# NOTE(review): presumably this locates Spark through the SPARK_HOME variable set
# in the setup cell and makes pyspark importable — confirm against the findspark docs.
import findspark
findspark.init()

Load Libraries

In [None]:
from pyspark import SparkConf, SparkContext

Set the file path

In [None]:
# Folder holding the input data files inside the cloned resource-spark repository.
# The trailing slash matters: file names are appended by plain string concatenation.
# Assumes the repository was cloned into /content — TODO confirm when running outside Colab.
data_folder = "/content/resource-spark/data/"

Create the Spark Context

In [None]:
# configure your Spark context; master node is local machine
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")

# Create the Spark context, or reuse the one already running in this kernel.
# Calling SparkContext(conf=conf) a second time (e.g. when this cell is re-run)
# raises "Cannot run multiple SparkContexts at once"; getOrCreate avoids that.
sc = SparkContext.getOrCreate(conf=conf)

Define a Parse Line Function

In [None]:
def parseLine(line):
    """Convert one comma-separated record into an ``(age, numFriends)`` tuple.

    The line is split on commas; the third field is read as the age and the
    fourth as the number of friends, both converted with ``int``.

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if either of the two fields is not an integer.
    """
    columns = line.split(',')
    # Tuple elements are evaluated left to right: age first, then friend count.
    return (int(columns[2]), int(columns[3]))

### Load the Data

In [None]:
# Full path of the input file inside the data folder
file_to_open = data_folder + "fakefriends.csv"

# Load the file as an RDD of strings: textFile yields one RDD element per line of the file
lines = sc.textFile(file_to_open)

# Peek at five records. Note that top(5) returns the five largest elements in
# descending order (plain string comparison here), not the first five lines of the file.
lines.top(5)

In [None]:
# parseLine is already defined in the "Define a Parse Line Function" cell;
# redefining it here only risks the two copies drifting apart, so reuse it.
# Each text line becomes an (age, numFriends) pair.
rdd = lines.map(parseLine)

# Show the five largest pairs (tuples compare by age first, then friend count).
rdd.top(5)

In [None]:
# Per age, build (sum of friends, number of people):
#   1. wrap every friend count as (count, 1);
#   2. add the pairs element-wise for records sharing the same age.
totalsByAge = (
    rdd
    .mapValues(lambda friends: (friends, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

totalsByAge.top(5)

Inspect the RDD

In [None]:
# Show five raw records; top(5) returns the five largest strings in descending order,
# which is why the sample starts at id 99 rather than at the top of the file.
lines.top(5)

['99,Keiko,69,491',
 '98,Will,44,178',
 '97,Nerys,69,361',
 '96,Ezri,25,233',
 '95,Odo,29,173']

### Transformations

Map each line to a key-value pair of (age, number of friends).

In [None]:
# Apply parseLine to every text line, producing an RDD of (age, numFriends) pairs
rdd = lines.map(parseLine)

# Show the five largest pairs; tuples compare by age first, then by friend count
rdd.top(5)

[(69, 491), (69, 470), (69, 431), (69, 361), (69, 236)]

We add a count (i.e., 1) for each entry. We then aggregate by age, summing friends and counts.

In [None]:
# Step 1: attach a counter of 1 to every friend count  -> (age, (numFriends, 1))
# Step 2: for each age, add up friend counts and counters -> (age, (totalFriends, people))
totalsByAge = (
    rdd
    .mapValues(lambda friends: (friends, 1))
    .reduceByKey(lambda acc, item: (acc[0] + item[0], acc[1] + item[1]))
)

# Five highest ages with their (totalFriends, people) pairs
totalsByAge.top(5)

[(69, (2352, 10)),
 (68, (2696, 10)),
 (67, (3434, 16)),
 (66, (2488, 9)),
 (65, (1491, 5))]

For each age, we find the average number of friends.

In [None]:
# Each value is a (totalFriends, people) pair; dividing the first element by
# the second gives the average number of friends for that age.
averagesByAge = totalsByAge.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)

# Five highest ages with their averages
averagesByAge.top(5)

[(69, 235.2),
 (68, 269.6),
 (67, 214.625),
 (66, 276.44444444444446),
 (65, 298.2)]

### Actions

Print out the results

In [None]:
# collect() brings every (age, average) pair back to the driver. The pairs arrive
# in no meaningful order, so sort them by age to make the printout readable.
results = sorted(averagesByAge.collect())
for result in results:
    print(result)

(33, 325.3333333333333)
(26, 242.05882352941177)
(55, 295.53846153846155)
(40, 250.8235294117647)
(68, 269.6)
(59, 220.0)
(37, 249.33333333333334)
(54, 278.0769230769231)
(38, 193.53333333333333)
(27, 228.125)
(53, 222.85714285714286)
(57, 258.8333333333333)
(56, 306.6666666666667)
(43, 230.57142857142858)
(36, 246.6)
(22, 206.42857142857142)
(35, 211.625)
(45, 309.53846153846155)
(60, 202.71428571428572)
(67, 214.625)
(19, 213.27272727272728)
(30, 235.8181818181818)
(51, 302.14285714285717)
(25, 197.45454545454547)
(21, 350.875)
(42, 303.5)
(49, 184.66666666666666)
(48, 281.4)
(50, 254.6)
(39, 169.28571428571428)
(32, 207.9090909090909)
(58, 116.54545454545455)
(64, 281.3333333333333)
(31, 267.25)
(52, 340.6363636363636)
(24, 233.8)
(20, 165.0)
(62, 220.76923076923077)
(41, 268.55555555555554)
(44, 282.1666666666667)
(69, 235.2)
(65, 298.2)
(61, 256.22222222222223)
(28, 209.1)
(66, 276.44444444444446)
(46, 223.69230769230768)
(29, 215.91666666666666)
(18, 343.375)
(47, 233.22222222222