Install PySpark and the Java runtime that Spark needs in order to run:

In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Initialize the Spark context and the Spark session:

In [None]:
import os
# Tell PySpark which JDK to launch. This has to be set before the Spark
# context is created, because that is when the JVM process is started.
# The path is presumably where the openjdk-8-jdk-headless package installs
# on the Colab image -- verify if the runtime changes.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Single local worker thread; the app name is what shows up in the Spark UI.
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
# getOrCreate() reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed.
sc = SparkContext.getOrCreate(conf=conf)
# The session attaches to the context created (or reused) above.
spark = SparkSession.builder.getOrCreate()

Load the dataset:

In [None]:
# Read the fake-friends CSV from the mounted Drive folder.
# header=True takes the column names from the first line of the file; no
# schema is supplied, so every column is loaded as a string.
friends_data = spark.read.csv(
    path='/content/drive/MyDrive/spark_tutorials/spark_datasets/fakefriends-header.csv',
    header=True,
)
friends_data.printSchema()

root
 |-- userID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- friends: string (nullable = true)



Compute the average number of friends per age using the RDD API:

In [None]:
# Turn every row into (age, (friends, 1)).
# Columns 2 and 3 are `age` and `friends`; both arrive as strings and are
# converted to int here. The trailing 1 is a per-row counter so that the
# totals and the row counts can be summed together in one reduce step.
d = friends_data.rdd.map(lambda row: (int(row[2]), (int(row[3]), 1)))
d.take(3)

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1))]

In [None]:
# Add up the (friends_total, row_count) pairs of each age, then divide to get
# the mean number of friends, rounded to two decimals.
# The results are bound to new names instead of overwriting `d`, so `d` stays
# the pair RDD and both this cell and the previous one can be re-run safely.
avg_friends_rdd = (
    d.reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
     .mapValues(lambda total_count: round(total_count[0] / total_count[1], 2))
)
# Sort the (age, average) tuples ascending and bring them back to the driver.
avg_friends_sorted = avg_friends_rdd.sortBy(lambda pair: pair).collect()
avg_friends_sorted

The same computation using the Spark DataFrame API:

In [None]:
from pyspark.sql import *
# Import the functions module under an alias instead of with a wildcard:
# `from pyspark.sql.functions import *` replaces Python's built-in round, sum,
# min, max and abs with Spark column functions, which breaks the plain
# round(...) call in the RDD cell when that cell is re-run afterwards.
from pyspark.sql import functions as F

# Both columns were loaded as strings, so cast them explicitly: grouping and
# sorting then use numeric ages (matching the int keys of the RDD version)
# and the average does not depend on implicit string-to-number coercion.
avg_friends_by_age = (
    friends_data
    .groupBy(F.col("age").cast("int").alias("age"))
    .agg(F.round(F.avg(F.col("friends").cast("int")), 2).alias("AverageFriends"))
    .sort(F.asc("age"))
)
avg_friends_by_age.show()

+---+--------------+
|age|AverageFriends|
+---+--------------+
| 18|        343.38|
| 19|        213.27|
| 20|         165.0|
| 21|        350.88|
| 22|        206.43|
| 23|         246.3|
| 24|         233.8|
| 25|        197.45|
| 26|        242.06|
| 27|        228.13|
| 28|         209.1|
| 29|        215.92|
| 30|        235.82|
| 31|        267.25|
| 32|        207.91|
| 33|        325.33|
| 34|         245.5|
| 35|        211.63|
| 36|         246.6|
| 37|        249.33|
+---+--------------+
only showing top 20 rows

