# RDDs Vs. DataFrames
A simple tutorial comparing Spark's low-level API (RDDs) with its structured API (in this case, DataFrames).

# Compute Aggregation By Key Using DataFrames

In [24]:
# Import SparkSession, which is needed to run the code below
from pyspark.sql import SparkSession

In [2]:
# import required Python packages
import pandas as pd

In [3]:
# Create a SparkSession, or reuse the active one if it already exists.
# master("local[*]") runs Spark locally; appName sets the application name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Hello Spark")
    .getOrCreate()
)

In [6]:
# IPython help lookup: the trailing "?" displays the documentation of spark.createDataFrame
spark.createDataFrame?

In [25]:
# Build a small Spark DataFrame of (name, age) records
data = [
    ("Brooke", 20),
    ("Denny", 31),
    ("Jules", 30),
    ("TD", 35),
    ("Brooke", 25),
]
colnames = ["name", "age"]
# spark.createDataFrame() turns the list of tuples into a DataFrame whose
# columns take their names from colnames
data_df = spark.createDataFrame(data, schema=colnames)
# Group the rows that share a name and average each group's ages;
# the resulting column is named avg(age)
avg_df = data_df.groupBy("name").mean()
# Print the aggregated rows
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Denny|    31.0|
| Jules|    30.0|
|    TD|    35.0|
+------+--------+



In [28]:
# Compute the per-name average age and convert the result to a pandas
# DataFrame; keep it in `a` so the next cell can display it
a = data_df.groupBy("name").mean().toPandas()

In [29]:
# Display `a` -- presumably the pandas DataFrame of per-name average ages
# produced by toPandas() in the previous cell (TODO confirm where `a` is assigned)
a

Unnamed: 0,name,avg(age)
0,Brooke,22.5
1,Denny,31.0
2,Jules,30.0
3,TD,35.0


# So, how would you do the same thing with RDDs?

In [18]:
# parallelize() creates an RDD from Python objects. It lives on the
# SparkContext, which we can obtain from the SparkSession.
rdd_data = spark.sparkContext.parallelize(data)

# Compute the average age per name with low-level transformations:
#   1. mapValues:   (name, age)            -> (name, (age, 1))
#   2. reduceByKey: sum the ages and the counts for each name
#   3. mapValues:   (name, (total, count)) -> (name, total / count)
# mapValues is used instead of map because only the value changes; the key
# is passed through untouched.
ages_rdd = (
    rdd_data
    .mapValues(lambda age: (age, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .mapValues(lambda total_count: total_count[0] / total_count[1])
)

# collect() returns all elements of the RDD to the driver as a Python list,
# so only use it when the result is small enough to fit in driver memory
ages_rdd_list = ages_rdd.collect()

# What do you think? Which one is easier to work with?