In [None]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook under the application name
# 'helloSpark'; getOrCreate() hands back an existing session when there is one.
spark = SparkSession.builder.appName('helloSpark').getOrCreate()

Further info on Spark sessions:  
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/spark_session.html

In [None]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [None]:
# Generate synthetic [name, salary] rows for the analysis below.

import random

SEED = 42         # fixed seed so a fresh "Restart & Run All" yields the same rows
N_ROWS = 500_000  # number of rows to generate

names = ["Alice", "Ben", "Charles", "Daisy"]
start_range = 900
end_range = 5000  # random.randint includes both bounds

# A dedicated generator keeps the data reproducible without touching the
# global state of the `random` module.
rng = random.Random(SEED)
python_data = [
    [rng.choice(names), rng.randint(start_range, end_range)]
    for _ in range(N_ROWS)
]

In [None]:
# spark.createDataFrame turns an in-memory Python object (list, dict) into a
# Spark DataFrame. Passing an explicit schema string gives the columns readable
# names and spares Spark from having to infer the schema.
schema = "name STRING, salary INT"

df = spark.createDataFrame(data=python_data, schema=schema)

In [None]:
# .show() prints the first rows of the DataFrame; the arguments below are its
# defaults (20 rows, long cell values truncated), written out for clarity.
df.show(n=20, truncate=True)

In [None]:
# Average salary per name.
df_new = df.groupBy("name").avg("salary")

In [None]:
# Print the aggregated rows (arguments are .show()'s defaults, made explicit).
df_new.show(n=20, truncate=True)

In [None]:
# Many column-level helpers live in the pyspark.sql.functions module,
# conventionally imported as F.
import pyspark.sql.functions as F

# Show the raw average next to a rounded copy aliased as "average".
df_new.select(
    "name",
    "avg(salary)",
    F.round("avg(salary)").alias("average"),
).show()

The following cells repeat the same aggregation with the popular Python package `pandas`, for comparison.

In [None]:
import pandas as pd

# Load the same Python rows into a pandas DataFrame with labelled columns.
pd_df = pd.DataFrame(data=python_data, columns=["name", "salary"])

In [None]:
# Evaluate the pandas DataFrame as the cell's last expression so the notebook displays it.
pd_df

In [None]:
# Average salary per name. The column is selected explicitly because the first
# positional parameter of GroupBy.mean() is `numeric_only`, not a column name:
# mean("salary") would pass the string as that flag instead of choosing a column.
# The double brackets keep the result a DataFrame with a single "salary" column.
pd_df.groupby("name")[["salary"]].mean()

Which of the two approaches (Spark or pandas) seemed to be faster?  
Why?