# Spark API Exercises

In [24]:
import pandas as pd
import numpy as np
import pyspark 
import pydataset

from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

In [2]:
# set up the Spark environment: getOrCreate() hands back the session used by every later cell
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

Save this work in your spark-exercises repo. Then add, commit, and push your changes.

Create a Jupyter notebook or Python script named `spark101` for this exercise.

1. Create a Spark dataframe that contains your favorite programming languages.

    - The name of the column should be language
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe


In [6]:
# mapping of column name -> list of favorite programming languages
Dict = dict(language=['python', 'php', 'C', 'C++', 'java', 'SQL'])

# build a one-column pandas dataframe from that mapping
fav_languages = pd.DataFrame(data=Dict)

In [7]:
# convert the pandas dataframe of languages into a spark dataframe
df = spark.createDataFrame(data=fav_languages)

In [15]:
# look at shape: (number of rows, number of columns)
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(6, 1)


In [9]:
# preview the first 5 records
df.show(n=5)

+--------+
|language|
+--------+
|  python|
|     php|
|       C|
|     C++|
|    java|
+--------+
only showing top 5 rows



2. Load the mpg dataset as a spark dataframe.

    a. For each vehicle, create 1 column of output that contains a message like the one below:

    `The 1999 audi a4 has a 4 cylinder engine.`
    
    b. Transform the trans column so that it only contains either manual or auto.

In [22]:
# load the mpg dataset from pydataset
mpg = pydataset.data('mpg')

# convert it into a spark dataframe
df = spark.createDataFrame(data=mpg)

In [23]:
# preview the first 5 vehicles
df.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [29]:
# preview the sentence column on its own: bare strings name columns, lit() wraps literal text
description_parts = [
    lit('The '), 'year',
    lit(' '), 'manufacturer',
    lit(' '), 'model',
    lit(' has a '), 'cyl',
    lit(' cylinder engine.'),
]
df.select(concat(*description_parts).alias('description'))

DataFrame[description: string]

In [30]:
# Add the description sentence as a new column. withColumn replaces an existing
# column of the same name, so re-running this cell does not append a second
# 'description' column the way select('*', ...) would.
df = df.withColumn(
    'description',
    concat(lit('The '), 'year', lit(' '), 'manufacturer',
           lit(' '), 'model', lit(' has a '), 'cyl', lit(' cylinder engine.')),
)

df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|         description|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|The 1999 audi a4 ...|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|The 1999 audi a4 ...|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|The 2008 audi a4 ...|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|The 2008 audi a4 ...|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|The 1999 audi a4 ...|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------+
only showing top 5 rows



3. Load the tips dataset as a spark dataframe.

    - What percentage of observations are smokers?
    - Create a column that contains the tip percentage
    - Calculate the average tip percentage for each combination of sex and smoker.

4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    - Convert the temperatures to fahrenheit.
    - Which month has the most rain, on average?
    - Which year was the windiest?
    - What is the most frequent type of weather in January?
    - What is the average high and low temperature on sunny days in July in 2013 and 2014?
    - What percentage of days were rainy in Q3 of 2015?
    - For each year, find what percentage of days it rained (had non-zero precipitation).