# Spark Exercise API

In [1]:
import pyspark
import pandas as pd
from pydataset import data
from pyspark.sql.functions import lit, concat

### Create a spark data frame that contains your favorite programming languages

* The name of the column should be language
* View the schema of the dataframe
* Output the shape of the dataframe
* Show the first 5 records in the dataframe

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
pandas_dataframe = pd.DataFrame({'language': ['Python', 'Ruby', 'C++', 'Java', 'RockStar']})
df = spark.createDataFrame(pandas_dataframe)

In [4]:
print(df.count(), "rows", len(df.columns), "columns")

5 rows 1 columns


In [5]:
df.show()

+--------+
|language|
+--------+
|  Python|
|    Ruby|
|     C++|
|    Java|
|RockStar|
+--------+



### Load the mpg dataset as a spark dataframe.

* Create 1 column of output that contains a message like the one below for each vehicle

* The 1999 audi a4 has a 4 cylinder engine. 

* Transform the trans column so that it only contains either manual or auto.

In [6]:
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [7]:
mpg.select(concat(lit('The '), mpg.year, lit(' '), mpg.manufacturer, lit(' has a '), mpg.cyl, lit(' cylinder engine.')).alias('description')).show(10, False)

+--------------------------------------+
|description                           |
+--------------------------------------+
|The 1999 audi has a 4 cylinder engine.|
|The 1999 audi has a 4 cylinder engine.|
|The 2008 audi has a 4 cylinder engine.|
|The 2008 audi has a 4 cylinder engine.|
|The 1999 audi has a 6 cylinder engine.|
|The 1999 audi has a 6 cylinder engine.|
|The 2008 audi has a 6 cylinder engine.|
|The 1999 audi has a 4 cylinder engine.|
|The 1999 audi has a 4 cylinder engine.|
|The 2008 audi has a 4 cylinder engine.|
+--------------------------------------+
only showing top 10 rows



### Load the tips dataset as a spark dataframe.

* What percentage of observations are smokers?
* Create a column that contains the tip percentage
* Calculate the average tip percentage for each combination of sex and smoker.

In [8]:
tips = spark.createDataFrame(data("tips"))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



### Use the seattle weather dataset referenced in the lesson to answer the questions below.

* Convert the temperatures to fahrenheit.
* Which month has the most rain, on average?
* Which year was the windiest?
* What is the most frequent type of weather in January?
* What is the average high and low temperature on sunny days in July in 2013 and 2014?
* What percentage of days were rainy in q3 of 2015?
* For each year, find what percentage of days it rained (had non-zero precipitation).

In [10]:
from vega_datasets import data

df = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
df = spark.createDataFrame(df)
df.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows

