### Spark 101 - Spark API Exercises

In [21]:
import pandas as pd
import numpy as np
import pyspark
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
from pydataset import data

### 1.

In [9]:
# One-column pandas DataFrame listing a handful of language names.
languages = {
    "language": ["python", "sql", "spark", "java", "javascript"],
}
df = pd.DataFrame(languages)
df

Unnamed: 0,language
0,python
1,sql
2,spark
3,java
4,javascript


In [13]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Build the Spark DataFrame from the `languages` dict instead of from `df`
# itself: once `df` has been rebound to a Spark DataFrame, feeding it back
# into createDataFrame fails, so the original form could not be re-run.
df = spark.createDataFrame(pd.DataFrame(data=languages))
df

DataFrame[language: string]

In [14]:
# Display up to five rows of the Spark DataFrame.
df.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+----------+
|  language|
+----------+
|    python|
|       sql|
|     spark|
|      java|
|javascript|
+----------+



                                                                                

In [18]:
# Summary statistics (count / mean / stddev / min / max) for the DataFrame.
(
    df.describe()
    .show()
)

+-------+--------+
|summary|language|
+-------+--------+
|  count|       5|
|   mean|    null|
| stddev|    null|
|    min|    java|
|    max|     sql|
+-------+--------+



In [19]:
# Print the DataFrame's schema (column names, types, nullability) as a tree.
df.printSchema()

root
 |-- language: string (nullable = true)



---

### 2.

In [22]:
# Load the "mpg" dataset as pandas first, then hand it to Spark.
mpg_pd = data("mpg")
mpg = spark.createDataFrame(mpg_pd)
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [34]:
# Build one sentence per vehicle, e.g.
# "The 1999 audi a4 has a 4 cylinder engine".
vehicle_statement = f.concat(
    lit("The "),
    f.col("year"),
    lit(" "),
    f.col("manufacturer"),
    lit(" "),
    f.col("model"),
    lit(" "),
    lit("has a "),
    f.col("cyl"),
    lit(" "),
    lit("cylinder engine"),
).alias("vehicle statement")

# truncate=False so the full sentence is printed instead of being cut off.
mpg.select(vehicle_statement).show(10, truncate=False)

+------------------------------------------------+
|vehicle statement                               |
+------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine        |
|The 1999 audi a4 has a 4 cylinder engine        |
|The 2008 audi a4 has a 4 cylinder engine        |
|The 2008 audi a4 has a 4 cylinder engine        |
|The 1999 audi a4 has a 6 cylinder engine        |
|The 1999 audi a4 has a 6 cylinder engine        |
|The 2008 audi a4 has a 6 cylinder engine        |
|The 1999 audi a4 quattro has a 4 cylinder engine|
|The 1999 audi a4 quattro has a 4 cylinder engine|
|The 2008 audi a4 quattro has a 4 cylinder engine|
+------------------------------------------------+
only showing top 10 rows



In [65]:
# Keep only the part of `trans` before the first "(" -> "auto" / "manual".
trans_type = f.substring_index(f.col("trans"), "(", 1).alias("trans_type")
mpg.select(trans_type).show(5)

+----------+
|trans_type|
+----------+
|      auto|
|    manual|
|    manual|
|      auto|
|      auto|
+----------+
only showing top 5 rows



---

### 3.

In [66]:
# Load the "tips" dataset as pandas first, then hand it to Spark.
tips_pd = data("tips")
tips = spark.createDataFrame(tips_pd)
tips.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [96]:
# Number of rows with a non-null `smoker` value; used as the denominator
# when computing each group's share below.
smoker_count_row = tips.select(f.count(f.col("smoker"))).first()
total = smoker_count_row[0]
total

244

In [99]:
# Count rows per smoker group, then add each group's share of `total`
# (a 0-1 fraction, despite the column being called "percentage").
(
    tips.groupBy("smoker")
    .count()
    .withColumn("percentage", f.col("count") / total)
    .show()
)

+------+-----+-------------------+
|smoker|count|         percentage|
+------+-----+-------------------+
|    No|  151| 0.6188524590163934|
|   Yes|   93|0.38114754098360654|
+------+-----+-------------------+



In [101]:
# Append tip / total_bill as a new column (a 0-1 fraction) next to the
# existing ones.
tip_share = f.col("tip") / f.col("total_bill")
tips.withColumn("tip_percentage", tip_share).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [106]:
# Average tip for every (sex, smoker) combination.
(
    tips.groupBy("sex", "smoker")
    .agg(f.avg("tip"))
    .show()
)

+------+------+------------------+
|   sex|smoker|          avg(tip)|
+------+------+------------------+
|  Male|    No|3.1134020618556706|
|Female|    No| 2.773518518518518|
|  Male|   Yes| 3.051166666666666|
|Female|   Yes| 2.931515151515151|
+------+------+------------------+



---

### 4.

In [187]:
# Import under an alias: a bare `data` here would replace the pydataset
# `data` loader imported at the top of the notebook, so the earlier
# data("mpg") / data("tips") cells would no longer work when re-run.
from vega_datasets import data as vega_data

# The date column is cast to a plain string before the frame is handed to Spark.
weather_pd = vega_data.seattle_weather().assign(
    date=lambda frame: frame.date.astype(str)
)
weather = spark.createDataFrame(weather_pd)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [189]:
# Convert both temperature columns to Fahrenheit: F = C * 9 / 5 + 32.
# NOTE: this rebinds `weather`, so running the cell a second time would
# apply the conversion again to already-converted values.
for temp_col in ("temp_max", "temp_min"):
    weather = weather.withColumn(temp_col, f.col(temp_col) * 9 / 5 + 32)
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05|          1.3|   48.02|   37.04| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [190]:
from pyspark.sql.functions import month, year, quarter

# Average precipitation per calendar month (all years pooled together),
# listed from the wettest month to the driest.
monthly_rainfall = (
    weather.withColumn("month", month("date"))
    .groupBy("month")
    .agg(f.avg("precipitation").alias("avg_rainfall"))
)
monthly_rainfall.orderBy(f.desc("avg_rainfall")).show()



+-----+-------------------+
|month|       avg_rainfall|
+-----+-------------------+
|   11|  5.354166666666667|
|   12|  5.021774193548389|
|    3|  4.888709677419354|
|   10|  4.059677419354839|
|    1| 3.7580645161290316|
|    2|  3.734513274336283|
|    4|  3.128333333333333|
|    9|             1.9625|
|    5| 1.6733870967741935|
|    8| 1.3201612903225806|
|    6| 1.1075000000000002|
|    7|0.38870967741935486|
+-----+-------------------+



In [191]:
# Average wind value per year, windiest year first.
yearly_wind = (
    weather.groupBy(year("date").alias("year"))
    .agg(f.avg("wind").alias("avg_wind"))
)
yearly_wind.orderBy(f.desc("avg_wind")).show()

+----+------------------+
|year|          avg_wind|
+----+------------------+
|2012| 3.400819672131147|
|2014|3.3876712328767122|
|2015|  3.15972602739726|
|2013|3.0158904109589044|
+----+------------------+



In [192]:
# Weather rows with the month number of `date` appended as a "month" column.
month_of_date = month("date").alias("month")
df2 = weather.select("*", month_of_date)
df2.show()

+----------+-------------+------------------+------------------+----+-------+-----+
|      date|precipitation|          temp_max|          temp_min|wind|weather|month|
+----------+-------------+------------------+------------------+----+-------+-----+
|2012-01-01|          0.0|             55.04|              41.0| 4.7|drizzle|    1|
|2012-01-02|         10.9|             51.08|             37.04| 4.5|   rain|    1|
|2012-01-03|          0.8|             53.06|             44.96| 2.3|   rain|    1|
|2012-01-04|         20.3|             53.96|             42.08| 4.7|   rain|    1|
|2012-01-05|          1.3|             48.02|             37.04| 6.1|   rain|    1|
|2012-01-06|          2.5|             39.92|             35.96| 2.2|   rain|    1|
|2012-01-07|          0.0|             44.96|             37.04| 2.3|   rain|    1|
|2012-01-08|          0.0|              50.0|             37.04| 2.0|    sun|    1|
|2012-01-09|          4.3|             48.92|              41.0| 3.4|   rain

In [193]:
# Keep only the January rows (month == 1).
is_january = f.col("month") == 1
df3 = df2.where(is_january)
df3.show()

+----------+-------------+------------------+------------------+----+-------+-----+
|      date|precipitation|          temp_max|          temp_min|wind|weather|month|
+----------+-------------+------------------+------------------+----+-------+-----+
|2012-01-01|          0.0|             55.04|              41.0| 4.7|drizzle|    1|
|2012-01-02|         10.9|             51.08|             37.04| 4.5|   rain|    1|
|2012-01-03|          0.8|             53.06|             44.96| 2.3|   rain|    1|
|2012-01-04|         20.3|             53.96|             42.08| 4.7|   rain|    1|
|2012-01-05|          1.3|             48.02|             37.04| 6.1|   rain|    1|
|2012-01-06|          2.5|             39.92|             35.96| 2.2|   rain|    1|
|2012-01-07|          0.0|             44.96|             37.04| 2.3|   rain|    1|
|2012-01-08|          0.0|              50.0|             37.04| 2.0|    sun|    1|
|2012-01-09|          4.3|             48.92|              41.0| 3.4|   rain

In [194]:
# How many January days fall under each weather label, most frequent first.
(
    df3.groupBy("weather")
    .count()
    .orderBy(f.desc("count"))
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [195]:
# Count January days by weather label in a single chain. The "month" column
# exists only on the intermediate frame, not on `weather` itself, so it is
# referenced by name through f.col() rather than as an attribute.
(
    weather.withColumn("month", month("date"))
    .where(f.col("month") == 1)
    .groupBy("weather")
    .count()
    .orderBy(f.desc("count"))
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [196]:
# Weather rows with "month" and "year" columns derived from `date`.
month_year = weather.select(
    "*",
    month("date").alias("month"),
    year("date").alias("year"),
)
month_year.show()

+----------+-------------+------------------+------------------+----+-------+-----+----+
|      date|precipitation|          temp_max|          temp_min|wind|weather|month|year|
+----------+-------------+------------------+------------------+----+-------+-----+----+
|2012-01-01|          0.0|             55.04|              41.0| 4.7|drizzle|    1|2012|
|2012-01-02|         10.9|             51.08|             37.04| 4.5|   rain|    1|2012|
|2012-01-03|          0.8|             53.06|             44.96| 2.3|   rain|    1|2012|
|2012-01-04|         20.3|             53.96|             42.08| 4.7|   rain|    1|2012|
|2012-01-05|          1.3|             48.02|             37.04| 6.1|   rain|    1|2012|
|2012-01-06|          2.5|             39.92|             35.96| 2.2|   rain|    1|2012|
|2012-01-07|          0.0|             44.96|             37.04| 2.3|   rain|    1|2012|
|2012-01-08|          0.0|              50.0|             37.04| 2.0|    sun|    1|2012|
|2012-01-09|         

In [197]:
# Average high and low temperature over the sunny days of July 2013.
sunny_july_2013 = (
    (f.col("year") == 2013)
    & (f.col("month") == 7)
    & (f.col("weather") == "sun")
)
month_year.where(sunny_july_2013).agg(
    f.avg("temp_max"),
    f.avg("temp_min"),
).show()

+-----------------+-----------------+
|    avg(temp_max)|    avg(temp_min)|
+-----------------+-----------------+
|79.85333333333334|57.16666666666668|
+-----------------+-----------------+



In [198]:
# Average high/low for sunny days in July 2013, computed in a single chain.
# "month" and "year" exist only on the intermediate frame, so every column in
# the condition is referenced by name through f.col().
is_sunny_july_2013 = (
    (f.col("year") == 2013)
    & (f.col("month") == 7)
    & (f.col("weather") == "sun")
)
(
    weather.withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .where(is_sunny_july_2013)
    .agg(f.avg("temp_max"), f.avg("temp_min"))
    .show()
)


+-----------------+-----------------+
|    avg(temp_max)|    avg(temp_min)|
+-----------------+-----------------+
|79.85333333333334|57.16666666666668|
+-----------------+-----------------+



In [199]:
# Weather rows with "quarter" and "year" columns derived from `date`.
df5 = weather.select(
    "*",
    quarter("date").alias("quarter"),
    year("date").alias("year"),
)
df5.show()

+----------+-------------+------------------+------------------+----+-------+-------+----+
|      date|precipitation|          temp_max|          temp_min|wind|weather|quarter|year|
+----------+-------------+------------------+------------------+----+-------+-------+----+
|2012-01-01|          0.0|             55.04|              41.0| 4.7|drizzle|      1|2012|
|2012-01-02|         10.9|             51.08|             37.04| 4.5|   rain|      1|2012|
|2012-01-03|          0.8|             53.06|             44.96| 2.3|   rain|      1|2012|
|2012-01-04|         20.3|             53.96|             42.08| 4.7|   rain|      1|2012|
|2012-01-05|          1.3|             48.02|             37.04| 6.1|   rain|      1|2012|
|2012-01-06|          2.5|             39.92|             35.96| 2.2|   rain|      1|2012|
|2012-01-07|          0.0|             44.96|             37.04| 2.3|   rain|      1|2012|
|2012-01-08|          0.0|              50.0|             37.04| 2.0|    sun|      1|2012|

In [200]:
# Restrict to the third quarter of 2015.
in_q3_2015 = (f.col("year") == 2015) & (f.col("quarter") == 3)
df6 = df5.where(in_q3_2015)
df6.show()

+----------+-------------+-----------------+------------------+----+-------+-------+----+
|      date|precipitation|         temp_max|          temp_min|wind|weather|quarter|year|
+----------+-------------+-----------------+------------------+----+-------+-------+----+
|2015-07-01|          0.0|89.96000000000001|62.959999999999994| 4.3|    sun|      3|2015|
|2015-07-02|          0.0|            93.02|             64.04| 3.4|    sun|      3|2015|
|2015-07-03|          0.0|            91.94|             64.04| 2.6|    sun|      3|2015|
|2015-07-04|          0.0|            91.94|              59.0| 2.9|    sun|      3|2015|
|2015-07-05|          0.0|91.03999999999999|62.059999999999995| 2.1|    sun|      3|2015|
|2015-07-06|          0.0|84.91999999999999|             60.08| 3.2|drizzle|      3|2015|
|2015-07-07|          0.0|            80.96|             57.02| 2.4|    sun|      3|2015|
|2015-07-08|          0.0|             86.0|             57.92| 1.9|drizzle|      3|2015|
|2015-07-0

In [201]:
# Number of Q3-2015 days under each weather label.
(
    df6.groupBy("weather")
    .count()
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   21|
|drizzle|    5|
|   rain|    2|
|    sun|   64|
+-------+-----+



In [202]:
# Share of rows in df6 labelled "rain": flag each row 1 (rain) or 0
# (anything else), then average the flag.
rain_flag = when(f.col("weather") == "rain", 1).otherwise(0).alias("rain")
(
    df6.select(rain_flag)
    .agg(f.mean("rain"))
    .show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



In [204]:
# Fraction of days per year on which it rained, where a rainy day is defined
# as precipitation > 0: flag each day 1/0, then average the flag per year.
did_rain = when(f.col("precipitation") > 0, 1).otherwise(0).alias("did_rain")
(
    weather.withColumn("year", year("date"))
    .select(did_rain, "year")
    .groupby("year")
    .agg(f.mean("did_rain"))
    # Row order after a groupby is not guaranteed; sort explicitly so the
    # years always print in chronological order.
    .sort("year")
    .show()
)

+----+-------------------+
|year|      avg(did_rain)|
+----+-------------------+
|2012|0.48360655737704916|
|2013|0.41643835616438357|
|2014|  0.410958904109589|
|2015|0.39452054794520547|
+----+-------------------+

