In [28]:
import pyspark

import pandas as pd
import numpy as np

from pydataset import data

In [29]:
from pyspark.sql.functions import col, expr

In [30]:
# creating our entry point to programmig with Spark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe


In [31]:
language_df = spark.createDataFrame(pd.DataFrame({'language': ['python', 'R', 'java','javascript', 'scala', 'c++', 'sas', 'ruby'],
                                                  'score': [10,7,6,4,3,5,2,1]}))

In [32]:
language_df.show()

+----------+-----+
|  language|score|
+----------+-----+
|    python|   10|
|         R|    7|
|      java|    6|
|javascript|    4|
|     scala|    3|
|       c++|    5|
|       sas|    2|
|      ruby|    1|
+----------+-----+



In [33]:
language_df.printSchema()

root
 |-- language: string (nullable = true)
 |-- score: long (nullable = true)



In [34]:
# one way to get the shape of the data. We can see that there are
# 8 observations and 2 columns
language_df.describe().show()

+-------+--------+------------------+
|summary|language|             score|
+-------+--------+------------------+
|  count|       8|                 8|
|   mean|    null|              4.75|
| stddev|    null|2.9154759474226504|
|    min|       R|                 1|
|    max|   scala|                10|
+-------+--------+------------------+



In [35]:
# we can also use another function to get the length of the dataframe
language_df.count()

8

In [36]:
# lets only see the first five observations

language_df.show(5)

+----------+-----+
|  language|score|
+----------+-----+
|    python|   10|
|         R|    7|
|      java|    6|
|javascript|    4|
|     scala|    3|
+----------+-----+
only showing top 5 rows



2. Load the mpg dataset as a spark dataframe.

    - Create 1 column of output that contains a message like the one below:

        - The 1999 audi a4 has a 4 cylinder engine.
    - For each vehicle.

        - Transform the trans column so that it only contains either manual or auto.



In [37]:
mpg = spark.createDataFrame(data('mpg'))

In [38]:
from pyspark.sql.functions import when



In [39]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [40]:
# Create 1 column of output that contains a message like the one below:
# The 1999 audi a4 has a 4 cylinder engine.

mpg.select(concat(lit('The '), mpg.year, lit(' '), mpg.manufacturer,lit(' '), mpg.model,lit(' '), lit('has '), mpg.cyl, lit('cylinder engine'))).alias('Final').show(1)

+----------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  , has , cyl, cylinder engine)|
+----------------------------------------------------------------------------+
|                                                        The 1999 audi a4 ...|
+----------------------------------------------------------------------------+
only showing top 1 row



In [41]:
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [42]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [43]:
#For each vehicle.

# Transform the trans column so that it only contains either manual or auto.

mpg.select( 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', regexp_extract('trans', r"(\w+)", 1)\
           .alias('transmission'), 'drv', 'cty', 'hwy', 'fl', 'class').show()

+------------+------------------+-----+----+---+----------+------------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|transmission|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+------------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|        auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|      manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|      manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|        auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|        auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|      manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|        auto|  f| 18| 27|  p|compact|
|        audi|        a4 quatt

3. Load the tips dataset as a spark dataframe.

    - What percentage of observations are smokers?
    - Create a column that contains the tip percentage
    - Calculate the average tip percentage for each combination of sex and smoker.

In [44]:
tips = spark.createDataFrame(data('tips'))

In [45]:
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [46]:
# percentage of smoker 

(tips.filter(tips.smoker == 'Yes').count()/tips.count()) * 100

38.114754098360656

In [47]:
# creating a separate column for tip percentage

tips.select('total_bill', 'tip', 'sex','smoker', 'day','time','size', expr('(tip/total_bill)*100').alias('tip_percentage')).show()

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|    tip_percentage|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|16.054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|16.658733936220845|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 13.97804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|14.680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4| 18.62396204033215|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| 22.80501710376283|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|11.607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|13.031914893617023|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|21.853856562922868|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 16.65043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|14.180374361883

In [48]:
# calculating the average tip percentage for each combination of sex and smoker

tips.select('sex', 'smoker', expr('(tip/total_bill)*100').alias('tip_percentage')).groupBy('sex', 'smoker')\
.mean('tip_percentage').show()

+------+------+-------------------+
|   sex|smoker|avg(tip_percentage)|
+------+------+-------------------+
|  Male|    No|  16.06687151291298|
|  Male|   Yes| 15.277117520248513|
|Female|    No| 15.692097076918358|
|Female|   Yes|  18.21503526994103|
+------+------+-------------------+



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    - Convert the temperatures to farenheight.
    - Which month has the most rain, on average?
    - Which year was the windiest?
    - What is the most frequent type of weather in January?
    - What is the average high and low temperature on sunny days in July in 2013 and 2014?
    - What percentage of days were rainy in q3 of 2015?
    - For each year, find what percentage of days it rained (had non-zero precipitation).

In [49]:
from vega_datasets import data

In [50]:
# creating a spark dataframe
seattle = spark.createDataFrame(data('seattle_weather'))

In [51]:
# taking a peek at the data
seattle.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|     6.1|    -1.7| 1.9|    

In [52]:
import pyspark.sql.functions as f

In [53]:
# converting the temperatures to fahrenheit

seattle.select('date', 'precipitation', expr('ROUND(temp_max * 1.8 + 32)').alias('max_temp_F'),\
               expr('ROUND(temp_min * 1.8 + 32)').alias('min_temp_F')).show()

+-------------------+-------------+----------+----------+
|               date|precipitation|max_temp_F|min_temp_F|
+-------------------+-------------+----------+----------+
|2012-01-01 00:00:00|          0.0|      55.0|      41.0|
|2012-01-02 00:00:00|         10.9|      51.0|      37.0|
|2012-01-03 00:00:00|          0.8|      53.0|      45.0|
|2012-01-04 00:00:00|         20.3|      54.0|      42.0|
|2012-01-05 00:00:00|          1.3|      48.0|      37.0|
|2012-01-06 00:00:00|          2.5|      40.0|      36.0|
|2012-01-07 00:00:00|          0.0|      45.0|      37.0|
|2012-01-08 00:00:00|          0.0|      50.0|      37.0|
|2012-01-09 00:00:00|          4.3|      49.0|      41.0|
|2012-01-10 00:00:00|          1.0|      43.0|      33.0|
|2012-01-11 00:00:00|          0.0|      43.0|      30.0|
|2012-01-12 00:00:00|          0.0|      43.0|      29.0|
|2012-01-13 00:00:00|          0.0|      41.0|      27.0|
|2012-01-14 00:00:00|          4.1|      40.0|      33.0|
|2012-01-15 00

In [59]:
# which month has most rain on average?

seattle.withColumn('month', f.month('date')).groupBy('month').agg(avg('precipitation').alias('average_rainfall_by_month'))\
.sort(f.desc('average_rainfall_by_month'))\
.show(1)

+-----+-------------------------+
|month|average_rainfall_by_month|
+-----+-------------------------+
|   11|        5.354166666666667|
+-----+-------------------------+
only showing top 1 row



In [60]:
#Which year was the windiest?

seattle.withColumn('year', f.year('date')).groupBy('year').agg(avg('wind').alias('average_wind_by_year'))\
.sort(f.desc('average_wind_by_year'))\
.show(1)

+----+--------------------+
|year|average_wind_by_year|
+----+--------------------+
|2012|   3.400819672131147|
+----+--------------------+
only showing top 1 row



In [61]:
#What is the most frequent type of weather in January

seattle.filter(f.month('date')==1).groupBy('weather').count().sort(f.desc('count')).show(1)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
+-------+-----+
only showing top 1 row



In [116]:
#What is the average high and low temperature on sunny days in July in 2013 and 2014?

seattle.where((f.month('date') == 7) & (seattle.weather == 'sun') & ((f.year('date') == 2013) | (f.year('date') == 2014))).\
agg(f.mean('temp_max'), f.mean('temp_min')).show()


+------------------+-----------------+
|     avg(temp_max)|    avg(temp_min)|
+------------------+-----------------+
|26.828846153846158|14.18269230769231|
+------------------+-----------------+



In [107]:
# getting the temperature difference for every sunny days in July of 2013 and 2014

seattle.where((f.month('date') == 7) & (seattle.weather == 'sun') & ((f.year('date') == 2013) | (f.year('date') == 2014))).\
select('date','temp_min', 'temp_max', 'weather',expr('ROUND(temp_max - temp_min)').alias('temp_dif')).show()


+-------------------+--------+--------+-------+--------+
|               date|temp_min|temp_max|weather|temp_dif|
+-------------------+--------+--------+-------+--------+
|2013-07-01 00:00:00|    18.3|    31.7|    sun|    13.0|
|2013-07-02 00:00:00|    15.6|    28.3|    sun|    13.0|
|2013-07-03 00:00:00|    16.7|    26.1|    sun|     9.0|
|2013-07-05 00:00:00|    13.9|    23.3|    sun|     9.0|
|2013-07-06 00:00:00|    13.3|    26.1|    sun|    13.0|
|2013-07-07 00:00:00|    13.9|    23.9|    sun|    10.0|
|2013-07-08 00:00:00|    13.3|    26.7|    sun|    13.0|
|2013-07-09 00:00:00|    15.0|    30.0|    sun|    15.0|
|2013-07-10 00:00:00|    13.9|    22.2|    sun|     8.0|
|2013-07-11 00:00:00|    12.2|    22.8|    sun|    11.0|
|2013-07-12 00:00:00|    13.3|    19.4|    sun|     6.0|
|2013-07-13 00:00:00|    11.1|    26.1|    sun|    15.0|
|2013-07-14 00:00:00|    12.8|    27.8|    sun|    15.0|
|2013-07-15 00:00:00|    14.4|    27.8|    sun|    13.0|
|2013-07-16 00:00:00|    18.3| 