In [1]:
# Create a spark data frame that contains your favorite programming languages.

# The name of the column should be language
# View the schema of the dataframe
# Output the shape of the dataframe
# Show the first 5 records in the dataframe


# imports
import pyspark
from pyspark.sql.functions import *
import pandas as pd
import numpy as np

conf = pyspark.SparkConf().set('spark.driver.host','127.0.0.1')
sc = pyspark.SparkContext(master='local', appName='myAppName',conf=conf)

# set up spark environment
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# create pandas dataframe
languages = ["python", "C", "Kotlin", "Java", "Go"]

df = pd.DataFrame({"language":languages})

# convert to spark dataframe
df = spark.createDataFrame(df)

df.show()

print((df.count(), len(df.columns)))

+--------+
|language|
+--------+
|  python|
|       C|
|  Kotlin|
|    Java|
|      Go|
+--------+

(5, 1)


In [2]:
# Load the mpg dataset as a spark dataframe
# Create 1 column of output that contains a message like the one below:
# The 1999 audi a4 has a 4 cylinder engine.
# For each vehicle.


import pyspark
from pydataset import data
from pyspark.sql.functions import *

mpg = spark.createDataFrame(data("mpg"))

mpg.show()

# the YEAR MANUFACTUERER MODEL has a CYL cylinder engine.
mpg.select(
            concat( 
                    lit("The "), 
                    col("Year"),
                    lit(" "),
                    col("manufacturer"),
                    lit(" "),
                    col("model"),
                    lit(" has a "),
                    col("cyl"),
                    lit(" cylinder engine.")
                    ).alias("Car Summery")
                   ).show(20, False)
        

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [21]:
# Transform the trans column so that it only contains either manual or auto.

mpg = mpg.withColumn("trans",regexp_extract( "trans",r"^(\w+)",1)
         .alias("Trans"))

mpg.show()

+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto|  4| 16| 25|  p|compact|
|        audi|        a4 quattro|  2.0|2

In [4]:
# Load the tips dataset as a spark dataframe.

import pyspark
from pydataset import data
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.getOrCreate()

df = spark.createDataFrame(data("tips"))

df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [31]:
# What percentage of observations are smokers?

total = df.count()

(df.groupBy(df.smoker)
   .agg((round(count(df.smoker)/total,2)*100)
   .alias("percent"))
   .filter(df.smoker=="Yes")
   .show())



# smokers = df.select((smokers/total).alias("Percent of Smokers"))


+------+-------+
|smoker|percent|
+------+-------+
|   Yes|   38.0|
+------+-------+



In [6]:
# Create a column that contains the tip percentage

df.withColumn("Tip Percentage",round((col("tip")/col("total_bill"))*100,0)).show()

+----------+----+------+------+---+------+----+--------------+
|total_bill| tip|   sex|smoker|day|  time|size|Tip Percentage|
+----------+----+------+------+---+------+----+--------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|           6.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|          16.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|          17.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|          14.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|          15.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|          19.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|          23.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|          12.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|          13.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|          22.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|          17.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|          14.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        

In [32]:
# Calculate the average tip percentage for each combination of sex and smoker

(df.withColumn("tip_percentage",round((df.tip/df.total_bill)*100,0))
   .groupBy("sex")
   .pivot("smoker")
   .agg(round(mean("tip_percentage"),2))
   .show())

+------+-----+-----+
|   sex|   No|  Yes|
+------+-----+-----+
|Female|15.69|18.24|
|  Male| 16.1|15.28|
+------+-----+-----+



In [8]:
# Use the seattle weather dataset referenced in the lesson to answer the questions below.

from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [9]:
weather.dtypes

[('date', 'string'),
 ('precipitation', 'double'),
 ('temp_max', 'double'),
 ('temp_min', 'double'),
 ('wind', 'double'),
 ('weather', 'string')]

In [24]:
# Convert the temperatures to farenheight.

(weather.withColumn("temp_max",expr("round(temp_max * 9 / 5 + 32)"))
       .withColumn("temp_min",expr("round(temp_min * 9 / 5 + 32)"))
       .show())

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    55.0|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|    51.0|    37.0| 4.5|   rain|
|2012-01-03|          0.8|    53.0|    45.0| 2.3|   rain|
|2012-01-04|         20.3|    54.0|    42.0| 4.7|   rain|
|2012-01-05|          1.3|    48.0|    37.0| 6.1|   rain|
|2012-01-06|          2.5|    40.0|    36.0| 2.2|   rain|
|2012-01-07|          0.0|    45.0|    37.0| 2.3|   rain|
|2012-01-08|          0.0|    50.0|    37.0| 2.0|    sun|
|2012-01-09|          4.3|    49.0|    41.0| 3.4|   rain|
|2012-01-10|          1.0|    43.0|    33.0| 3.4|   rain|
|2012-01-11|          0.0|    43.0|    30.0| 5.1|    sun|
|2012-01-12|          0.0|    43.0|    29.0| 1.9|    sun|
|2012-01-13|          0.0|    41.0|    27.0| 1.3|    sun|
|2012-01-14|          4.1|    40.0|    33.0| 5.3|   snow|
|2012-01-15|  

In [25]:
# Which month has the most rain, on average?

(weather.groupBy(month("date").alias("month"))
       .agg(round(avg(weather.precipitation),2)
       .alias("ave_rain"))
       .sort(month("date").asc())
       .show())
        

+-----+--------+
|month|ave_rain|
+-----+--------+
|    1|    3.76|
|    2|    3.73|
|    3|    4.89|
|    4|    3.13|
|    5|    1.67|
|    6|    1.11|
|    7|    0.39|
|    8|    1.32|
|    9|    1.96|
|   10|    4.06|
|   11|    5.35|
|   12|    5.02|
+-----+--------+



In [26]:
# Which year was the windiest?

(weather.groupBy(year("date").alias("year"))
        .agg(round(avg(weather.wind),2)
        .alias("ave_wind")).sort(year("date").asc()).show())

+----+--------+
|year|ave_wind|
+----+--------+
|2012|     3.4|
|2013|    3.02|
|2014|    3.39|
|2015|    3.16|
+----+--------+



In [27]:
# What is the most frequent type of weather in January?

(weather.filter(month("date") == "1")
        .groupBy(weather.weather.alias("Jan_weather"))
        .agg(count(weather.weather).alias("number_of_days"))
        .sort(weather.weather.asc())
        .show())

+-----------+--------------+
|Jan_weather|number_of_days|
+-----------+--------------+
|    drizzle|            10|
|        fog|            38|
|       rain|            35|
|       snow|             8|
|        sun|            33|
+-----------+--------------+



In [28]:
# What is the average high and low tempurature on sunny days in July in 2013 and 2014?

(weather.filter(weather.weather=="sun")
       .filter(month("date")=="7").filter((year("date") == 2013) | (year("date") == 2014))
       .select("temp_max","temp_min").agg(round(avg(weather.temp_max))
       .alias("ave_max_temp"),round(avg(weather.temp_min))
       .alias("ave_min_temp")).show())

+------------+------------+
|ave_max_temp|ave_min_temp|
+------------+------------+
|        27.0|        14.0|
+------------+------------+



In [29]:
# What percentage of days were rainy in q3 of 2015?

total = (weather
         .filter((month("date")==7) | (month("date")==8) | (month("date")==9))
         .filter(year("date")==2015).count())

(weather.filter((month("date")==7) | (month("date")==8) | (month("date")==9))
        .filter(year("date")==2015)
        .groupBy(weather.weather)
        .agg((round(count(weather.weather)/total,2)*100)
        .alias("percent")).filter(weather.weather=="rain")
        .show())
 


+-------+-------+
|weather|percent|
+-------+-------+
|   rain|    2.0|
+-------+-------+



In [30]:
# For each year, find what percentage of days it rained (had non-zero precipitation).

total = 365

(weather.filter(weather.precipitation>0)
                       .groupBy(year("date")
                       .alias("year"))
        .agg((round(count(weather.precipitation)/total,2)*100)
        .alias("percentage_of_days_it_rained"))
        .sort(year('date')
        .asc())
        .show())




+----+----------------------------+
|year|percentage_of_days_it_rained|
+----+----------------------------+
|2012|                        48.0|
|2013|                        42.0|
|2014|                        41.0|
|2015|                        39.0|
+----+----------------------------+

