In [2]:
# Create a spark data frame that contains your favorite programming languages.

# The name of the column should be language
# View the schema of the dataframe
# Output the shape of the dataframe
# Show the first 5 records in the dataframe


# imports
import pyspark
from pyspark.sql.functions import *
import pandas as pd
import numpy as np

# Driver configuration. Master and application name live on the conf object so
# that the context can be obtained through getOrCreate below.
conf = (
    pyspark.SparkConf()
    .setMaster('local')
    .setAppName('myAppName')
    .set('spark.driver.host', '127.0.0.1')
)

# getOrCreate returns the already-running context when there is one, so this
# cell can be re-run without raising
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# set up spark environment (the session attaches to the context created above)
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Build the single-column table in pandas first.
languages = ["python", "C", "Kotlin", "Java", "Go"]
languages_pdf = pd.DataFrame({"language": languages})

# Convert to a Spark dataframe. `df` is only ever bound to the Spark frame,
# so the name never refers to two different kinds of object.
df = spark.createDataFrame(languages_pdf)

# View the schema of the dataframe.
df.printSchema()

# Show the first 5 records in the dataframe.
df.show(5)

# Output the shape as (rows, columns); a Spark frame has no `.shape`,
# so the row count comes from count() and the width from the column list.
print((df.count(), len(df.columns)))

+--------+
|language|
+--------+
|  python|
|       C|
|  Kotlin|
|    Java|
|      Go|
+--------+

(5, 1)


In [3]:
# Load the mpg dataset as a spark dataframe
# Create 1 column of output that contains a message like the one below:
# The 1999 audi a4 has a 4 cylinder engine.
# For each vehicle.


import pyspark
from pydataset import data
from pyspark.sql.functions import *

# Load the pydataset "mpg" table into Spark and preview it.
mpg = spark.createDataFrame(data("mpg"))

mpg.show()

# Template: "The YEAR MANUFACTURER MODEL has a CYL cylinder engine."
# Column names are written exactly as they appear in the frame (`year`, not
# `Year`) so the lookup does not depend on case-insensitive name resolution.
car_summary = concat(
    lit("The "),
    col("year"),
    lit(" "),
    col("manufacturer"),
    lit(" "),
    col("model"),
    lit(" has a "),
    col("cyl"),
    lit(" cylinder engine."),
).alias("Car Summary")

# truncate=False keeps the full sentence visible instead of cutting it short.
mpg.select(car_summary).show(20, truncate=False)
        

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [None]:
# Transform the trans column so that it only contains either manual or auto.

# Keep only the leading run of word characters of `trans`
# (the text before the opening parenthesis, e.g. "auto(l5)" -> "auto").
# The alias on the expression has no effect on the result: withColumn names
# the output column "trans".
transmission_kind = regexp_extract("trans", r"^(\w+)", 1).alias("Trans")
mpg = mpg.withColumn("trans", transmission_kind)

mpg.show()

In [None]:
# Load the tips dataset as a spark dataframe.

import pyspark
from pydataset import data
from pyspark.sql.functions import *

# Obtain the current Spark session (or start one if none is running).
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

# Fetch the "tips" table from pydataset and convert it to a Spark dataframe.
tips_pdf = data("tips")
df = spark.createDataFrame(tips_pdf)

df.show()

In [None]:
# What percentage of observations are smokers?

# Denominator: every observation in the tips frame.
total = df.count()

# Share of rows per smoker value, expressed as a percentage.
# The ratio is scaled to a percentage *before* rounding so two decimals of the
# percentage are kept and no floating-point residue is introduced by
# multiplying an already-rounded number.
# Note: `round` and `count` are the pyspark.sql.functions versions brought in
# by the wildcard import, not the Python built-ins.
(
    df.groupBy(df.smoker)
    .agg(round(count(df.smoker) / total * 100, 2).alias("percent"))
    .filter(df.smoker == "Yes")
    .show()
)



# smokers = df.select((smokers/total).alias("Percent of Smokers"))


In [None]:
# Create a column that contains the tip percentage

# Tip as a fraction of the bill, then expressed as a whole-number percentage.
# The result is only displayed; `df` itself is not reassigned.
tip_fraction = col("tip") / col("total_bill")
with_tip_pct = df.withColumn("Tip Percentage", round(tip_fraction * 100, 0))
with_tip_pct.show()

In [None]:
# Calculate the average tip percentage for each combination of sex and smoker

# Average tip percentage for every sex x smoker combination:
# one row per sex, one pivoted column per smoker value.
# The per-row percentage is left unrounded so that rounding error does not
# leak into the mean; only the final average is rounded to two decimals.
(
    df.withColumn("tip_percentage", df.tip / df.total_bill * 100)
    .groupBy("sex")
    .pivot("smoker")
    .agg(round(mean("tip_percentage"), 2))
    .show()
)

In [None]:
# Use the seattle weather dataset referenced in the lesson to answer the questions below.

from vega_datasets import data

# Pull the Seattle weather table from vega_datasets as a pandas frame and
# turn the `date` column into text before handing the frame to Spark.
weather_pdf = data.seattle_weather()
weather_pdf = weather_pdf.assign(date=weather_pdf["date"].astype(str))

# `weather` is the Spark dataframe used by the cells that follow.
weather = spark.createDataFrame(weather_pdf)
weather.show(6)

In [None]:
# Inspect the column names and the Spark types of the weather frame.
weather.dtypes

In [None]:
# Convert the temperatures to Fahrenheit.

# SQL expression template for the conversion: value * 9 / 5 + 32, rounded.
fahrenheit_sql = "round({0} * 9 / 5 + 32)"

# Both temperature columns are replaced in the displayed result only;
# `weather` is not reassigned, so later cells still see the original values.
(
    weather
    .withColumn("temp_max", expr(fahrenheit_sql.format("temp_max")))
    .withColumn("temp_min", expr(fahrenheit_sql.format("temp_min")))
    .show()
)

In [None]:
# Which month has the most rain, on average?

# Mean precipitation per calendar month (all years pooled).
# Sorted by the aggregated value, largest first, so the wettest month is the
# top row. The sort key is a column of the aggregated result rather than the
# pre-aggregation `date` column.
(
    weather.groupBy(month("date").alias("month"))
    .agg(round(avg(weather.precipitation), 2).alias("ave_rain"))
    .sort("ave_rain", ascending=False)
    .show()
)
        

In [None]:
# Which year was the windiest?

# Mean wind value per year.
# Sorted by the aggregated value, largest first, so the windiest year is the
# top row. The sort key is a column of the aggregated result rather than the
# pre-aggregation `date` column.
(
    weather.groupBy(year("date").alias("year"))
    .agg(round(avg(weather.wind), 2).alias("ave_wind"))
    .sort("ave_wind", ascending=False)
    .show()
)

In [None]:
# What is the most frequent type of weather in January?

# Number of January days per weather type, across all years.
# month() yields an integer, so it is compared with the integer 1 rather than
# the string "1". Rows are ordered by the day count, largest first, so the
# most frequent weather type is the top row.
(
    weather.filter(month("date") == 1)
    .groupBy(weather.weather.alias("Jan_weather"))
    .agg(count(weather.weather).alias("number_of_days"))
    .sort("number_of_days", ascending=False)
    .show()
)

In [None]:
# What is the average high and low temperature on sunny days in July in 2013 and 2014?

# Row predicates, named so the selection reads as a sentence.
is_sunny = weather.weather == "sun"
is_july = month("date") == "7"
is_2013_or_2014 = (year("date") == 2013) | (year("date") == 2014)

# Sunny July days of 2013 and 2014, pooled into a single group.
sunny_july = weather.filter(is_sunny).filter(is_july).filter(is_2013_or_2014)

# Average daily high and low over that selection, rounded to whole numbers.
(
    sunny_july.select("temp_max", "temp_min")
    .agg(
        round(avg(weather.temp_max)).alias("ave_max_temp"),
        round(avg(weather.temp_min)).alias("ave_min_temp"),
    )
    .show()
)

In [None]:
# What percentage of days were rainy in q3 of 2015?

# Rows for the third quarter (July-September) of 2015. Built once and reused
# for both the denominator and the grouped counts, so the two can never drift
# apart.
q3_2015 = weather.filter(
    (month("date") == 7) | (month("date") == 8) | (month("date") == 9)
).filter(year("date") == 2015)

# Denominator: number of observed days in that quarter.
total = q3_2015.count()

# Share of those days whose weather label is "rain", as a percentage.
# The ratio is scaled before rounding so two decimals of the percentage are
# kept and no floating-point residue is introduced.
(
    q3_2015.groupBy(weather.weather)
    .agg(round(count(weather.weather) / total * 100, 2).alias("percent"))
    .filter(weather.weather == "rain")
    .show()
)



In [None]:
# For each year, find what percentage of days it rained (had non-zero precipitation).

# Percentage of days with non-zero precipitation, per year.
# Each day contributes 1 if it rained and 0 otherwise; the mean of that flag
# within a year is the rainy-day fraction. The denominator is therefore the
# number of rows actually present for that year, which stays correct for leap
# years and for years that are only partially covered, unlike a fixed 365.
rained_flag = when(weather.precipitation > 0, 1).otherwise(0)

(
    weather.groupBy(year("date").alias("year"))
    .agg(round(avg(rained_flag) * 100, 2).alias("percentage_of_days_it_rained"))
    .sort("year")
    .show()
)


