In [2]:
# Create a spark data frame that contains your favorite programming languages.

# The name of the column should be language
# View the schema of the dataframe
# Output the shape of the dataframe
# Show the first 5 records in the dataframe


# imports
import pyspark
import pandas as pd
import numpy as np

# Set up the Spark environment: reuse the active session or start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Build the source data in pandas first; the single column is named "language".
languages = ["python", "C", "Kotlin", "Java", "Go"]
languages_pd = pd.DataFrame({"language": languages})

# Convert to a Spark dataframe. The pandas frame keeps its own name so that
# `df` never changes type part-way through the cell.
df = spark.createDataFrame(languages_pd)

# View the schema of the dataframe.
df.printSchema()

# Show the first 5 records (a bare `show()` prints up to 20 rows).
df.show(5)

# Output the shape as (rows, columns): the row count comes from `count()`,
# the column count from the list of column names.
print((df.count(), len(df.columns)))

+--------+
|language|
+--------+
|  python|
|       C|
|  Kotlin|
|    Java|
|      Go|
+--------+

(5, 1)


In [3]:
# Load the mpg dataset as a spark dataframe
# Create 1 column of output that contains a message like the one below:
# The 1999 audi a4 has a 4 cylinder engine.
# For each vehicle.


import pyspark
from pydataset import data
from pyspark.sql.functions import *

# Load the mpg dataset via pydataset and convert it to a Spark dataframe.
mpg = spark.createDataFrame(data("mpg"))

mpg.show()

# Sentence template: "The YEAR MANUFACTURER MODEL has a CYL cylinder engine."
# Column names are written exactly as they appear in the schema (`year`, not
# `Year`) so the lookup does not depend on case-insensitive name resolution.
mpg.select(
    concat(
        lit("The "),
        col("year"),
        lit(" "),
        col("manufacturer"),
        lit(" "),
        col("model"),
        lit(" has a "),
        col("cyl"),
        lit(" cylinder engine."),
    ).alias("Car Summary")
).show(20, truncate=False)  # truncate=False keeps the full sentence visible
        

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [4]:
# Transform the trans column so that it only contains either manual or auto.
# The regex keeps the leading run of word characters, e.g. "auto(l5)" -> "auto".
# `withColumn` names the resulting column itself, so the expression carries no
# `.alias(...)`; the column stays `trans`.
mpg = mpg.withColumn("trans", regexp_extract("trans", r"^(\w+)", 1))

mpg.show()

+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto|  4| 16| 25|  p|compact|
|        audi|        a4 quattro|  2.0|2

In [9]:
# Load the tips dataset as a spark dataframe.

import pyspark
from pydataset import data
from pyspark.sql.functions import *

# Get the active Spark session, or start one when none exists yet.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# Load the tips dataset via pydataset and convert it to a Spark dataframe.
df = spark.createDataFrame(
    data("tips"),
)

# What percentage of observations are smokers?

In [None]:
# What percentage of observations are smokers?

In [8]:
# Create a column that contains the tip percentage:
# tip / total_bill * 100, rounded to 0 decimal places.
# NOTE: `round` must be the Spark SQL function brought in by the star import
# from pyspark.sql.functions, since it is applied to a Column expression.
tip_ratio = df.tip / df.total_bill
tip_pct = round(tip_ratio * 100, 0).alias("Tip Percentage")

df.select(tip_pct).show()

+--------------+
|Tip Percentage|
+--------------+
|           6.0|
|          16.0|
|          17.0|
|          14.0|
|          15.0|
|          19.0|
|          23.0|
|          12.0|
|          13.0|
|          22.0|
|          17.0|
|          14.0|
|          10.0|
|          16.0|
|          20.0|
|          18.0|
|          16.0|
|          23.0|
|          21.0|
|          16.0|
+--------------+
only showing top 20 rows



In [None]:
# Use the seattle weather dataset referenced in the lesson to answer the questions below.
# Convert the temperatures to Fahrenheit.
# Which month has the most rain, on average?
# Which year was the windiest?
# What is the most frequent type of weather in January?
# What is the average high and low temperature on sunny days in July in 2013 and 2014?
# What percentage of days were rainy in q3 of 2015?
# For each year, find what percentage of days it rained (had non-zero precipitation).