In [1]:
import pyspark.sql
import pyspark.sql.functions as F
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## 1. Create a spark data frame that contains your favorite programming languages.
    
    - The name of the column should be `language`
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe



In [2]:
data = [{'language':'python'},
    {'language':'javascript'}
    ]

df =spark.createDataFrame(data)
df.show()

+----------+
|  language|
+----------+
|    python|
|javascript|
+----------+



In [3]:
data = [['python'],['javascript']]
columns = ['language']

df = spark.createDataFrame(data=data, schema=columns)
df.show()

+----------+
|  language|
+----------+
|    python|
|javascript|
+----------+



## 2. Load the `mpg` dataset as a spark dataframe.


In [4]:
import pydataset
mpg = pydataset.data('mpg')
mpg.head(2)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact


In [5]:
df = spark.createDataFrame(mpg)
df.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

    
    1. Create 1 column of output that contains a message like the one below:
        
        `The 1999 audi a4 has a 4 cylinder engine.` 
        
        For each vehicle.
        


In [6]:
df = df.withColumnRenamed('manufacturer', 'make')

In [7]:
df.select(
    F.format_string("The %d %s %s has a %d cylinder engine", df['year'], df['make'], df['model'], df['cyl']).alias('sentence')
    ).show(5, truncate=False)

+----------------------------------------+
|sentence                                |
+----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 6 cylinder engine|
+----------------------------------------+
only showing top 5 rows



    2. Transform the `trans` column so that it only contains either `manual` or `auto`.
        


In [8]:
df.withColumn('trans',
    F.when(df['trans'].contains('manual'), 'manual'
        ).otherwise('auto')
    ).show(5)

+----+-----+-----+----+---+------+---+---+---+---+-------+
|make|model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+----+-----+-----+----+---+------+---+---+---+---+-------+
|audi|   a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|audi|   a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|audi|   a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|audi|   a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|audi|   a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
+----+-----+-----+----+---+------+---+---+---+---+-------+
only showing top 5 rows



## 3. Load the `tips` dataset as a spark dataframe.
    
    

In [9]:
df = pydataset.data('tips')
df = spark.createDataFrame(df)
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



1. What percentage of observations are smokers?
    

In [10]:
df.count()

244

In [11]:
df.groupBy('smoker').agg(F.count(df['smoker']) / df.count()).show()

+------+---------------------+
|smoker|(count(smoker) / 244)|
+------+---------------------+
|    No|   0.6188524590163934|
|   Yes|  0.38114754098360654|
+------+---------------------+



2. Create a column that contains the tip percentage
    

In [12]:
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [13]:
df = df.withColumn(
    'tip_percent', df.tip / df.total_bill
    )
df.show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|        tip_percent|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0

3. Calculate the average tip percentage for each combination of sex and smoker.



In [14]:
df.groupBy(['sex','smoker']).agg(F.avg(df.tip_percent)).show()

+------+------+-------------------+
|   sex|smoker|   avg(tip_percent)|
+------+------+-------------------+
|  Male|    No| 0.1606687151291298|
|  Male|   Yes| 0.1527711752024851|
|Female|    No| 0.1569209707691836|
|Female|   Yes|0.18215035269941035|
+------+------+-------------------+



## 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.
    
    

In [15]:
from vega_datasets import data
df = data.seattle_weather()
df = spark.createDataFrame(df)
df.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



- Convert the temperatures to fahrenheit.
    

In [16]:
def c_to_f(c):
    f = (c * (9/5))+32
    return f

In [17]:
9/5

1.8

In [18]:
def spark_to_f(name:str):
    return ((F.col(name) * (9/5)) + 32).alias(name)

In [19]:
type(spark_to_f('temp_max'))

pyspark.sql.column.Column

In [20]:
df = df.withColumn('temp_max',
        spark_to_f('temp_max')
    ).withColumn('temp_min',
        spark_to_f('temp_min')
    )

df.show()

+-------------------+-------------+------------------+------------------+----+-------+
|               date|precipitation|          temp_max|          temp_min|wind|weather|
+-------------------+-------------+------------------+------------------+----+-------+
|2012-01-01 00:00:00|          0.0|55.040000000000006|              41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|             51.08|             37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|             53.06|             44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|             53.96|             42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|48.019999999999996|             37.04| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|             39.92|             35.96| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|             44.96|             37.04| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|              50.0|             37.04| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|        

- Which month has the most rain, on average?
    

In [21]:
df.groupBy( F.month(df.date).alias('month') ).agg(F.avg('precipitation').alias('avg')).sort('avg', ascending=False).show(1)

+-----+-----------------+
|month|              avg|
+-----+-----------------+
|   11|5.354166666666667|
+-----+-----------------+
only showing top 1 row



- Which year was the windiest?
    

In [22]:
df.groupBy( F.year(df.date).alias('year') ).agg(F.avg('wind').alias('avg')).sort('avg', ascending=False).show(5)

+----+------------------+
|year|               avg|
+----+------------------+
|2012| 3.400819672131147|
|2014|3.3876712328767136|
|2015|  3.15972602739726|
|2013|3.0158904109589044|
+----+------------------+



- What is the most frequent type of weather in January?
    

In [30]:
# fog
df.withColumn('month', F.month(df.date)).where(F.col('month') == 1).groupBy('weather').agg(F.count('weather')).show()

+-------+--------------+
|weather|count(weather)|
+-------+--------------+
|drizzle|            10|
|   rain|            35|
|    sun|            33|
|   snow|             8|
|    fog|            38|
+-------+--------------+



- What is the average high and low temperature on sunny days in July in 2013 and 2014?
    

In [44]:
month = F.month(F.col('date')).alias('month')
year = F.year(F.col('date')).alias('year')
df.select(
    month,
    year,
    'temp_max',
    'temp_min'
    ).where(month == 7).groupBy(['month','year']).agg(F.avg('temp_max'), F.avg('temp_min')).show(5)

+-----+----+-----------------+------------------+
|month|year|    avg(temp_max)|     avg(temp_min)|
+-----+----+-----------------+------------------+
|    7|2012| 73.2316129032258|55.278064516129035|
|    7|2013| 78.9683870967742| 57.07806451612904|
|    7|2014|            80.42|57.966451612903235|
|    7|2015|82.56838709677419|59.900000000000006|
+-----+----+-----------------+------------------+



- What percentage of days were rainy in q3 of 2015?
    

- For each year, find what percentage of days it rained (had non-zero precipitation).