In [0]:
from pyspark.sql.functions import collect_set

mpg_df = spark.read.csv("dbfs:/FileStore/mpg.csv", header=True, inferSchema=True)
mpg_grouped = mpg_df.groupBy("manufacturer").agg(collect_set("model").alias("unique_models"))
mpg_grouped.show(truncate=False)

+------------+--------------------------------------------------------------------------------------+
|manufacturer|unique_models                                                                         |
+------------+--------------------------------------------------------------------------------------+
|land rover  |[range rover]                                                                         |
|pontiac     |[grand prix]                                                                          |
|toyota      |[4runner 4wd, camry solara, corolla, land cruiser wagon 4wd, toyota tacoma 4wd, camry]|
|lincoln     |[navigator 2wd]                                                                       |
|audi        |[a4, a6 quattro, a4 quattro]                                                          |
|jeep        |[grand cherokee 4wd]                                                                  |
|dodge       |[durango 4wd, ram 1500 pickup 4wd, dakota pickup 4wd, caravan 2wd]  

In [0]:
from pyspark.sql.functions import round, avg

# Pivot and calculate average city mpg
mpg_pivoted = (mpg_df.groupBy("year", "cyl").pivot("manufacturer").agg(round(avg("cty"), 1)))
mpg_pivoted.show()


+----+---+----+---------+-----+----+-----+-------+----+----------+-------+-------+------+-------+------+------+----------+
|year|cyl|audi|chevrolet|dodge|ford|honda|hyundai|jeep|land rover|lincoln|mercury|nissan|pontiac|subaru|toyota|volkswagen|
+----+---+----+---------+-----+----+-----+-------+----+----------+-------+-------+------+-------+------+------+----------+
|2008|  8|16.0|     13.6| 11.9|13.6| null|   null|11.8|      12.0|   12.0|   13.0|  12.0|   16.0|  null|  13.5|      null|
|1999|  4|18.3|     19.0| 18.0|null| 24.8|   18.5|null|      null|   null|   null|  20.0|   null|  19.0|  20.0|      23.3|
|1999|  6|16.2|     18.0| 14.9|15.3| null|   18.0|15.0|      null|   null|   14.0|  16.5|   17.0|  null|  16.5|      16.8|
|2008|  6|16.8|     17.5| 15.1|15.3| null|   17.3|16.0|      null|   null|   13.0|  17.8|   18.0|  null|  16.8|      17.0|
|2008|  5|null|     null| null|null| null|   null|null|      null|   null|   null|  null|   null|  null|  null|      20.5|
|2008|  4|20.0| 

In [0]:
flights_df = spark.read.csv("dbfs:/FileStore/flight_summary___Copy.csv", header=True, inferSchema=True)
airports_df = spark.read.csv("dbfs:/FileStore/airports.csv", header=True, inferSchema=True)

airports_df = airports_df.withColumnRenamed("IATA_CODE", "airport_code").withColumnRenamed("AIRPORT", "airport_name").withColumnRenamed("LATITUDE", "lat").withColumnRenamed("LONGITUDE", "lon")

# Prevent column conflicts
airports_df_origin = airports_df.withColumnRenamed("airport_name", "origin_airport").withColumnRenamed("lat", "origin_lat").withColumnRenamed("lon", "origin_lon").withColumnRenamed("airport_code", "origin_airport_code")  
airports_df_dest = airports_df.withColumnRenamed("airport_name", "dest_airport").withColumnRenamed("lat", "dest_lat").withColumnRenamed("lon", "dest_lon").withColumnRenamed("airport_code", "dest_airport_code")  

# Join origin 
flights_with_orig = flights_df.join(
    airports_df_origin, flights_df["origin_code"] == airports_df_origin["origin_airport_code"], "left").select(
    flights_df["origin_code"], airports_df_origin["origin_airport"],
    airports_df_origin["origin_lat"], airports_df_origin["origin_lon"],
    flights_df["dest_code"], airports_df_origin["STATE"].alias("origin_state")
)

# Join destination 
flights_final = flights_with_orig.join(
    airports_df_dest, flights_with_orig["dest_code"] == airports_df_dest["dest_airport_code"], "left").select(
    flights_with_orig["origin_code"], flights_with_orig["origin_airport"],
    flights_with_orig["origin_lat"], flights_with_orig["origin_lon"],
    flights_with_orig["dest_code"], airports_df_dest["dest_airport"],
    airports_df_dest["dest_lat"], airports_df_dest["dest_lon"],
    flights_with_orig["origin_state"] 
)

# Filter for Texas
texas_flights = flights_final.filter(flights_final["origin_state"] == "TX")
texas_flights.show(truncate=False)



+-----------+-------------------------------------------------------+----------+----------+---------+-----------------------------------------------------------------------------+--------+----------+------------+
|origin_code|origin_airport                                         |origin_lat|origin_lon|dest_code|dest_airport                                                                 |dest_lat|dest_lon  |origin_state|
+-----------+-------------------------------------------------------+----------+----------+---------+-----------------------------------------------------------------------------+--------+----------+------------+
|LBB        |Lubbock Preston Smith International Airport            |33.66364  |-101.82278|DEN      |Denver International Airport                                                 |39.85841|-104.667  |TX          |
|AUS        |Austin-Bergstrom International Airport                 |30.19453  |-97.66987 |ELP      |El Paso International Airport                  

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, weekofyear

aapl_df = spark.read.csv("dbfs:/FileStore/aapl_2017.csv", header=True, inferSchema=True)

aapl_df = aapl_df.withColumn("week", weekofyear(aapl_df["Date"]))
window_spec = Window.partitionBy("week").orderBy("Date")
aapl_weekly_avg = aapl_df.withColumn("weekly_avg_price", avg("Close").over(window_spec))
aapl_weekly_avg.orderBy("Date").show()



+----------+----------+----------+----------+----------+----------+--------+----+------------------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|week|  weekly_avg_price|
+----------+----------+----------+----------+----------+----------+--------+----+------------------+
|2017-01-03|115.800003|116.330002|114.760002|116.150002| 114.31176|28781900|   1|        116.150002|
|2017-01-04|115.849998|116.510002|    115.75|116.019997|114.183815|21118100|   1|116.08499950000001|
|2017-01-05|115.919998|116.860001|115.809998|116.610001|114.764473|22193600|   1|            116.26|
|2017-01-06|116.779999|118.160004|116.470001|117.910004|116.043915|31751900|   1|116.67250100000001|
|2017-01-09|117.949997|    119.43|117.940002|118.989998|117.106812|33561900|   2|        118.989998|
|2017-01-10|118.769997|119.379997|118.300003|119.110001|117.224907|24462100|   2|       119.0499995|
|2017-01-11|118.739998|    119.93|118.599998|    119.75|117.854782|27588600|   2|119.283333