## Reading the raw data files from Databricks FS.

In [0]:
# File locations on the Databricks file system.
# The flights location is a glob, so every file matching partition_0*.csv is loaded.
airline_file_location = "/FileStore/tables/phdata/airlines.csv"
airport_file_location = "/FileStore/tables/phdata/airports.csv"
flights_file_location = "/FileStore/tables/phdata/flights/partition_0*.csv"
file_type = "csv"

# CSV options shared by all three sources
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


def read_raw_csv(file_location):
    """Load one raw CSV source into a Spark DataFrame.

    All three sources are read with the same reader settings (format, schema
    inference, header handling, separator), so they are applied in one place
    instead of being repeated for every file.

    Args:
        file_location: Path (or glob pattern) of the CSV file(s) to load.

    Returns:
        The DataFrame produced by the Spark CSV reader for that location.
    """
    return (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .option("header", first_row_is_header)
        .option("sep", delimiter)
        .load(file_location)
    )


# Create one DataFrame per raw CSV source and preview each of them.
# Read the airline.csv
airlines_df = read_raw_csv(airline_file_location)
airlines_df.show(truncate=False)

# Read the airport.csv
airports_df = read_raw_csv(airport_file_location)
airports_df.show(10, truncate=False)

# Read the flights.csv
flights_df = read_raw_csv(flights_file_location)
flights_df.show(10, truncate=False)

## Configure the Snowflake connector options and create the schemas if needed

In [0]:
# Snowflake connection options, passed to every Snowflake read/write in the Python cells.
#
# The password is read from a Databricks secret scope instead of being stored in
# clear text in the notebook source.
# NOTE(review): the scope name "snowflake" and key name "sf-password" are placeholders;
# create that secret in the workspace or change the names to match an existing one.
sf_options = {
  "sfUrl": "https://nv36420.ap-south-1.aws.snowflakecomputing.com/",
  "sfUser": "Mani",
  "sfPassword": dbutils.secrets.get(scope="snowflake", key="sf-password"),
  "sfDatabase": "USER_MANI",
  "sfWarehouse": "INTERVIEW_WH"
}

In [0]:
%scala
import net.snowflake.spark.snowflake.Utils

// Snowflake connection options for the Scala side of the notebook
// (same account, user, database and warehouse as the Python sf_options).
//
// The password is read from a Databricks secret scope instead of being stored in
// clear text in the notebook source.
// NOTE(review): the scope name "snowflake" and key name "sf-password" are placeholders;
// create that secret in the workspace or change the names to match an existing one.
val sf_options = Map(
  "sfUrl" -> "https://nv36420.ap-south-1.aws.snowflakecomputing.com/",
  "sfUser" -> "Mani",
  "sfPassword" -> dbutils.secrets.get(scope = "snowflake", key = "sf-password"),
  "sfDatabase" -> "USER_MANI",
  "sfWarehouse" -> "INTERVIEW_WH"
)

// Create the Snowflake schemas used by the later cells.
// IF NOT EXISTS keeps these statements safe to re-run.
Utils.runQuery(sf_options, """CREATE SCHEMA IF NOT EXISTS RAW_DATA""")
Utils.runQuery(sf_options, """CREATE SCHEMA IF NOT EXISTS CURATED_DATA""")

## Writing the raw data files into Snowflake for better modularization

In [0]:
# Write each raw DataFrame into Snowflake, one table per source, in the RAW_DATA schema.
# mode('overwrite') replaces the target table on every run.
raw_tables = (
    (airlines_df, "USER_MANI.RAW_DATA.airlines_raw"),
    (airports_df, "USER_MANI.RAW_DATA.airports_raw"),
    (flights_df, "USER_MANI.RAW_DATA.flights_raw"),
)

for raw_df, raw_table in raw_tables:
    (
        raw_df.write.mode('overwrite')
        .format("snowflake")
        .option("sfSchema", "RAW_DATA")
        .option("dbtable", raw_table)
        .options(**sf_options)
        .save()
    )

## Prepare the curated data set and store in Snowflake to generate the reports

In [0]:
# Columns carried over from the raw flights data into the curated Snowflake table.
cols = [
    "YEAR", "MONTH", "DAY",
    "AIRLINE", "FLIGHT_NUMBER",
    "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
    "CANCELLED", "CANCELLATION_REASON",
    "AIR_SYSTEM_DELAY", "SECURITY_DELAY",
    "DEPARTURE_DELAY", "ARRIVAL_DELAY",
    "AIRLINE_DELAY", "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY",
]

# Working assumption: a flight row is unusable when any of these key fields is null,
# so such rows are dropped. Further scrubbing rules can be added here as requirements evolve.
required_not_null = ["ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "AIRLINE"]

# Drop the incomplete rows first, then keep only the columns needed for the reports.
flights_clean_df = (
    flights_df
    .dropna(how='any', subset=required_not_null)
    .select(cols)
)

# Store the curated data set in the CURATED_DATA schema, replacing any previous version.
(
    flights_clean_df.write.mode('overwrite')
    .format("snowflake")
    .option("sfSchema", "CURATED_DATA")
    .option("dbtable", "USER_MANI.CURATED_DATA.flights")
    .options(**sf_options)
    .save()
)

## Report 1:
● Total number of flights by airline and airport on a monthly basis

In [0]:
# Load the TOTAL_FLIGHTS_PER_MONTH_BASIS object from the CURATED_DATA schema and render it.
# NOTE(review): this object is not created in the visible cells of this notebook -
# presumably a view defined directly in Snowflake; confirm it exists before running.
view_df = (
    spark.read.format("snowflake")
    .option("dbtable", "USER_MANI.CURATED_DATA.TOTAL_FLIGHTS_PER_MONTH_BASIS")
    .options(**sf_options)
    .load()
)
display(view_df)

## Report 2:
On-time percentage of each airline for the year 2015

In [0]:
# Load the ON_TIME_AIRLINES object from the CURATED_DATA schema and render it.
# NOTE(review): this object is not created in the visible cells of this notebook -
# presumably a view defined directly in Snowflake; confirm it exists before running.
on_time_airlines_view_df = (
    spark.read.format("snowflake")
    .option("dbtable", "USER_MANI.CURATED_DATA.ON_TIME_AIRLINES")
    .options(**sf_options)
    .load()
)
display(on_time_airlines_view_df)

## Report 3:
● Airlines with largest number of delays

In [0]:
# Load the AIRLINES_WITH_LARGEST_DELAYS object from the CURATED_DATA schema and render it.
# NOTE(review): this object is not created in the visible cells of this notebook -
# presumably a view defined directly in Snowflake; confirm it exists before running.
largest_delay_airlines_view_df = (
    spark.read.format("snowflake")
    .option("dbtable", "USER_MANI.CURATED_DATA.AIRLINES_WITH_LARGEST_DELAYS")
    .options(**sf_options)
    .load()
)
display(largest_delay_airlines_view_df)

## Report 4:
● Cancellation reasons by airport

In [0]:
# Load the CANCELLATION_REASONS_BY_AIRPORT object from the CURATED_DATA schema and render it.
# NOTE(review): this object is not created in the visible cells of this notebook -
# presumably a view defined directly in Snowflake; confirm it exists before running.
cancel_reasons_view_df = (
    spark.read.format("snowflake")
    .option("dbtable", "USER_MANI.CURATED_DATA.CANCELLATION_REASONS_BY_AIRPORT")
    .options(**sf_options)
    .load()
)
display(cancel_reasons_view_df)

## Report 5:
● Delay reasons by airport

In [0]:
# Load the DELAY_REASONS_BY_AIRPORT object from the CURATED_DATA schema and render it.
# NOTE(review): this object is not created in the visible cells of this notebook -
# presumably a view defined directly in Snowflake; confirm it exists before running.
delay_reasons_view_df = (
    spark.read.format("snowflake")
    .option("dbtable", "USER_MANI.CURATED_DATA.DELAY_REASONS_BY_AIRPORT")
    .options(**sf_options)
    .load()
)
display(delay_reasons_view_df)

## Report 6:
● Airline with most unique routes

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType
from pyspark.sql import Window

# Build one row per airline holding the set of distinct routes it flies and their count.
#
# A route is treated as direction-agnostic: the (origin, destination) pair is sorted,
# so A -> B and B -> A collapse into the same entry of the set.
route_pair = f.array_sort(f.array(f.col("ORIGIN_AIRPORT"), f.col("DESTINATION_AIRPORT")))

# Aggregate per airline directly. The result has the columns
# AIRLINE, airline_route, unique_route_count - the same shape as collecting the set over a
# per-airline window and keeping one row per airline, but without attaching the
# full route set to every single flight row before discarding all but one of them.
# This cell also no longer reassigns the global `cols` used by the curated-flights cell.
unique_route_df = (
    flights_df
    .groupBy("AIRLINE")
    .agg(f.collect_set(route_pair).alias("airline_route"))
    .withColumn("unique_route_count", f.size("airline_route"))
)

# Store the per-airline route sets in the CURATED_DATA schema, replacing any previous version.
(
    unique_route_df.write.mode('overwrite')
    .format("snowflake")
    .option("sfSchema", "CURATED_DATA")
    .option("dbtable", "USER_MANI.CURATED_DATA.airlines_unique_route")
    .options(**sf_options)
    .save()
)

In [0]:
# Load the UNIQUE_ROUTES_BY_AIRLINES object from the CURATED_DATA schema and render it.
# NOTE(review): this object is not created in the visible cells of this notebook -
# presumably a view defined directly in Snowflake; confirm it exists before running.
airlines_unique_routes_view_df = (
    spark.read.format("snowflake")
    .option("dbtable", "USER_MANI.CURATED_DATA.UNIQUE_ROUTES_BY_AIRLINES")
    .options(**sf_options)
    .load()
)
display(airlines_unique_routes_view_df)