Airlines data pipeline 

In [1]:
# Read the raw airlines CSV from the bronze area. header=false means the
# first line of the file is treated as data rather than as column names.
df = (
    spark.read
    .format("csv")
    .option("header", "false")
    .load("Files/bronze/airlines.csv")
)

# Render the DataFrame with the notebook's display helper.
display(df)

StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b61d325b-f1b1-40d3-859a-faf7a5140aa0)

In [2]:
from pyspark.sql import SparkSession  
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the Spark session for the "Airline Project" application.
spark = (
    SparkSession.builder
    .appName("Airline Project")
    .getOrCreate()
)

# Schema for the airlines file: two string columns, both declared
# non-nullable, listed in the order they appear in the file.
Flights_schema = StructType(
    [
        StructField(column_name, StringType(), nullable=False)
        for column_name in ("AirlineCode", "Airline")
    ]
)


StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 4, Finished, Available, Finished)

In [3]:
# Re-read the airlines CSV, this time applying the explicit Flights_schema
# for column names and types. The file is still read without a header row.
df = (
    spark.read
    .format("csv")
    .schema(Flights_schema)
    .option("header", "false")
    .load("Files/bronze/airlines.csv")
)

# Print a text preview of the loaded rows.
df.show()


StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 5, Finished, Available, Finished)

+-----------+--------------------+
|AirlineCode|             Airline|
+-----------+--------------------+
|         UA|United Air Lines ...|
|         AA|American Airlines...|
|         US|     US Airways Inc.|
|         F9|Frontier Airlines...|
|         B6|     JetBlue Airways|
|         OO|Skywest Airlines ...|
|         AS|Alaska Airlines Inc.|
|         NK|    Spirit Air Lines|
|         WN|Southwest Airline...|
|         DL|Delta Air Lines Inc.|
|         EV|Atlantic Southeas...|
|         HA|Hawaiian Airlines...|
|         MQ|American Eagle Ai...|
|         VX|      Virgin America|
|         TT|             Testing|
+-----------+--------------------+



In [4]:
# Persist the airlines rows to the Delta table "Flights_schema.airlines".
# ("Flights_schema" inside the table name is part of a string, i.e. the
# table's qualifier; it is not the Python schema variable of the same name.)
# A plain saveAsTable without an explicit mode was used previously; the
# write now appends to the table.
# NOTE(review): append mode adds rows on every run, so re-running this cell
# against an unchanged CSV would presumably duplicate them -- confirm intended.
(
    df.write
    .format("delta")
    .mode("append")
    .saveAsTable("Flights_schema.airlines")
)


StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 6, Finished, Available, Finished)

In [5]:
# Print a text preview of the in-memory DataFrame `df`.
# NOTE(review): this shows `df` as loaded from the CSV; it does not read the
# Delta table back, so it does not verify what was actually written.
df.show()


StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 7, Finished, Available, Finished)

+-----------+--------------------+
|AirlineCode|             Airline|
+-----------+--------------------+
|         UA|United Air Lines ...|
|         AA|American Airlines...|
|         US|     US Airways Inc.|
|         F9|Frontier Airlines...|
|         B6|     JetBlue Airways|
|         OO|Skywest Airlines ...|
|         AS|Alaska Airlines Inc.|
|         NK|    Spirit Air Lines|
|         WN|Southwest Airline...|
|         DL|Delta Air Lines Inc.|
|         EV|Atlantic Southeas...|
|         HA|Hawaiian Airlines...|
|         MQ|American Eagle Ai...|
|         VX|      Virgin America|
|         TT|             Testing|
+-----------+--------------------+



Airports data pipeline

In [6]:
# Read the raw airports CSV from the bronze area. header=true means the
# first line of the file supplies the column names.
df2 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/bronze/airports.csv")
)

# df2 is now a Spark DataFrame holding the contents of
# "Files/bronze/airports.csv"; render it with the notebook's display helper.
display(df2)

StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2363d6b6-1f64-4a50-86d6-ac7452457945)

In [7]:
from pyspark.sql import SparkSession  
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Obtain the Spark session for the "Airline Project" application.
spark = (
    SparkSession.builder
    .appName("Airline Project")
    .getOrCreate()
)

# Schema for the airports file, given as (column name, Spark type) pairs in
# file order; every column is declared non-nullable.
# NOTE(review): this rebinds Flights_schema, which an earlier cell defined
# for the airlines file -- after this cell runs, the airlines schema is no
# longer reachable under that name.
Flights_schema = StructType(
    [
        StructField(column_name, column_type, nullable=False)
        for column_name, column_type in (
            ("IATA_CODE", StringType()),
            ("Airport", StringType()),
            ("City", StringType()),
            ("State", StringType()),
            ("Country", StringType()),
            ("Latitude", FloatType()),
            ("Longitude", FloatType()),
        )
    ]
)

StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 9, Finished, Available, Finished)

In [8]:
# Re-read the airports CSV, applying the explicit Flights_schema for column
# names and types. The file's first line is a header row.
df2 = (
    spark.read
    .format("csv")
    .schema(Flights_schema)
    .option("header", "true")
    .load("Files/bronze/airports.csv")
)

# Print a text preview of the loaded rows.
df2.show()

StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 10, Finished, Available, Finished)

+---------+--------------------+-------------+-----+-------+--------+----------+
|IATA_CODE|             Airport|         City|State|Country|Latitude| Longitude|
+---------+--------------------+-------------+-----+-------+--------+----------+
|      ABE|Lehigh Valley Int...|    Allentown|   PA|    USA|40.65236|  -75.4404|
|      ABI|Abilene Regional ...|      Abilene|   TX|    USA|32.41132|  -99.6819|
|      ABQ|Albuquerque Inter...|  Albuquerque|   NM|    USA|35.04022|-106.60919|
|      ABR|Aberdeen Regional...|     Aberdeen|   SD|    USA|45.44906| -98.42183|
|      ABY|Southwest Georgia...|       Albany|   GA|    USA|31.53552| -84.19447|
|      ACK|Nantucket Memoria...|    Nantucket|   MA|    USA|41.25305| -70.06018|
|      ACT|Waco Regional Air...|         Waco|   TX|    USA|31.61129| -97.23052|
|      ACV|      Arcata Airport|Arcata/Eureka|   CA|    USA|40.97812|-124.10862|
|      ACY|Atlantic City Int...|Atlantic City|   NJ|    USA|39.45758| -74.57717|
|      ADK|        Adak Airp

In [9]:
# Persist the airports rows to the Delta table "Flights_schema.airports".
# A plain saveAsTable without an explicit mode was used for the first load;
# the automated pipeline now appends new data to the existing table.
# NOTE(review): append mode adds rows on every run, so re-running this cell
# against an unchanged CSV would presumably duplicate them -- confirm intended.
(
    df2.write
    .format("delta")
    .mode("append")
    .saveAsTable("Flights_schema.airports")
)

StatementMeta(, d40d6945-9890-4723-ac23-d9e6b9cd2c5a, 11, Finished, Available, Finished)