# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 120
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current idle_timeout is None minutes.
idle_timeout has been set to 120 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 120
Session ID: efa9356e-2ce9-41cd-92b6-bd7608fdaf2a
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session efa9356e-2ce9-41cd-92b6-bd7608fdaf2a to get into ready status...
Session efa9356e-2ce9-41cd-92b6-bd7608fdaf2a has 

### **Start**

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, avg, when, floor, trim, concat, lit, desc, lag, sum, to_date, concat_ws, date_format
from pyspark.sql import functions as F




In [3]:
from awsglue.dynamicframe import DynamicFrame

# Initialize a Spark context
spark_context = SparkContext.getOrCreate()

# Initialize a Glue context
glue_context = GlueContext(spark_context)

# Specify the S3 path to your Parquet file
parquet_path = "s3://airtime-historical-data/clean/pipeline-2/"

# Read the Parquet file into a Spark DataFrame
df = glue_context.spark_session.read.parquet(parquet_path)

# Show the first few rows of the DataFrame to verify it's loaded correctly
df.show()


+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+
|deptime|uniquecarrier|flightnum|crsarrtime|depdelay|dayofmonth|tailnum|crsdeptime|arrdelay|year|dayofweek|crselapsedtime|arrtime|month|origin|dest|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+
|   1017|           MQ|     3374|      1115|       3|         1| N932AE|      1020|      10|2007|        5|            55|   1105|    6|   AUS| DAL|
|   1110|           MQ|     3762|      1220|       5|         2| N693AE|      1115|       4|2007|        1|            65|   1224|    4|   AMA| DFW|
|    834|           MQ|     4802|       955|       6|        10| N723AE|       840|      27|2007|        4|            75|    928|    5|   LGA| BOS|
|   1521|           MQ|     3787|      1540|      31|         8| N509AE|      1450|      21|2007|        6

In [6]:
# Assuming `df` is your DataFrame
na_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns])

# Show the result
na_counts.show()

+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+
|deptime|uniquecarrier|flightnum|crsarrtime|depdelay|dayofmonth|tailnum|crsdeptime|arrdelay|year|dayofweek|crselapsedtime|arrtime|month|origin|dest|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+
|1087657|            0|        0|         0| 1087657|         0|      0|         0| 1190147|   0|        0|         25695|1188125|    0|     0|   0|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+


***Further data cleaning***

In [7]:
ini_cnt = df.count()
df = df.dropna(subset = ["deptime", "crsarrtime", "depdelay", "dayofmonth", "tailnum", "crsdeptime", "arrdelay", "year", "dayofweek", "month", "arrtime"])
df = df.filter(df["tailnum"] != "NA")
fin_cnt = df.count()
rows_dropped = ini_cnt - fin_cnt

print(f"Number of rows intitally: {ini_cnt}")
print(f"Number of rows dropped: {rows_dropped}")
print(f"Number of rows left: {fin_cnt}")
df.show()

Number of rows intitally: 43458247
Number of rows dropped: 1190147
Number of rows left: 42268100
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+
|deptime|uniquecarrier|flightnum|crsarrtime|depdelay|dayofmonth|tailnum|crsdeptime|arrdelay|year|dayofweek|crselapsedtime|arrtime|month|origin|dest|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+
|   1017|           MQ|     3374|      1115|       3|         1| N932AE|      1020|      10|2007|        5|            55|   1105|    6|   AUS| DAL|
|   1110|           MQ|     3762|      1220|       5|         2| N693AE|      1115|       4|2007|        1|            65|   1224|    4|   AMA| DFW|
|    834|           MQ|     4802|       955|       6|        10| N723AE|       840|      27|2007|        4|            75|    928|    5|   LGA| BOS|
|   1521|

***Defining output location***

In [9]:
s3 = "s3://pipeline2-data-storage/historic-processed/"




#### **Feature Engineering**

In [10]:
# Creating total delay column
df = df.withColumn("totaldelay", col("depdelay") + col("arrdelay"))
df.head()

Row(deptime=1017, uniquecarrier='MQ', flightnum='3374', crsarrtime=1115, depdelay=3, dayofmonth=1, tailnum='N932AE', crsdeptime=1020, arrdelay=10, year=2007, dayofweek=5, crselapsedtime=55, arrtime=1105, month=6, origin='AUS', dest='DAL', totaldelay=13)


In [11]:
# Creating a time of year column to see seasonality
df = df.withColumn("season", when(col("month").isin(12, 1, 2), "Winter") .when(col("month").between(3, 5), "Spring") .when(col("month").between(6, 8), "Summer") .when(col("month").between(9, 11), "Fall") )
df.head()

Row(deptime=1017, uniquecarrier='MQ', flightnum='3374', crsarrtime=1115, depdelay=3, dayofmonth=1, tailnum='N932AE', crsdeptime=1020, arrdelay=10, year=2007, dayofweek=5, crselapsedtime=55, arrtime=1105, month=6, origin='AUS', dest='DAL', totaldelay=13, season='Summer')


In [12]:
# Creating the scheduled arrival/departure HOUR column to see time of day
df = df.withColumn("scheduledarrhour", floor(col("crsarrtime") / 100))
df = df.withColumn("scheduleddephour", floor(col("crsdeptime") / 100))
df.head()

Row(deptime=1017, uniquecarrier='MQ', flightnum='3374', crsarrtime=1115, depdelay=3, dayofmonth=1, tailnum='N932AE', crsdeptime=1020, arrdelay=10, year=2007, dayofweek=5, crselapsedtime=55, arrtime=1105, month=6, origin='AUS', dest='DAL', totaldelay=13, season='Summer', scheduledarrhour=11, scheduleddephour=10)


In [13]:
# Creating a column to check if it is a long haul flight
df = df.withColumn("longhaulflight", when(col("crselapsedtime") > 360, 1).otherwise(0))
df.head()

Row(deptime=1017, uniquecarrier='MQ', flightnum='3374', crsarrtime=1115, depdelay=3, dayofmonth=1, tailnum='N932AE', crsdeptime=1020, arrdelay=10, year=2007, dayofweek=5, crselapsedtime=55, arrtime=1105, month=6, origin='AUS', dest='DAL', totaldelay=13, season='Summer', scheduledarrhour=11, scheduleddephour=10, longhaulflight=0)


In [14]:
# Get route of the flight
df = df.withColumn("route", concat(col("origin"), lit("-"), col("dest")))
df.show(2)

+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+----------+------+----------------+----------------+--------------+-------+
|deptime|uniquecarrier|flightnum|crsarrtime|depdelay|dayofmonth|tailnum|crsdeptime|arrdelay|year|dayofweek|crselapsedtime|arrtime|month|origin|dest|totaldelay|season|scheduledarrhour|scheduleddephour|longhaulflight|  route|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+----------+------+----------------+----------------+--------------+-------+
|   1017|           MQ|     3374|      1115|       3|         1| N932AE|      1020|      10|2007|        5|            55|   1105|    6|   AUS| DAL|        13|Summer|              11|              10|             0|AUS-DAL|
|   1110|           MQ|     3762|      1220|       5|         2| N693AE|      1115|       4|2007|       

In [15]:
# Create date
df = df.withColumn(
    "flightdate",
        to_date(concat_ws("-", col("dayofmonth").cast("string"), col("month").cast("string"), col("year").cast("string")), "d-M-yyyy")
    )
df.show(3)

+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+----------+------+----------------+----------------+--------------+-------+----------+
|deptime|uniquecarrier|flightnum|crsarrtime|depdelay|dayofmonth|tailnum|crsdeptime|arrdelay|year|dayofweek|crselapsedtime|arrtime|month|origin|dest|totaldelay|season|scheduledarrhour|scheduleddephour|longhaulflight|  route|flightdate|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+----------+------+----------------+----------------+--------------+-------+----------+
|   1017|           MQ|     3374|      1115|       3|         1| N932AE|      1020|      10|2007|        5|            55|   1105|    6|   AUS| DAL|        13|Summer|              11|              10|             0|AUS-DAL|2007-06-01|
|   1110|           MQ|     3762|      1220|       5|       

*Saving df*

In [16]:
df.write.mode("overwrite").parquet(s3 + "df/")




***Cascading Effect***

In [17]:
## To ensure that we are careful when taking care of the cumulative delay

df_cascade = df

# Define a window partitioned by TailNum, ordered by FlightDate and DepTime
window_spec = Window.partitionBy("TailNum").orderBy("FlightDate", "DepTime")

# Get the previous destination and arrival delay for the same aircraft
df_cascade = df_cascade.withColumn("PrevDest", lag("Dest").over(window_spec))
df_cascade = df_cascade.withColumn("PrevArrDelay", lag("ArrDelay").over(window_spec))

# Check if the current flight’s origin matches the previous flight's destination
df_cascade = df_cascade.withColumn("IsConnected", (col("Origin") == col("PrevDest")).cast("int"))

# Set effective arrival delay only if flights are connected
df_cascade = df_cascade.withColumn(
    "EffectivePrevArrDelay", 
    when(col("IsConnected") == 1, col("PrevArrDelay")).otherwise(0)
)

# Define a cumulative sum window, but reset for disconnected flights
cumulative_window = Window.partitionBy("TailNum").orderBy("FlightDate", "DepTime") \
                          .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Calculate the cumulative departure delay (only for connected flights)
df_cascade = df_cascade.withColumn(
    "CumulativeDepDelay",
    sum(col("DepDelay") + col("EffectivePrevArrDelay")).over(cumulative_window)
)

# Show relevant columns to verify
df_cascade.select(
    "TailNum", "FlightDate", "FlightNum", "Origin", "Dest", 
    "DepTime", "crsdeptime", "ArrDelay", "PrevDest", "IsConnected", 
    "EffectivePrevArrDelay", "CumulativeDepDelay"
).show(20, truncate=False)

+-------+----------+---------+------+----+-------+----------+--------+--------+-----------+---------------------+------------------+
|TailNum|FlightDate|FlightNum|Origin|Dest|DepTime|crsdeptime|ArrDelay|PrevDest|IsConnected|EffectivePrevArrDelay|CumulativeDepDelay|
+-------+----------+---------+------+----+-------+----------+--------+--------+-----------+---------------------+------------------+
|-N123D |2001-01-28|547      |ATL   |LAS |2146   |2145      |2       |null    |null       |0                    |1                 |
|-N123D |2001-01-29|352      |LAS   |DFW |717    |715       |5       |LAS     |1          |2                    |5                 |
|-N123D |2001-01-29|301      |ATL   |DEN |1759   |1805      |21      |DFW     |0          |0                    |11                |
|-N123D |2001-01-30|1275     |DEN   |SLC |559    |600       |3       |DEN     |1          |21                   |33                |
|-N123D |2001-01-30|306      |SLC   |DFW |903    |845       |33      

In [18]:
# Now order the DataFrame by the "date" column in ascending order (earliest to latest)
ordered_df = df_cascade.orderBy("FlightDate", ascending=False)

# Show the ordered DataFrame
ordered_df.show()

+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+----------+------+----------------+----------------+--------------+-------+----------+--------+------------+-----------+---------------------+------------------+
|deptime|uniquecarrier|flightnum|crsarrtime|depdelay|dayofmonth|tailnum|crsdeptime|arrdelay|year|dayofweek|crselapsedtime|arrtime|month|origin|dest|totaldelay|season|scheduledarrhour|scheduleddephour|longhaulflight|  route|flightdate|PrevDest|PrevArrDelay|IsConnected|EffectivePrevArrDelay|CumulativeDepDelay|
+-------+-------------+---------+----------+--------+----------+-------+----------+--------+----+---------+--------------+-------+-----+------+----+----------+------+----------------+----------------+--------------+-------+----------+--------+------------+-----------+---------------------+------------------+
|   1852|           WN|     2696|      2125|      27|        31| N210W

In [19]:
# Show relevant columns to verify
df_cascade.select(
    "TailNum", "FlightDate", "FlightNum", "Origin", "Dest", 
    "DepTime", "crsdeptime", "ArrDelay", "DepDelay", "PrevDest", "IsConnected", 
    "EffectivePrevArrDelay", "CumulativeDepDelay"
).show(20, truncate=False)

+-------+----------+---------+------+----+-------+----------+--------+--------+--------+-----------+---------------------+------------------+
|TailNum|FlightDate|FlightNum|Origin|Dest|DepTime|crsdeptime|ArrDelay|DepDelay|PrevDest|IsConnected|EffectivePrevArrDelay|CumulativeDepDelay|
+-------+----------+---------+------+----+-------+----------+--------+--------+--------+-----------+---------------------+------------------+
|       |2003-03-01|247      |LGB   |LAS |1428   |1430      |6       |-2      |null    |null       |0                    |-2                |
|       |2003-03-01|252      |LAS   |LGB |1611   |1610      |-5      |1       |LAS     |1          |6                    |5                 |
|       |2003-03-02|247      |LGB   |LAS |1428   |1430      |-2      |-2      |LGB     |1          |-5                   |-2                |
|       |2003-03-02|252      |LAS   |LGB |1607   |1610      |-11     |-3      |LAS     |1          |-2                   |-7                |
|     

*Saving df_cascade*

In [19]:
df_cascade.write.mode("overwrite").parquet(s3 + "df-cascade/")




In [21]:
# df_cascade = df

# # Define a window partitioned by 'TailNum' and ordered by 'FlightDate' and 'DepTime'
# window_spec = Window.partitionBy("TailNum").orderBy("FlightDate", "DepTime")

# # Use 'lag' to get the previous flight's destination and arrival delay for the same aircraft
# df_cascade = df_cascade.withColumn(
#     "PrevDest", lag("Dest").over(window_spec)
# ).withColumn(
#     "PrevArrDelay", lag("ArrDelay").over(window_spec)
# )

# # Check if the current flight’s origin matches the previous flight's destination (same aircraft)
# df_cascade = df_cascade.withColumn(
#     "IsConnected", (col("Origin") == col("PrevDest")).cast("int")
# )

# # Calculate cumulative departure delay across connected flights for the same aircraft
# df_cascade = df_cascade.withColumn(
#     "CumulativeDepDelay",
#     sum(col("DepDelay") + col("PrevArrDelay") * col("IsConnected")).over(window_spec)
# )

# # Show relevant columns to verify the results
# df_cascade.select(
#     "TailNum", "FlightDate", "FlightNum", "Origin", "Dest", 
#     "DepTime", "ArrDelay", "PrevDest", "IsConnected", "CumulativeDepDelay"
# ).show(10)


+-------+----------+---------+------+----+-------+--------+--------+-----------+------------------+
|TailNum|FlightDate|FlightNum|Origin|Dest|DepTime|ArrDelay|PrevDest|IsConnected|CumulativeDepDelay|
+-------+----------+---------+------+----+-------+--------+--------+-----------+------------------+
| -N123D|2001-01-28|      547|   ATL| LAS|   2146|      -2|    null|       null|              null|
| -N123D|2001-01-29|      352|   LAS| DFW|    717|       5|     LAS|          1|               0.0|
| -N123D|2001-01-29|     1066|   DFW| ATL|   1300|       8|     DFW|          1|               5.0|
| -N123D|2001-01-29|      301|   ATL| DEN|   1759|     -21|     ATL|          1|               7.0|
| -N123D|2001-01-30|     1275|   DEN| SLC|    559|      -3|     DEN|          1|             -15.0|
| -N123D|2001-01-30|      306|   SLC| DFW|    903|      33|     SLC|          1|               0.0|
| -N123D|2001-01-30|      306|   DFW| MCO|   1330|     -10|     DFW|          1|              38.0|


#### **Data aggregation**

In [18]:
# # DATA AGGREGATION FOR CASCADING FLIGHT DELAY PATTERN

# # Group by destination airport to get cumulative delay patterns by each destination
# df_cumulative_dest = (df_cascade.groupBy("Dest").agg(
#     F.avg("CumulativeDepDelay").alias("AvgCumulativeDepDelay"),
#     F.max("CumulativeDepDelay").alias("MaxCumulativeDepDelay"),
#     F.min("CumulativeDepDelay").alias("MinCumulativeDepDelay")
# ).orderBy(F.desc("AvgCumulativeDepDelay")))

# # Show results to analyze delay patterns by destination
# df_cumulative_dest.show(20, truncate=False)


+----+---------------------+---------------------+---------------------+
|Dest|AvgCumulativeDepDelay|MaxCumulativeDepDelay|MinCumulativeDepDelay|
+----+---------------------+---------------------+---------------------+
|ROP |1.4004172382088414E8 |175018796            |58199339             |
|ANI |1.3992183489473686E8 |241671908            |10                   |
|GUM |1.1345049192883277E8 |242883647            |-30                  |
|SPN |1.1069340587178302E8 |191088217            |2824                 |
|HVN |1.1050559357592659E8 |243253899            |-1463                |
|DET |1.0902465258956784E8 |200314328            |28937833             |
|ROR |9.783379848190789E7  |191067206            |4833                 |
|KSM |9.613422624680851E7  |243211423            |13                   |
|PUB |9.230639895828332E7  |160109372            |15334                |
|TTN |8.610923477876823E7  |171026484            |37601                |
|ORH |8.570923068662181E7  |201245008            |-

In [19]:
# # Get average arrival delay by airline by the day
# agg_df1 = df.groupBy("UniqueCarrier", "DayOfWeek").agg(avg("ArrDelay").alias("AvgDelayByDay")).orderBy("UniqueCarrier", "DayOfWeek")
# agg_df1.show(10)

+-------------+---------+------------------+
|UniqueCarrier|DayOfWeek|     AvgDelayByDay|
+-------------+---------+------------------+
|           9E|        1|6.5996673953569465|
|           9E|        2| 5.907975790181574|
|           9E|        3| 5.535135641787847|
|           9E|        4| 5.876909254267745|
|           9E|        5| 7.497764753986643|
|           9E|        6| 4.284158904447597|
|           9E|        7| 6.893415750783495|
|           AA|        1| 6.650914113010778|
|           AA|        2| 5.736685253741351|
|           AA|        3|7.0400702413025265|
+-------------+---------+------------------+
only showing top 10 rows


In [22]:
# # Get average arrival delay by day
# agg_df3 = df.groupBy("DayOfWeek").agg(avg("ArrDelay").alias("TotalAveDelayByDay")).orderBy("DayOfWeek")
# agg_df3.show()

+---------+------------------+
|DayOfWeek|TotalAveDelayByDay|
+---------+------------------+
|        1|6.6695148496840915|
|        2| 5.960420592346179|
|        3| 7.091501517465088|
|        4| 8.945046639782811|
|        5| 9.606953425895007|
|        6| 4.187418958786977|
|        7| 6.525039865667659|
+---------+------------------+


In [20]:
# # Get average arrival delay by airline by the scheduled hour
# agg_df2 = df.groupBy("UniqueCarrier", "ScheduledArrHour").agg(avg("ArrDelay").alias("AvgDelayExpected"))
# agg_df2.show(10)

+-------------+----------------+------------------+
|UniqueCarrier|ScheduledArrHour|  AvgDelayExpected|
+-------------+----------------+------------------+
|           US|              13| 3.325344176858094|
|           DL|              20|12.102978945397147|
|           DL|              18| 10.58250863107307|
|           MQ|              13| 6.005755348363015|
|           HP|               7| 4.661067433242129|
|           HA|               9|-3.846256176358799|
|           UA|              20|13.063968389452409|
|           NW|               9|2.7916624595089496|
|           HP|              21| 9.636104150332324|
|           XE|              22|14.366321744799881|
+-------------+----------------+------------------+
only showing top 10 rows


In [23]:
# # Get average arrival delay time by airline by the type of flight to see if long-haul flights are more prone to delays
# agg_df4 = df.groupBy("UniqueCarrier", "LongHaulFlight").agg(avg("ArrDelay").alias("FlightTypeAvgDelay")).orderBy("UniqueCarrier", "LongHaulFlight")
# agg_df4.show(10)

+-------------+--------------+------------------+
|UniqueCarrier|LongHaulFlight|FlightTypeAvgDelay|
+-------------+--------------+------------------+
|           9E|             0|6.1082282144188005|
|           AA|             0|6.8367769208293945|
|           AA|             1|3.5920859601126045|
|           AQ|             0|1.2102615827681853|
|           AQ|             1|-8.778739184177997|
|           AS|             0| 8.416069574061208|
|           AS|             1| 5.457909292035398|
|           B6|             0|10.012434828260803|
|           B6|             1| 4.915056849541892|
|           CO|             0| 6.952947293399876|
+-------------+--------------+------------------+
only showing top 10 rows


In [25]:
# # Get average arrival delay by the time of the year
# agg_df5 = df.groupBy("Season").agg(avg("ArrDelay").alias("AveDelaySeason")).orderBy("Season")
# agg_df5.show()

+----------+-----------------+
|TimeOfYear|   AveDelaySeason|
+----------+-----------------+
|        Q1|7.847639106816504|
|        Q2|6.812042901816674|
|        Q3| 6.74891812829479|
|        Q4|  6.8239078570275|
+----------+-----------------+


In [31]:
# # Get average arrival delay by the destination to detect airports prone to delays
# agg_df6 = df.groupBy("Dest").agg(avg("ArrDelay").alias("AvgAirportDelay")).orderBy(desc("AvgAirportDelay"))
# agg_df6.show(10)

+----+------------------+
|Dest|   AvgAirportDelay|
+----+------------------+
| PVU|              28.0|
| OTH| 26.79233870967742|
| SOP|25.598705501618124|
| HHH|23.731300345224398|
| MQT| 23.21644234267006|
| ACK|22.587155963302752|
| YAP| 16.89442815249267|
| MCN|16.212311199772014|
| LMT| 16.11978221415608|
| MAZ| 14.69047619047619|
+----+------------------+
only showing top 10 rows


In [32]:
# # Get average arrival delay by Route
# agg_df7 = df.groupBy("Route").agg(avg("ArrDelay").alias("AvgRouteArrDelay"))
# agg_df7.show(10)

+-------+------------------+
|  Route|  AvgRouteArrDelay|
+-------+------------------+
|MSP-EWR| 13.15557346844772|
|SLC-MSP| 3.801438509609716|
|PIT-STL| 6.061640358722715|
|PIT-CLT|  7.26124373618849|
|MSP-GTF| 6.441260133693643|
|DTW-MKE|10.268531812774336|
|TPA-BWI| 6.111973392461198|
|SLC-RAP|3.5848762221759936|
|CLT-EWR|14.251507192335433|
|PIT-SYR| 7.075739493649305|
+-------+------------------+
only showing top 10 rows


### **End**

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
# dyf = glueContext.create_dynamic_frame.from_catalog(database='database_name', table_name='table_name')
# dyf.printSchema()

#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [None]:
# df = dyf.toDF()
# df.show()

#### Example: Visualize data with matplotlib


In [None]:
# import matplotlib.pyplot as plt

# # Set X-axis and Y-axis values
# x = [5, 2, 8, 4, 9]
# y = [10, 4, 8, 5, 2]
  
# # Create a bar chart 
# plt.bar(x, y)
  
# # Show the plot
# %matplot plt

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# s3output = glueContext.getSink(
#   path="s3://bucket_name/folder_name",
#   connection_type="s3",
#   updateBehavior="UPDATE_IN_DATABASE",
#   partitionKeys=[],
#   compression="snappy",
#   enableUpdateCatalog=True,
#   transformation_ctx="s3output",
# )
# s3output.setCatalogInfo(
#   catalogDatabase="demo", catalogTableName="populations"
# )
# s3output.setFormat("glueparquet")
# s3output.writeFrame(DyF)