# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


####  Run this cell to set up and start your interactive session.


In [1]:
# Glue interactive-session configuration; these magics apply when the session is created.
# Idle timeout is expressed in minutes (2880 minutes = 48 hours).
# NOTE(review): the saved output of this cell reports 2 workers, so it predates
# the value of 5 requested below -- re-run the cell to refresh the output.
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the running SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job handle bound to this GlueContext.
job = Job(glueContext)

Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 2880
Session ID: aec64ff9-c1c9-48e7-af83-e3da3443f0c9
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session aec64ff9-c1c9-48e7-af83-e3da3443f0c9 to get into ready status...
Session aec64ff9-c1c9-48e7-af83-e3da3443f0c9 has been created.



#### Start


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, avg, when, floor, trim, concat, lit, desc, lag, sum, to_date, concat_ws, date_format
from pyspark.sql import functions as F




In [3]:
from awsglue.dynamicframe import DynamicFrame

# Obtain the active SparkContext and wrap it in a GlueContext.
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)

# S3 location of the Parquet input.
parquet_path = "s3://airtime-historical-data/clean/pipeline-1/"

# Load the Parquet data into a Spark DataFrame.
df = glue_context.spark_session.read.format("parquet").load(parquet_path)

# Preview the first rows as a sanity check that the load worked.
df.show()


+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+
|deptime|depdelay|carrierdelay|dayofmonth|weatherdelay|arrdelay|nasdelay|year|lateaircraftdelay|securitydelay|dayofweek|month|origin|dest|
+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+
|   1528|      13|        null|        22|        null|       4|    null|1995|             null|         null|        6|    7|   GSP| ATL|
|   1527|      -4|           0|         2|           0|       5|       0|2005|                0|            0|        7|    1|   ALB| ATL|
|   1633|      63|        null|        19|        null|      71|    null|2001|             null|         null|        6|    5|   LGA| ATL|
|   1740|      10|          10|         2|           0|      37|      27|2004|                0|            0|        1|    8|   ATL| DCA|
|   1718|       9|        n

#### Create Season column based on month

In [4]:
# Derive "Season" from the month number:
#   12, 1, 2 -> Winter | 3-5 -> Spring | 6-8 -> Summer | 9-11 -> Fall
# There is no fallback branch, so a month matching none of these leaves Season null.
df = df.withColumn(
    "Season",
    F.when(
        (F.col("month") == 12) | (F.col("month") == 1) | (F.col("month") == 2),
        "Winter",
    )
    .when(F.col("month").between(3, 5), "Spring")
    .when(F.col("month").between(6, 8), "Summer")
    .when(F.col("month").between(9, 11), "Fall"),
)

# Preview the frame to confirm the new column.
df.show()


+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+------+
|deptime|depdelay|carrierdelay|dayofmonth|weatherdelay|arrdelay|nasdelay|year|lateaircraftdelay|securitydelay|dayofweek|month|origin|dest|Season|
+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+------+
|   1528|      13|        null|        22|        null|       4|    null|1995|             null|         null|        6|    7|   GSP| ATL|Summer|
|   1527|      -4|           0|         2|           0|       5|       0|2005|                0|            0|        7|    1|   ALB| ATL|Winter|
|   1633|      63|        null|        19|        null|      71|    null|2001|             null|         null|        6|    5|   LGA| ATL|Spring|
|   1740|      10|          10|         2|           0|      37|      27|2004|                0|            0|        1|    

#### Create Total Delay Column based on arrival + departure delay

In [5]:
# Add 'totaldelay' as the sum of the departure and arrival delays.
df = df.withColumn("totaldelay", df["depdelay"] + df["arrdelay"])

# Preview the frame to confirm the new column.
df.show()

+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+------+----------+
|deptime|depdelay|carrierdelay|dayofmonth|weatherdelay|arrdelay|nasdelay|year|lateaircraftdelay|securitydelay|dayofweek|month|origin|dest|Season|totaldelay|
+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+------+----------+
|   1528|      13|        null|        22|        null|       4|    null|1995|             null|         null|        6|    7|   GSP| ATL|Summer|        17|
|   1527|      -4|           0|         2|           0|       5|       0|2005|                0|            0|        7|    1|   ALB| ATL|Winter|         1|
|   1633|      63|        null|        19|        null|      71|    null|2001|             null|         null|        6|    5|   LGA| ATL|Spring|       134|
|   1740|      10|          10|         2|           0|   

#### Create Severity column based on total delay

In [6]:
# Bucket each flight by its combined delay ('totaldelay').
#
# Branches are evaluated top to bottom and the first match wins. Previously the
# first branch was `totaldelay < 60`, which also matched zero and negative
# values, so on-time/early flights were labelled "Minimal" and the "No Delay"
# fallback was only reachable when totaldelay was null. The explicit `<= 0`
# branch below makes "No Delay" apply to flights that were not delayed.
#
#   totaldelay <= 0          -> No Delay
#   0   < totaldelay < 60    -> Minimal
#   60  <= totaldelay < 120  -> Moderate
#   120 <= totaldelay < 180  -> Major
#   180 <= totaldelay <= 360 -> Significant
#   totaldelay > 360         -> Severe
#
# NOTE(review): a null totaldelay matches no branch and still falls through to
# "No Delay" (unchanged from before) -- confirm that is the desired label for
# rows with missing delay data.
df = df.withColumn(
    "Severity",
    F.when(df["totaldelay"] <= 0, "No Delay")
     .when(df["totaldelay"] < 60, "Minimal")
     .when((df["totaldelay"] >= 60) & (df["totaldelay"] < 120), "Moderate")
     .when((df["totaldelay"] >= 120) & (df["totaldelay"] < 180), "Major")
     .when((df["totaldelay"] >= 180) & (df["totaldelay"] <= 360), "Significant")
     .when(df["totaldelay"] > 360, "Severe")
     .otherwise("No Delay")
)

# Show the DataFrame to verify the new column
df.show()

+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+------+----------+--------+
|deptime|depdelay|carrierdelay|dayofmonth|weatherdelay|arrdelay|nasdelay|year|lateaircraftdelay|securitydelay|dayofweek|month|origin|dest|Season|totaldelay|Severity|
+-------+--------+------------+----------+------------+--------+--------+----+-----------------+-------------+---------+-----+------+----+------+----------+--------+
|   1528|      13|        null|        22|        null|       4|    null|1995|             null|         null|        6|    7|   GSP| ATL|Summer|        17| Minimal|
|   1527|      -4|           0|         2|           0|       5|       0|2005|                0|            0|        7|    1|   ALB| ATL|Winter|         1| Minimal|
|   1633|      63|        null|        19|        null|      71|    null|2001|             null|         null|        6|    5|   LGA| ATL|Spring|       134|   Major|
|   

#### Output location

In [7]:
# Base S3 prefix for this notebook's output; the save cell appends "df/" to it.
s3 = "s3://pipeline1-data-storage/processed-data/"




#### Saving df

In [8]:
# Persist the enriched frame as Parquet, replacing any existing output at this prefix.
df.write.parquet(s3 + "df/", mode="overwrite")


