In [None]:
import os
import pandas as pd
import numpy as np


spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [1 InRelease 4,035 B/88.7                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [76.0 kB]
Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:9 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease [15.9 kB]
Ign:10 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:

In [None]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-01-24 21:54:25--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-01-24 21:54:25 (5.36 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session, putting the downloaded Postgres JDBC jar
# on the driver classpath so the JDBC write at the end of the notebook can load it.
spark = (
    SparkSession.builder
    .appName("Final-Project")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [None]:
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import lpad
from pyspark.sql.functions import concat
from pyspark.sql.functions import lit
from pyspark.sql.functions import substring

In [None]:
from pyspark import SparkFiles
# Register the remote 2018 flights CSV with Spark, then read the copy that
# SparkFiles resolves for it. The first row is used as the header.
url = "https://finalprojectstorage10.s3.us-east-2.amazonaws.com/2018.csv"
spark.sparkContext.addFile(url)
df = (
    spark.read
    .options(encoding="UTF-8", header=True)
    .csv(SparkFiles.get("2018.csv"))
)
df.show()

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|   FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|

In [None]:
from pyspark.sql.functions import *

def padTimeStamp(x, y):
    """Combine a date string and an HHMM-style time value into "date HH:MM".

    Parameters
    ----------
    x : str or None
        Date portion, used verbatim (e.g. "2018-01-01").
    y : str or None
        Time of day written as up to four digits, optionally followed by a
        decimal part (e.g. "5", "5.0", "1530.0"). None is treated as "0",
        i.e. the result carries the time 00:00.

    Returns
    -------
    str or None
        x + " HH:MM", or None when x is None (instead of raising TypeError,
        which would abort the whole Spark job when run as a UDF).
    """
    if x is None:
        return None
    if y is None:
        y = "0"

    # Keep only the digits before the decimal point. A blanket
    # replace(".0", "") would also eat ".0" inside values such as "5.00"
    # and turn them into "50".
    hhmm = y.partition(".")[0].zfill(4)
    return x + " " + hhmm[0:2] + ":" + hhmm[2:4]

# Wrap padTimeStamp as a Spark UDF so it can be applied to DataFrame columns.
# No return type is given, so the UDF yields strings (the udf() default).
padTimeStampUDF = udf(padTimeStamp)

# Initial DataFrame: apply the UDF to turn each HHMM time column into a
# "date HH:MM" string, and carry the remaining columns through unchanged.
# Columns are selected by name (not by position) so the selection does not
# silently shift if the CSV column order changes; the trailing "Unnamed: 27"
# column is intentionally dropped.
initial_df = df.select(
    "OP_CARRIER",
    "OP_CARRIER_FL_NUM",
    "ORIGIN",
    padTimeStampUDF(df["FL_DATE"], df["CRS_DEP_TIME"]).alias("CRS_DEPARTURE_TIMESTAMP"),
    "FL_DATE",
    "DEST",
    padTimeStampUDF(df["FL_DATE"], df["DEP_TIME"]).alias("ACTUAL_DEPARTURE_TIMESTAMP"),
    "DEP_DELAY",
    "TAXI_OUT",
    padTimeStampUDF(df["FL_DATE"], df["WHEELS_OFF"]).alias("WHEELS_OFF_TIMESTAMP"),
    padTimeStampUDF(df["FL_DATE"], df["WHEELS_ON"]).alias("WHEELS_ON_TIMESTAMP"),
    "TAXI_IN",
    padTimeStampUDF(df["FL_DATE"], df["CRS_ARR_TIME"]).alias("CRS_ARRIVAL_TIMESTAMP"),
    padTimeStampUDF(df["FL_DATE"], df["ARR_TIME"]).alias("ACTUAL_ARRIVAL_TIMESTAMP"),
    "ARR_DELAY",
    "CANCELLED",
    "CANCELLATION_CODE",
    "DIVERTED",
    "CRS_ELAPSED_TIME",
    "ACTUAL_ELAPSED_TIME",
    "AIR_TIME",
    "DISTANCE",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
)

# Turn the DIVERTED / CANCELLED flags into booleans: the string '1.0' becomes
# True, anything else (including null) becomes False.
initial_df = (
    initial_df
    .withColumn("DIVERTED", when(col("DIVERTED") == '1.0', True).otherwise(False))
    .withColumn("CANCELLED", when(col("CANCELLED") == '1.0', True).otherwise(False))
)

# Final DataFrame: rename the columns to their target names and cast each one
# to its target type (timestamps, date, integers).
final_df = initial_df.selectExpr(
    "OP_CARRIER as AIRLINE_CARRIER_CODE",
    "OP_CARRIER_FL_NUM",
    "ORIGIN as ORIGIN_AIRPORT_CODE",
    "cast(CRS_DEPARTURE_TIMESTAMP as timestamp) CRS_DEPARTURE_TIMESTAMP",
    "cast(FL_DATE as date) FLIGHT_DT",
    "DEST as DEST_AIRPORT_CODE",
    "cast(ACTUAL_DEPARTURE_TIMESTAMP as timestamp) ACTUAL_DEPARTURE_TIMESTAMP",
    "cast(DEP_DELAY as integer) as DEPARTURE_DELAY_MINUTES",
    "cast(TAXI_OUT as integer) TAXI_OUT_MINUTES",
    "cast(WHEELS_OFF_TIMESTAMP as timestamp) WHEELS_OFF_TIMESTAMP",
    "cast(WHEELS_ON_TIMESTAMP as timestamp) WHEELS_ON_TIMESTAMP",
    "cast(TAXI_IN as integer) TAXI_IN_MINUTES",
    "cast(CRS_ARRIVAL_TIMESTAMP as timestamp) CRS_ARRIVAL_TIMESTAMP",
    # Source is the ACTUAL arrival column; it previously re-cast
    # CRS_ARRIVAL_TIMESTAMP, duplicating the scheduled arrival.
    "cast(ACTUAL_ARRIVAL_TIMESTAMP as timestamp) ACTUAL_ARRIVAL_TIMESTAMP",
    "cast(ARR_DELAY as integer) ARRIVAL_DELAY_MINUTES",
    "CANCELLED as CANCELLED_IND",
    "CANCELLATION_CODE",
    "DIVERTED as DIVERTED_IND",
    "cast(CRS_ELAPSED_TIME as integer) CRS_ELAPSED_TIME_MINUTES",
    "cast(ACTUAL_ELAPSED_TIME as integer) ACTUAL_ELAPSED_TIME_MINUTES",
    "cast(AIR_TIME as integer) AIR_TIME_MINUTES",
    "cast(DISTANCE as integer) DISTANCE_MILES",
    "cast(CARRIER_DELAY as integer) CARRIER_DELAY_MINUTES",
    "cast(WEATHER_DELAY as integer) WEATHER_DELAY_MINUTES",
    "cast(NAS_DELAY as integer) NAS_DELAY_MINUTES",
    "cast(SECURITY_DELAY as integer) SECURITY_DELAY_MINUTES",
    "cast(LATE_AIRCRAFT_DELAY as integer) LATE_AIRCRAFT_DELAY_MINUTES",
)
# Replace nulls with 0 in the five delay-cause minute columns.
final_df = final_df.fillna(
    0,
    subset=[
        "CARRIER_DELAY_MINUTES",
        "WEATHER_DELAY_MINUTES",
        "NAS_DELAY_MINUTES",
        "SECURITY_DELAY_MINUTES",
        "LATE_AIRCRAFT_DELAY_MINUTES",
    ],
)

final_df.show()

+--------------------+-----------------+-------------------+-----------------------+----------+-----------------+--------------------------+-----------------------+----------------+--------------------+-------------------+---------------+---------------------+------------------------+---------------------+-------------+-----------------+------------+------------------------+---------------------------+----------------+--------------+---------------------+---------------------+-----------------+----------------------+---------------------------+
|AIRLINE_CARRIER_CODE|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_CODE|CRS_DEPARTURE_TIMESTAMP| FLIGHT_DT|DEST_AIRPORT_CODE|ACTUAL_DEPARTURE_TIMESTAMP|DEPARTURE_DELAY_MINUTES|TAXI_OUT_MINUTES|WHEELS_OFF_TIMESTAMP|WHEELS_ON_TIMESTAMP|TAXI_IN_MINUTES|CRS_ARRIVAL_TIMESTAMP|ACTUAL_ARRIVAL_TIMESTAMP|ARRIVAL_DELAY_MINUTES|CANCELLED_IND|CANCELLATION_CODE|DIVERTED_IND|CRS_ELAPSED_TIME_MINUTES|ACTUAL_ELAPSED_TIME_MINUTES|AIR_TIME_MINUTES|DISTANCE_MILES|CARRIER_DELA

In [None]:
# Print the column names and data types of final_df.
final_df.printSchema()

root
 |-- AIRLINE_CARRIER_CODE: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN_AIRPORT_CODE: string (nullable = true)
 |-- CRS_DEPARTURE_TIMESTAMP: timestamp (nullable = true)
 |-- FLIGHT_DT: date (nullable = true)
 |-- DEST_AIRPORT_CODE: string (nullable = true)
 |-- ACTUAL_DEPARTURE_TIMESTAMP: timestamp (nullable = true)
 |-- DEPARTURE_DELAY_MINUTES: integer (nullable = true)
 |-- TAXI_OUT_MINUTES: integer (nullable = true)
 |-- WHEELS_OFF_TIMESTAMP: timestamp (nullable = true)
 |-- WHEELS_ON_TIMESTAMP: timestamp (nullable = true)
 |-- TAXI_IN_MINUTES: integer (nullable = true)
 |-- CRS_ARRIVAL_TIMESTAMP: timestamp (nullable = true)
 |-- ACTUAL_ARRIVAL_TIMESTAMP: timestamp (nullable = true)
 |-- ARRIVAL_DELAY_MINUTES: integer (nullable = true)
 |-- CANCELLED_IND: boolean (nullable = false)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED_IND: boolean (nullable = false)
 |-- CRS_ELAPSED_TIME_MINUTES: integer (nullable = true)
 |-- A

In [None]:
# Spot-check one flight: carrier YX, flight number 3624, scheduled to depart
# 2018-06-21 20:07. Show up to 100 matching rows.
is_target_flight = (
    (final_df["AIRLINE_CARRIER_CODE"] == "YX")
    & (final_df["OP_CARRIER_FL_NUM"] == "3624")
    & (final_df["CRS_DEPARTURE_TIMESTAMP"] == '2018-06-21 20:07:00')
)
final_df.filter(is_target_flight).show(100)

+--------------------+-----------------+-------------------+-----------------------+----------+-----------------+--------------------------+-----------------------+----------------+--------------------+-------------------+---------------+---------------------+------------------------+---------------------+-------------+-----------------+------------+------------------------+---------------------------+----------------+--------------+---------------------+---------------------+-----------------+----------------------+---------------------------+
|AIRLINE_CARRIER_CODE|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_CODE|CRS_DEPARTURE_TIMESTAMP| FLIGHT_DT|DEST_AIRPORT_CODE|ACTUAL_DEPARTURE_TIMESTAMP|DEPARTURE_DELAY_MINUTES|TAXI_OUT_MINUTES|WHEELS_OFF_TIMESTAMP|WHEELS_ON_TIMESTAMP|TAXI_IN_MINUTES|CRS_ARRIVAL_TIMESTAMP|ACTUAL_ARRIVAL_TIMESTAMP|ARRIVAL_DELAY_MINUTES|CANCELLED_IND|CANCELLATION_CODE|DIVERTED_IND|CRS_ELAPSED_TIME_MINUTES|ACTUAL_ELAPSED_TIME_MINUTES|AIR_TIME_MINUTES|DISTANCE_MILES|CARRIER_DELA

In [None]:
# Configure settings for RDS.
# Credentials are never stored in the notebook: the user and password come from
# the RDS_USER / RDS_PASSWORD environment variables, and the password is
# prompted for interactively when the variable is not set.
# NOTE: a password was previously committed in this cell -- rotate it.
from getpass import getpass

mode = "append"
jdbc_url = "jdbc:postgresql://airline-db.cjrwalaqox8l.us-east-2.rds.amazonaws.com/airline"
config = {
    "user": os.environ.get("RDS_USER", "postgres"),
    "password": os.environ.get("RDS_PASSWORD") or getpass("RDS password: "),
    "driver": "org.postgresql.Driver",
}


In [None]:
# Write final_df to the project.flight_data table in RDS over JDBC, using the
# save mode, URL and connection properties defined in the configuration cell.
final_df.write.mode(mode).jdbc(
    url=jdbc_url,
    table='project.flight_data',
    properties=config,
)