In [1]:
# Importing Spark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max, avg, count
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Importing Python Data Stat. libraries
import pandas as pd
import matplotlib.pyplot as plt

# For error handling and utilities 
import json
import requests
import urllib.request
import sys
import os

# TODO: 1 check warnings 
# TODO: 2 add corupted json handling in to function or make corupted json function 
# TODO: 3 re-write reading json via Spark as function, double check logic.for controls
# TODO: 4 harmonize error messages (numbers and explanation for readme) and code descriptions
# TODO: 5 Verification of data 
# TODO: 6 Handling of missing values 

# NOTE: 1 reading more complex file might be necessary to pre process json

In [2]:
# Spark builder (check warnings - port 4041)
spark = SparkSession.builder.appName("AirportFlightDataViz").getOrCreate()

24/10/11 22:57:49 WARN Utils: Your hostname, codespaces-358acb resolves to a loopback address: 127.0.0.1; using 10.0.2.21 instead (on interface eth0)
24/10/11 22:57:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/11 22:57:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
url_adsb = "https://raw.githubusercontent.com/JM-AE/Airport-Flight-Data/refs/heads/main/adsb.json"
response_adsb = requests.get(url_adsb)

# Check if file is possible to load using py.json
if response_adsb.status_code == 200:
    data_adsb = response_adsb.json()
    print("adsb.json data successfully loaded")
else: print(f"Failed to load adsb.json Status code:{response_adsb.status_code}")

# Print sample of .json file
for entry_adsb in data_adsb[:1]:
    print("data_adsb",entry_adsb)

# Request save as tmp file 
urllib.request.urlretrieve(url_adsb,"/tmp/file_absd.json")

# Check if file exist on correct address
if os.path.exists("/tmp/file_absd.json"):
    print("tmp file_absd.json exist")
else: 
    print("tmp file_absd.json do not exist")

# Check if .joson valid or not
with open("/tmp/file_absd.json","r") as f:
    try:
       data_adsb = json.load(f)
       print("absd.json is valid")
    except json.JSONDecodeError as e:
        print(f"Invalid absd.json structure: {e}") 

# Load in to Spark DF with check for error     
try:
    df_absd =spark.read.json("/tmp/file_absd.json")
    df_absd.show(3)
except Exception as t:
    print(f"Error loading DataFrame in single line option")
    print(str(t))
    print(f"Attempt to load DataFrame in multi line option")
    try: 
        df_absd =spark.read.option("multiline", "true").json("/tmp/file_absd.json")
        print(f"Dataframe Loaded succesfully")
    except Exception as w:
        print(f"Error loading DataFrame in multi line option:{str(w)}")

df_absd.show(1)

adsb.json data successfully loaded
data_adsb {'AircraftId': '400960', 'Latitude': 10.81889, 'Longitude': 106.65194, 'Track': 30, 'Altitude': 0, 'Speed': 0, 'Squawk': 7713, 'Type': 'A320', 'Registration': 'G-TTOE', 'LastUpdate': 1696278420, 'Origin': 'SGN', 'Destination': 'ICN', 'Flight': 'BA484', 'Onground': 1, 'Vspeed': 0, 'Callsign': 'BAW476C', 'SourceType': 'ADS-B FR24 receivers', 'ETA': 0}
tmp file_absd.json exist
absd.json is valid


                                                                                

Error loading DataFrame in single line option
Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).csv(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().
Attempt to load DataFrame in multi line option
Dataframe Loaded succesfully
+----------+--------+--------+-----------+---+------+----------+--------+---------+--------+------+-------+------------+--------------------+-----+------+-----+----+------+
|AircraftId|Altitude|Callsign|Destination|ETA|Flight|LastUpdate|Latitude|Longitude|Onground|Origin|RadarId|Registration|          SourceType|Speed|Squawk

24/10/11 22:58:08 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
avg_speed_per_airport_origin = df_absd.groupBy("Origin").agg(F.avg("Speed").alias("AverageSpeed"))
avg_speed_per_airport_origin.show(truncate=False)
                                                      

[Stage 3:>                                                          (0 + 1) / 1]

+------+------------+
|Origin|AverageSpeed|
+------+------------+
|GUA   |170.0       |
|DOH   |250.0       |
|SGN   |234.2       |
|IAD   |170.0       |
+------+------------+



                                                                                

In [5]:
avg_speed_per_airport_dest = df_absd.groupBy("Destination").agg(F.avg("Speed").alias("AverageSpeed"))
avg_speed_per_airport_dest.show(truncate=False)

+-----------+------------+
|Destination|AverageSpeed|
+-----------+------------+
|BED        |170.0       |
|SYZ        |250.0       |
|ICN        |234.2       |
|MIA        |170.0       |
+-----------+------------+



In [6]:
avg_speed_per_aircraft = df_absd.groupBy("Type").agg(F.avg("Speed").alias("AverageSpeed"))
avg_speed_per_aircraft.show(truncate=False)

+----+------------------+
|Type|AverageSpeed      |
+----+------------------+
|A320|241.22222222222223|
|B738|170.0             |
|E545|170.0             |
+----+------------------+



In [7]:
window_spec = Window.partitionBy("Flight").orderBy(F.desc("LastUpdate")) 


In [10]:
df_with_rn=df_absd.withColumn("row_number",F.row_number().over(window_spec))

In [12]:
latest_df = df_with_rn.filter(F.col("row_number")==1).select("Flight","Speed","LastUpdate")
latest_df.show()

+------+-----+----------+
|Flight|Speed|LastUpdate|
+------+-----+----------+
|AAL476|    0|1696350960|
| BA484|    0|1696290420|
|LXJ476|    0|1696350320|
| QR476|    0|1696288335|
+------+-----+----------+



In [23]:
#Re doo 
better_df = df_with_rn.filter(F.col("row_number")==1)
better_df = better_df.withColumn("Speed",F.when(F.col("OnGround") == False, 1).otherwise(F.col("Speed"))).select("Flight","Speed","LastUpdate","OnGround")
better_df.show()

+------+-----+----------+--------+
|Flight|Speed|LastUpdate|OnGround|
+------+-----+----------+--------+
|AAL476|    0|1696350960|       1|
| BA484|    0|1696290420|       1|
|LXJ476|    0|1696350320|       1|
| QR476|    0|1696288335|       1|
+------+-----+----------+--------+



In [13]:
url_adsb_multi = "https://raw.githubusercontent.com/JM-AE/Airport-Flight-Data/refs/heads/main/adsb_multi_aircraft.json"
response_adsb_multi = requests.get(url_adsb_multi)

if response_adsb_multi.status_code == 200:
    data_adsb_multi = response_adsb_multi.json()
    print("adsb_multi.json data successfully loaded")
else: print(f"Failed to load adsb_multi.json Status code:{response_adsb_multi.status_code}")

urllib.request.urlretrieve(url_adsb_multi,"/tmp/file_absd_multi.json")
df_absd_multi =spark.read.json("/tmp/file_absd_multi.json")

for entry_adsb_multi in data_adsb_multi[:3]:
    print("data_adsb_multi",entry_adsb_multi)

adsb_multi.json data successfully loaded
