In [33]:
import requests

from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, DoubleType, BooleanType, ArrayType
)
import os
from dotenv import load_dotenv
# Load variables from a .env file (if present) into the process environment
load_dotenv()

# Interpreter path handed to PySpark through PYSPARK_PYTHON
PYTHONDIRECTORY = os.getenv("PYTHONDIRECTORY")
if not PYTHONDIRECTORY:
    # os.environ only accepts strings: assigning the None returned by os.getenv
    # for a missing variable raises an opaque TypeError, so fail here with an
    # actionable message instead.
    raise RuntimeError(
        "PYTHONDIRECTORY is not set; define it in the environment or in a .env "
        "file so PYSPARK_PYTHON can be configured."
    )
os.environ["PYSPARK_PYTHON"] = PYTHONDIRECTORY

In [34]:
# Create (or reuse) the Spark session for this notebook, pointing Spark at the
# Python interpreter configured in PYTHONDIRECTORY
spark = (
    SparkSession.builder
    .appName("OpenSkyAPI")
    .config("spark.pyspark.python", PYTHONDIRECTORY)
    .getOrCreate()
)

In [35]:
# URL for the OpenSky API endpoint
url = "https://opensky-network.org/api/states/all"

# Request data from the API. An explicit timeout keeps the cell from hanging
# indefinitely when the service is unreachable (requests applies no timeout
# unless one is given).
response = requests.get(url, timeout=30)
# Stop on HTTP error responses instead of trying to parse an error body as data
response.raise_for_status()
data = response.json()

# Extract the list of flight states. `or []` also covers a JSON null under
# "states", which `.get('states', [])` would hand back as None and break the
# row conversion that iterates over it.
states = data.get('states') or []

In [36]:
# Coerce the numeric fields that the schema declares as DoubleType to float
def convert_row(row):
    """Return a copy of ``row`` with the double-typed positions cast to float.

    The positions are longitude (5), latitude (6), baro_altitude (7),
    velocity (9), true_track (10), vertical_rate (11) and geo_altitude (13).
    ``None`` entries are kept as ``None``; every other position is copied
    unchanged. The input sequence itself is not modified.

    Args:
        row: One state vector as an indexable sequence with at least 14 items.

    Returns:
        A new list holding the converted values.

    Raises:
        IndexError: If ``row`` has fewer than 14 items.
    """
    converted = list(row)
    double_positions = (5, 6, 7, 9, 10, 11, 13)
    for position in double_positions:
        if converted[position] is None:
            # Missing readings stay None so they remain nulls downstream
            continue
        converted[position] = float(converted[position])
    return converted

# Run every state vector through convert_row, producing a fresh list of rows
states = list(map(convert_row, states))

In [37]:
# Explicit schema for the state vectors: one nullable column per field, listed
# in the same positional order as the values inside each row
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("icao24", StringType()),
        ("callsign", StringType()),
        ("origin_country", StringType()),
        ("time_position", LongType()),
        ("last_contact", LongType()),
        ("longitude", DoubleType()),
        ("latitude", DoubleType()),
        ("baro_altitude", DoubleType()),
        ("on_ground", BooleanType()),
        ("velocity", DoubleType()),
        ("true_track", DoubleType()),
        ("vertical_rate", DoubleType()),
        ("sensors", ArrayType(LongType())),
        ("geo_altitude", DoubleType()),
        ("squawk", StringType()),
        ("spi", BooleanType()),
        ("position_source", LongType()),
    ]
])

In [38]:
# Build the Spark DataFrame from the converted rows, applying the explicit
# schema rather than letting Spark infer column types
df = spark.createDataFrame(data=states, schema=schema)

In [39]:
# Display the DataFrame: show() prints a tabular preview of the leading rows
# (20 by default), so the full data set is not dumped into the notebook
df.show()


+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|icao24|callsign|origin_country|time_position|last_contact|longitude|latitude|baro_altitude|on_ground|velocity|true_track|vertical_rate|sensors|geo_altitude|squawk|  spi|position_source|
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|4b1816|SWR252B |   Switzerland|   1742821770|  1742821770|  21.2923| 43.5866|      11277.6|    false|   248.4|    118.58|          0.0|   NULL|    11475.72|  3006|false|              0|
|4b1818|SWR8GD  |   Switzerland|   1742821769|  1742821770|   8.2754| 47.4984|      2331.72|    false|  125.59|     18.88|        -8.45|   NULL|      2400.3|  5357|false|              0|
|ab1644|UAL1108 | United States|   1742821760|  1742821760| -77.4

In [40]:
# Total number of rows (state vectors) in the DataFrame
df.count()

8934

In [41]:
# Summary statistics per column; the "count" row reports non-null values, so
# columns whose count is below the row total contain nulls
df.describe().show()

+-------+--------+--------+--------------+--------------------+--------------------+-------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+---------------+
|summary|  icao24|callsign|origin_country|       time_position|        last_contact|          longitude|          latitude|    baro_altitude|          velocity|        true_track|      vertical_rate|     geo_altitude|            squawk|position_source|
+-------+--------+--------+--------------+--------------------+--------------------+-------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+---------------+
|  count|    8934|    8934|          8934|                8877|                8934|               8877|              8877|             8092|              8934|              8934|               8108|             8019|              6554|     

In [42]:
# Look up a single aircraft by its icao24 identifier
df.filter(df["icao24"] == "4b1818").show()

+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|icao24|callsign|origin_country|time_position|last_contact|longitude|latitude|baro_altitude|on_ground|velocity|true_track|vertical_rate|sensors|geo_altitude|squawk|  spi|position_source|
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+
|4b1818|SWR8GD  |   Switzerland|   1742821769|  1742821770|   8.2754| 47.4984|      2331.72|    false|  125.59|     18.88|        -8.45|   NULL|      2400.3|  5357|false|              0|
+------+--------+--------------+-------------+------------+---------+--------+-------------+---------+--------+----------+-------------+-------+------------+------+-----+---------------+



In [43]:
# Sort ascending by velocity and take the last 10 rows, i.e. the ten highest
# velocities, returned to the driver as a list of Row objects
df.sort(df["velocity"]).tail(10)

[Row(icao24='3c670c', callsign='DLH742  ', origin_country='Germany', time_position=1742821767, last_contact=1742821768, longitude=30.819, latitude=42.6913, baro_altitude=11277.6, on_ground=False, velocity=302.16, true_track=106.21, vertical_rate=0.33, sensors=None, geo_altitude=11506.2, squawk='5271', spi=False, position_source=0),
 Row(icao24='7608f9', callsign='PIA946  ', origin_country='Pakistan', time_position=1742821770, last_contact=1742821770, longitude=55.0736, latitude=26.4096, baro_altitude=11887.2, on_ground=False, velocity=302.84, true_track=64.65, vertical_rate=0.33, sensors=None, geo_altitude=12412.98, squawk=None, spi=False, position_source=0),
 Row(icao24='899104', callsign='CAL5234 ', origin_country='Taiwan', time_position=1742821769, last_contact=1742821770, longitude=-113.1797, latitude=52.8979, baro_altitude=10058.4, on_ground=False, velocity=306.86, true_track=125.1, vertical_rate=0.0, sensors=None, geo_altitude=9852.66, squawk=None, spi=False, position_source=0),
