# Data Processing with Apache Spark


In [1]:
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-23" 
os.environ["SPARK_HOME"] = "C:/Users/janad/Downloads/spark-3.5.3-bin-hadoop3" 
os.environ["HADOOP_HOME"] = os.environ["SPARK_HOME"] 

In [3]:
spark = SparkSession.builder \
    .appName("Case study") \
    .master("local[*]") \
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow") \
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow") \
    .getOrCreate()

In [4]:
spark

In [5]:
df_adsb = spark.read.json('adsb.json', multiLine=True)
df_oag = spark.read.json('oag.json', multiLine=True)

In [6]:
df_adsb.show(3,truncate=True)

+----------+--------+--------+-----------+---+------+----------+--------+---------+--------+------+-------+------------+--------------------+-----+------+-----+----+------+
|AircraftId|Altitude|Callsign|Destination|ETA|Flight|LastUpdate|Latitude|Longitude|Onground|Origin|RadarId|Registration|          SourceType|Speed|Squawk|Track|Type|Vspeed|
+----------+--------+--------+-----------+---+------+----------+--------+---------+--------+------+-------+------------+--------------------+-----+------+-----+----+------+
|    400960|       0| BAW476C|        ICN|  0| BA484|1696278420|10.81889|106.65194|       1|   SGN|   NULL|      G-TTOE|ADS-B FR24 receivers|    0|  7713|   30|A320|     0|
|    400960|   10000| BAW476C|        ICN|  0| BA484|1696279020|    12.5|    109.1|       0|   SGN|   NULL|      G-TTOE|ADS-B FR24 receivers|  300|  7713|   45|A320|  1500|
|    400960|   30000| BAW476C|        ICN|  0| BA484|1696280020|    15.3|    113.5|       0|   SGN|   NULL|      G-TTOE|ADS-B FR24 rece

In [7]:
df_adsb.printSchema()

root
 |-- AircraftId: string (nullable = true)
 |-- Altitude: long (nullable = true)
 |-- Callsign: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- ETA: long (nullable = true)
 |-- Flight: string (nullable = true)
 |-- LastUpdate: long (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Onground: long (nullable = true)
 |-- Origin: string (nullable = true)
 |-- RadarId: string (nullable = true)
 |-- Registration: string (nullable = true)
 |-- SourceType: string (nullable = true)
 |-- Speed: long (nullable = true)
 |-- Squawk: long (nullable = true)
 |-- Track: long (nullable = true)
 |-- Type: string (nullable = true)
 |-- Vspeed: long (nullable = true)



In [8]:
df_oag.show(3, truncate=2)

+----+------+
|data|paging|
+----+------+
|  [{|    {1|
+----+------+



In [9]:
df_oag.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- aircraftType: struct (nullable = true)
 |    |    |    |-- iata: string (nullable = true)
 |    |    |    |-- icao: string (nullable = true)
 |    |    |-- arrival: struct (nullable = true)
 |    |    |    |-- airport: struct (nullable = true)
 |    |    |    |    |-- faa: string (nullable = true)
 |    |    |    |    |-- iata: string (nullable = true)
 |    |    |    |    |-- icao: string (nullable = true)
 |    |    |    |-- date: struct (nullable = true)
 |    |    |    |    |-- local: string (nullable = true)
 |    |    |    |    |-- utc: string (nullable = true)
 |    |    |    |-- terminal: string (nullable = true)
 |    |    |    |-- time: struct (nullable = true)
 |    |    |    |    |-- local: string (nullable = true)
 |    |    |    |    |-- utc: string (nullable = true)
 |    |    |-- carrier: struct (nullable = true)
 |    |    |    |-- iata: string (nullable = true)
 |   

In [10]:
from pyspark.sql.functions import explode_outer
df_flat=df_oag.withColumn("data_flat", explode_outer("data"))

In [11]:
df_flat.show()

+--------------------+--------------------+--------------------+
|                data|              paging|           data_flat|
+--------------------+--------------------+--------------------+
|[{{773, NULL}, {{...|{10, https://api....|{{773, NULL}, {{N...|
|[{{773, NULL}, {{...|{10, https://api....|{{320, NULL}, {{N...|
|[{{773, NULL}, {{...|{10, https://api....|{{NULL, B763}, {{...|
|[{{773, NULL}, {{...|{10, https://api....|{{738, NULL}, {{N...|
|[{{773, NULL}, {{...|{10, https://api....|{{320, NULL}, {{N...|
|[{{773, NULL}, {{...|{10, https://api....|{{320, NULL}, {{N...|
|[{{773, NULL}, {{...|{10, https://api....|{{NULL, E545}, {{...|
|[{{773, NULL}, {{...|{10, https://api....|{{NULL, B763}, {{...|
|[{{773, NULL}, {{...|{10, https://api....|{{7M8, NULL}, {{N...|
|[{{773, NULL}, {{...|{10, https://api....|{{73H, NULL}, {{N...|
+--------------------+--------------------+--------------------+



In [12]:
#df_flatten=df_flat.select("data_flat.*")

In [13]:
#df_flatten.show(1,truncate=True)

In [14]:
from pyspark.sql.functions import col

df_flatten = df_flat.select(
    col("data_flat.aircraftType.iata").alias("aircraftType_iata"),
    col("data_flat.aircraftType.icao").alias("aircraftType_icao"),
    col("data_flat.arrival.airport.faa").alias("arrival_airport_faa"),
    col("data_flat.arrival.airport.iata").alias("arrival_airport_iata"),
    col("data_flat.arrival.airport.icao").alias("arrival_airport_icao"),
    col("data_flat.arrival.date.local").alias("arrival_date_local"),
    col("data_flat.arrival.date.utc").alias("arrival_date_utc"),
    col("data_flat.arrival.terminal").alias("arrival_airport_terminal"),
    col("data_flat.arrival.time.local").alias("arrival_airport_time_local"),
    col("data_flat.arrival.time.utc").alias("arrival_airport_time_utc"),
    col("data_flat.carrier.iata").alias("carrier.iata"),
    col("data_flat.carrier.icao").alias("carrier.icao"),
    col("data_flat.codeshare.aircraftOwner.name").alias("airlineowner"),
    col("data_flat.departure.airport.faa").alias("departure_airport_faa"),
    col("data_flat.departure.airport.iata").alias("departure_airport_iata"),
    col("data_flat.departure.airport.icao").alias("departure_airport_icao"),
    col("data_flat.departure.date.local").alias("departure_date_local"),
    col("data_flat.departure.date.utc").alias("departure_date_utc"),
    col("data_flat.departure.terminal").alias("departure_airport_terminal"),
    col("data_flat.departure.time.local").alias("departure_airport_time_local"),
    col("data_flat.departure.time.utc").alias("departure_airport_time_utc"),
    col("data_flat.elapsedTime"),
    col("data_flat.flightNumber"),
    col("data_flat.flightType"),
    col("data_flat.segmentInfo.numberOfStops").alias("segmentInfo_numberOfStops"),
    col("data_flat.statusDetails.arrival.actualTerminal").getItem(0).alias("arrival_actualTerminal"),
    col("data_flat.statusDetails.arrival.actualTime.inGate.local").getItem(0).alias("arrival_inGate_local"),
    col("data_flat.statusDetails.arrival.actualTime.inGate.utc").getItem(0).alias("arrival_inGate_utc"),
    col("data_flat.statusDetails.arrival.actualTime.inGateTimeliness").getItem(0).alias("arrival_inGateTimeliness"),
    col("data_flat.statusDetails.arrival.actualTime.inGateVariation").getItem(0).alias("arrival_inGateVariation"),
    col("data_flat.statusDetails.arrival.actualTime.onGround.local").getItem(0).alias("arrival_onGround_local"),
    col("data_flat.statusDetails.arrival.actualTime.onGround.utc").getItem(0).alias("arrival_onGround_utc"),
    col("data_flat.statusDetails.arrival.airport.faa").getItem(0).alias("arrival_airport_faa"),
    col("data_flat.statusDetails.arrival.airport.iata").getItem(0).alias("arrival_airport_iata_status"),
    col("data_flat.statusDetails.arrival.airport.icao").getItem(0).alias("arrival_airport_icao_status"),
    col("data_flat.statusDetails.arrival.baggage").getItem(0).alias("arrival_baggage"),
    col("data_flat.statusDetails.arrival.estimatedTime.inGate.local").getItem(0).alias("arrival_estimatedTime_inGate_local"),
    col("data_flat.statusDetails.arrival.estimatedTime.inGate.utc").getItem(0).alias("arrival_estimatedTime_inGate_utc"),
    col("data_flat.statusDetails.arrival.estimatedTime.inGateTimeliness").getItem(0).alias("arrival_estimatedTime_inGateTimeliness"),
    col("data_flat.statusDetails.arrival.estimatedTime.inGateVariation").getItem(0).alias("arrival_estimatedTime_inGateVariation"),
    col("data_flat.statusDetails.arrival.estimatedTime.onGround.local").getItem(0).alias("arrival_estimatedTime_onGround_local"),
    col("data_flat.statusDetails.arrival.estimatedTime.onGround.utc").getItem(0).alias("arrival_estimatedTime_onGround_utc"),
    col("data_flat.statusDetails.arrival.gate").getItem(0).alias("arrival_gate"),
    col("data_flat.statusDetails.departure.actualTerminal").getItem(0).alias("departure_actualTerminal"),
    col("data_flat.statusDetails.departure.actualTime.offGround.local").getItem(0).alias("departure_offGround_local"),
    col("data_flat.statusDetails.departure.actualTime.offGround.utc").getItem(0).alias("departure_offGround_utc"),
    col("data_flat.statusDetails.departure.actualTime.outGate.local").getItem(0).alias("departure_outGate_local"),
    col("data_flat.statusDetails.departure.actualTime.outGate.utc").getItem(0).alias("departure_outGate_utc"),
    col("data_flat.statusDetails.departure.actualTime.outGateTimeliness").getItem(0).alias("departure_outGateTimeliness"),
    col("data_flat.statusDetails.departure.actualTime.outGateVariation").getItem(0).alias("departure_outGateVariation"),
    col("data_flat.statusDetails.departure.airport.faa").getItem(0).alias("departure_airport_faa"),
    col("data_flat.statusDetails.departure.airport.iata").getItem(0).alias("departure_airport_iata_status"),
    col("data_flat.statusDetails.departure.airport.icao").getItem(0).alias("departure_airport_icao_status"),
    col("data_flat.statusDetails.departure.checkInCounter").getItem(0).alias("departure_checkInCounter"),
    col("data_flat.statusDetails.departure.estimatedTime.offGround.local").getItem(0).alias("departure_estimatedTime_offGround_local"),
    col("data_flat.statusDetails.departure.estimatedTime.offGround.utc").getItem(0).alias("departure_estimatedTime_offGround_utc"),
    col("data_flat.statusDetails.departure.estimatedTime.outGate.local").getItem(0).alias("departure_estimatedTime_outGate_local"),
    col("data_flat.statusDetails.departure.estimatedTime.outGate.utc").getItem(0).alias("departure_estimatedTime_outGate_utc"),
    col("data_flat.statusDetails.departure.estimatedTime.outGateTimeliness").getItem(0).alias("departure_estimatedTime_outGateTimeliness"),
    col("data_flat.statusDetails.departure.estimatedTime.outGateVariation").getItem(0).alias("departure_estimatedTime_outGateVariation"),
    col("data_flat.statusDetails.departure.gate").getItem(0).alias("departure_gate"),
    col("data_flat.statusDetails.equipment.actualAircraftType.iata").getItem(0).alias("equipment_actualAircraftType_iata"),
    col("data_flat.statusDetails.equipment.actualAircraftType.icao").getItem(0).alias("equipment_actualAircraftType_icao"),
    col("data_flat.statusDetails.equipment.aircraftRegistrationNumber").getItem(0).alias("equipment_aircraftRegistrationNumber"),
    col("data_flat.statusDetails.state").getItem(0).alias("state"),
    col("data_flat.statusDetails.updatedAt").getItem(0).alias("updatedAt")
)

In [15]:
import pandas as pd
pd.set_option('display.max_columns', None) 
pd_df = df_flatten.limit(5).toPandas()
pd_df.head()

Unnamed: 0,aircraftType_iata,aircraftType_icao,arrival_airport_faa,arrival_airport_iata,arrival_airport_icao,arrival_date_local,arrival_date_utc,arrival_airport_terminal,arrival_airport_time_local,arrival_airport_time_utc,carrier.iata,carrier.icao,airlineowner,departure_airport_faa,departure_airport_iata,departure_airport_icao,departure_date_local,departure_date_utc,departure_airport_terminal,departure_airport_time_local,departure_airport_time_utc,elapsedTime,flightNumber,flightType,segmentInfo_numberOfStops,arrival_actualTerminal,arrival_inGate_local,arrival_inGate_utc,arrival_inGateTimeliness,arrival_inGateVariation,arrival_onGround_local,arrival_onGround_utc,arrival_airport_faa.1,arrival_airport_iata_status,arrival_airport_icao_status,arrival_baggage,arrival_estimatedTime_inGate_local,arrival_estimatedTime_inGate_utc,arrival_estimatedTime_inGateTimeliness,arrival_estimatedTime_inGateVariation,arrival_estimatedTime_onGround_local,arrival_estimatedTime_onGround_utc,arrival_gate,departure_actualTerminal,departure_offGround_local,departure_offGround_utc,departure_outGate_local,departure_outGate_utc,departure_outGateTimeliness,departure_outGateVariation,departure_airport_faa.1,departure_airport_iata_status,departure_airport_icao_status,departure_checkInCounter,departure_estimatedTime_offGround_local,departure_estimatedTime_offGround_utc,departure_estimatedTime_outGate_local,departure_estimatedTime_outGate_utc,departure_estimatedTime_outGateTimeliness,departure_estimatedTime_outGateVariation,departure_gate,equipment_actualAircraftType_iata,equipment_actualAircraftType_icao,equipment_aircraftRegistrationNumber,state,updatedAt
0,773.0,,,ICN,RKSI,2023-10-03,2023-10-02,2.0,07:20,22:20,KE,KAL,,,SGN,VVTS,2023-10-03,2023-10-02,2.0,00:15,17:15,305,476,Scheduled,0,T2,2023-10-03T07:21:00+09:00,2023-10-02T22:21:00+00:00,Delayed,00:01:00,2023-10-03T07:14:00+09:00,2023-10-02T22:14:00+00:00,,ICN,RKSI,4,2023-10-03T07:22:00+09:00,2023-10-02T22:22:00+00:00,Delayed,00:02:00,2023-10-03T07:15:00+09:00,2023-10-02T22:15:00+00:00,231,,2023-10-03T00:43:00+07:00,2023-10-02T17:43:00+00:00,2023-10-03T00:31:00+07:00,2023-10-02T17:31:00+00:00,Delayed,00:16:00,,SGN,VVTS,,,,2023-10-03T00:15:00+07:00,2023-10-02T17:15:00+00:00,OnTime,00:00:00,,773,B773,HL7534,InGate,2023-10-02T22:26:29.057
1,320.0,,,SYZ,OISS,2023-10-03,2023-10-03,,03:45,00:15,QR,QTR,,,DOH,OTHH,2023-10-03,2023-10-02,,01:45,22:45,90,476,Scheduled,0,,2023-10-03T04:16:00+03:30,2023-10-03T00:46:00+00:00,Delayed,00:31:00,2023-10-03T04:07:00+03:30,2023-10-03T00:37:00+00:00,,SYZ,OISS,,,,,,2023-10-03T04:06:00+03:30,2023-10-03T00:36:00+00:00,,,2023-10-03T02:41:00+03:00,2023-10-02T23:41:00+00:00,2023-10-03T02:04:00+03:00,2023-10-02T23:04:00+00:00,Delayed,00:19:00,,DOH,OTHH,,,,2023-10-03T01:45:00+03:00,2023-10-02T22:45:00+00:00,OnTime,00:00:00,C62,320,A320,A7LAF,InGate,2023-10-03T01:12:15.986
2,,B763,,YHM,CYHM,2023-10-03,2023-10-03,,05:52,09:52,5X,,,SDF,SDF,KSDF,2023-10-03,2023-10-03,,04:41,08:41,0,476,Unscheduled,0,,,,,,2023-10-03T06:11:00-04:00,2023-10-03T10:11:00+00:00,,YHM,CYHM,,,,,,,,,,2023-10-03T05:09:00-04:00,2023-10-03T09:09:00+00:00,,,,,SDF,SDF,KSDF,,,,,,,,,,B763,N342UP,Landed,2023-10-03T10:13:34.644
3,738.0,,,MIA,KMIA,2023-10-03,2023-10-03,,10:50,14:50,AA,AAL,,,GUA,MGGT,2023-10-03,2023-10-03,,06:03,12:03,167,476,Scheduled,0,D,2023-10-03T10:35:00-04:00,2023-10-03T14:35:00+00:00,Early,-00:15:00,2023-10-03T10:24:00-04:00,2023-10-03T14:24:00+00:00,MIA,MIA,KMIA,CD,2023-10-03T10:33:00-04:00,2023-10-03T14:33:00+00:00,Early,-00:17:00,,,D6,,2023-10-03T06:12:00-06:00,2023-10-03T12:12:00+00:00,2023-10-03T05:54:00-06:00,2023-10-03T11:54:00+00:00,Early,-00:09:00,,GUA,MGGT,,,,2023-10-03T06:03:00-06:00,2023-10-03T12:03:00+00:00,OnTime,00:00:00,10,,B738,N826NN,InGate,2023-10-03T14:35:46.538
4,320.0,,,BCN,LEBL,2023-10-03,2023-10-03,1.0,10:40,08:40,BA,BAW,,,LHR,EGLL,2023-10-03,2023-10-03,5.0,07:25,06:25,135,476,Scheduled,0,1,2023-10-03T11:15:00+02:00,2023-10-03T09:15:00+00:00,Delayed,00:35:00,2023-10-03T11:11:00+02:00,2023-10-03T09:11:00+00:00,,BCN,LEBL,11,2023-10-03T11:15:00+02:00,2023-10-03T09:15:00+00:00,Delayed,00:35:00,,,,5.0,2023-10-03T08:26:00+01:00,2023-10-03T07:26:00+00:00,2023-10-03T08:11:00+01:00,2023-10-03T07:11:00+00:00,Delayed,00:46:00,,LHR,EGLL,D,,,2023-10-03T08:05:00+01:00,2023-10-03T07:05:00+00:00,Delayed,00:40:00,4B,20C,,GEUYE,InGate,2023-10-03T09:17:16.239


In [16]:
pd_df_adsb = df_adsb.limit(5).toPandas()
pd_df_adsb.head()

Unnamed: 0,AircraftId,Altitude,Callsign,Destination,ETA,Flight,LastUpdate,Latitude,Longitude,Onground,Origin,RadarId,Registration,SourceType,Speed,Squawk,Track,Type,Vspeed
0,400960,0,BAW476C,ICN,0,BA484,1696278420,10.81889,106.65194,1,SGN,,G-TTOE,ADS-B FR24 receivers,0,7713,30,A320,0
1,400960,10000,BAW476C,ICN,0,BA484,1696279020,12.5,109.1,0,SGN,,G-TTOE,ADS-B FR24 receivers,300,7713,45,A320,1500
2,400960,30000,BAW476C,ICN,0,BA484,1696280020,15.3,113.5,0,SGN,,G-TTOE,ADS-B FR24 receivers,500,7713,50,A320,2000
3,400960,35000,BAW476C,ICN,0,BA484,1696282020,18.6,118.9,0,SGN,,G-TTOE,ADS-B FR24 receivers,540,7713,60,A320,0
4,400960,38000,BAW476C,ICN,0,BA484,1696287020,28.2,127.8,0,SGN,,G-TTOE,ADS-B FR24 receivers,550,7713,80,A320,0


In [17]:
aircraft=df_adsb

In [18]:
airport=df_flatten

In [19]:
aircraft_count = aircraft.count()
print(f"Aircraft instances: {aircraft_count}")
airport_count = airport.count()
print(f"Airport instances: {airport_count}")

Aircraft instances: 28
Airport instances: 10


# Data cleaning

### Duplicates 

In [20]:
aircraft=aircraft.distinct()
airport=airport.distinct()

In [21]:
aircraft_count_d = aircraft.count()
print(f"Cleaning duplicates - Aircraft instances: {aircraft_count_d}")
airport_count_d = airport.count()
print(f"Cleaning duplicates -Airport instances: {airport_count_d}")

Cleaning duplicates - Aircraft instances: 28
Cleaning duplicates -Airport instances: 10


### Filltering - flightType - Unscheduled 

In [22]:
airport = airport.filter(airport.flightType != "Unscheduled")

In [23]:
airport_count_f = airport.count()
print(f"Filltering -Airport instances: {airport_count_f}")

Filltering -Airport instances: 7


### Missing values

In [24]:
aircraft = aircraft.dropna(subset=["Callsign"])

In [25]:
aircraft_count_missing = aircraft.count()
print(f"Cleaning duplicates - Aircraft instances: {aircraft_count_missing}")

Cleaning duplicates - Aircraft instances: 28


### Formating 

In [26]:
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp


spark.conf.set("spark.sql.session.timeZone", "UTC")
aircraft = aircraft.withColumn(
    "LastUpdateUTC", 
    from_unixtime("LastUpdate", "yyyy-MM-dd HH:mm:ss")
)
aircraft = aircraft.withColumn("LastUpdateUTC", to_timestamp("LastUpdateUTC", "yyyy-MM-dd HH:mm:ss"))
#aircraft.show()


# KPIs


## a. average speed for each airport - Avg arrivals per day

In [27]:
# arrival_airport_icao - airport
# arrival_date_local - date local for landing
#counting the arrivals per day as how many records of that specific airport are in the dataset per day

from pyspark.sql.functions import count, to_date, avg


perday_count = airport.groupBy(
    "arrival_airport_icao",
    to_date("arrival_date_local")).agg(count("*").alias("perday_arrival"))

avg_arrivals = perday_count.groupBy("arrival_airport_icao").agg(avg("perday_arrival").alias("avg_perday_arrival"))
avg_arrivals.show()

+--------------------+------------------+
|arrival_airport_icao|avg_perday_arrival|
+--------------------+------------------+
|                MUHA|               1.0|
|                LDDU|               1.0|
|                KMIA|               1.0|
|                RKSI|               1.0|
|                OISS|               1.0|
|                RJTT|               1.0|
|                LEBL|               1.0|
+--------------------+------------------+



In [28]:
pd_avg_arrivals = avg_arrivals.toPandas()
pd_avg_arrivals.head()

Unnamed: 0,arrival_airport_icao,avg_perday_arrival
0,MUHA,1.0
1,LDDU,1.0
2,KMIA,1.0
3,RKSI,1.0
4,OISS,1.0


## b. the total number of delayed flights (categorized into arrival delays and departure delays)


In [29]:
#Delayed
arrival_delays = airport.filter(col("arrival_inGateTimeliness") == "Delayed").count()
departure_delays = airport.filter(col("departure_outGateTimeliness") == "Delayed").count()
#All
total_arrivals = airport.filter(col("arrival_inGateTimeliness").isNotNull()).count()
total_departures = airport.filter(col("departure_outGateTimeliness").isNotNull()).count()
#Counting proportions
arrival_delay_proportion = arrival_delays / total_arrivals
departure_delay_proportion = departure_delays / total_departures

print(f"Arrival's delays: {arrival_delays}; Out of all arrivals it makes: {arrival_delay_proportion}")
print(f"Departure's delays: {departure_delays}; Out of all departures it makes: {departure_delay_proportion}")

Arrival's delays: 4; Out of all arrivals it makes: 0.5714285714285714
Departure's delays: 5; Out of all departures it makes: 0.7142857142857143


# Spark partitioning

## a. Filter the DataFrame to retain only the most recent entry (the one with the smallest LastUpdate ) for each FlightId .

In [30]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, desc


#window partitioned by Flight and ordered by LastUpdateUTC
window_entry = Window.partitionBy("Flight").orderBy(desc("LastUpdateUTC"))

#chosing the first row from each fligh, aka the most recent
aircraft_recent_entry = (
    aircraft.withColumn("row_entry", row_number()
            .over(window_entry))
            .filter("row_entry = 1"))

aircraft_recent_entry.show()


+----------+--------+--------+-----------+---+------+----------+--------+---------+--------+------+-------+------------+--------------------+-----+------+-----+----+------+-------------------+---------+
|AircraftId|Altitude|Callsign|Destination|ETA|Flight|LastUpdate|Latitude|Longitude|Onground|Origin|RadarId|Registration|          SourceType|Speed|Squawk|Track|Type|Vspeed|      LastUpdateUTC|row_entry|
+----------+--------+--------+-----------+---+------+----------+--------+---------+--------+------+-------+------------+--------------------+-----+------+-----+----+------+-------------------+---------+
|      B738|       0|  AAL476|        MIA|  0|AAL476|1696350960| 25.7956|  -80.287|       1|   GUA|    300|      N826NN|ADS-B FR24 receivers|    0|  4321|  270|B738|     0|2023-10-03 16:36:00|        1|
|    400960|       0| BAW476C|        ICN|  0| BA484|1696290420|37.46567|126.44048|       1|   SGN|    200|      G-TTOE|ADS-B FR24 receivers|    0|  7713|  120|A320|     0|2023-10-02 23:47

## b. Return a DataFrame containing only the FlightId and the corresponding latest LastUpdate 
 

In [31]:
aircraft_recent_entry = aircraft_recent_entry.select("Flight", "LastUpdateUTC").orderBy(desc("LastUpdateUTC"))

In [32]:
pd_aircraft_recent_entry = aircraft_recent_entry.limit(5).toPandas()
pd_aircraft_recent_entry.head()

Unnamed: 0,Flight,LastUpdateUTC
0,AAL476,2023-10-03 16:36:00
1,LXJ476,2023-10-03 16:25:20
2,BA484,2023-10-02 23:47:00
3,QR476,2023-10-02 23:12:15


In [33]:
from io import StringIO

output = StringIO()

#Task 1.a
output.write("Task 1.a KPIs \n")
pd_avg_arrivals.to_csv(output, index=False)
output.write("\n")
# Task 1.b
output.write("Task 1.b KPIs \n")
output.write(f"Arrival's delays: {arrival_delays}; Out of all arrivals it makes: {arrival_delay_proportion}\n")
output.write(f"Departure's delays: {departure_delays}; Out of all departures it makes: {departure_delay_proportion}\n")
output.write("\n")
#Task 2
output.write("Task 2. Spark partitioning\n")
pd_aircraft_recent_entry.to_csv(output, index=False)

with open("report.csv", "w") as f:
    f.write(output.getvalue())
