# ✈️ FLIGHTS DATA INGESTION PIPELINE
### 1️⃣ Objective of Flights Extraction

Purpose:
Fetch recent arrival and departure flight data for the selected 14 airports using AeroDataBox flight endpoints and convert the responses into a structured DataFrame suitable for SQL ingestion and analytics.

### 2️⃣ Endpoints Used
Endpoint	Purpose
/flights/airports/iata/{code}/arrivals	Incoming flights
/flights/airports/iata/{code}/departures	Outgoing flights

### 3️⃣ Import Required Libraries

In [12]:
import requests
import pandas as pd
import time

### 4️⃣ Define Flight Extraction Parameters

Purpose:
Centralize reusable configuration.

In [17]:
API_HOST = "aerodatabox.p.rapidapi.com"

HEADERS = {
    "x-rapidapi-key": "377865ea05mshfffa5656a3d74fbp1ef6eajsnf18ff39999b9",
    "x-rapidapi-host": API_HOST
}

AIRPORTS = [
    "DEL","BOM","BLR","HYD","MAA","CCU","COK",
    "JFK","LHR","DXB","SIN","CDG","HND","SYD"
]

### 5️⃣ Define a Generic Flight Fetch Function

Purpose:
Fetch flights for a given airport and direction (arrivals or departures).

In [18]:
def fetch_airport_flights(iata_code):
    url = f"https://{API_HOST}/flights/airports/iata/{iata_code}"
    
    r = requests.get(url, headers=HEADERS)
    
    if r.status_code != 200:
        print(f"Failed for {iata_code} | Status: {r.status_code}")
        return None
    
    return r.json()





### 6️⃣ Extract & Normalize Flight Records

Purpose:
Flatten nested JSON into relational-friendly rows.

In [15]:
flight_rows = []

for airport in AIRPORTS:
    data = fetch_airport_flights(airport)

    if not data:
        continue

    # -------- ARRIVALS --------
    for f in data.get("arrivals", []):

        if f.get("codeshareStatus") != "IsOperator":
            continue

        flight_rows.append({
            "flight_number": f.get("number"),
            "airline_name": f.get("airline", {}).get("name"),
            "aircraft_registration": f.get("aircraft", {}).get("reg"),

            "origin_iata": f.get("departure", {}).get("airport", {}).get("iata"),
            "destination_iata": f.get("movement", {}).get("airport", {}).get("iata"),

            "scheduled_time": f.get("movement", {}).get("scheduledTime", {}).get("utc"),
            "actual_time": (
                f.get("movement", {}).get("runwayTime", {}).get("utc")
                or f.get("movement", {}).get("revisedTime", {}).get("utc")
            ),

            "status": f.get("status"),
            "flight_type": "arrival"
        })

    # -------- DEPARTURES --------
    for f in data.get("departures", []):

        if f.get("codeshareStatus") != "IsOperator":
            continue

        flight_rows.append({
            "flight_number": f.get("number"),
            "airline_name": f.get("airline", {}).get("name"),
            "aircraft_registration": f.get("aircraft", {}).get("reg"),

            "origin_iata": f.get("movement", {}).get("airport", {}).get("iata"),
            "destination_iata": f.get("arrival", {}).get("airport", {}).get("iata"),

            "scheduled_time": f.get("movement", {}).get("scheduledTime", {}).get("utc"),
            "actual_time": (
                f.get("movement", {}).get("runwayTime", {}).get("utc")
                or f.get("movement", {}).get("revisedTime", {}).get("utc")
            ),

            "status": f.get("status"),
            "flight_type": "departure"
        })


In [16]:
flights_df = pd.DataFrame(flight_rows)
flights_df

Unnamed: 0,flight_number,airline_name,aircraft_registration,origin_iata,destination_iata,scheduled_time,actual_time,status,flight_type
0,AI 1812,Air India,,,RPR,2026-01-02 05:40Z,,Unknown,arrival
1,AI 1810,Air India,VT-TNM,,SXR,2026-01-02 05:50Z,,Unknown,arrival
2,6E 2051,IndiGo,,,HYD,2026-01-02 05:50Z,,Unknown,arrival
3,AI 2385,Air India,VT-EXL,,KUL,2026-01-02 06:00Z,,Expected,arrival
4,6E 7443,IndiGo,,,BKB,2026-01-02 06:00Z,,Unknown,arrival
...,...,...,...,...,...,...,...,...,...
5128,QF 8729,Qantas,,TPE,,2026-01-02 11:10Z,2026-01-02 11:10Z,Expected,departure
5129,CI 52,China,,TPE,,2026-01-02 11:10Z,2026-01-02 11:10Z,Expected,departure
5130,MH 140,Malaysia,,KUL,,2026-01-02 11:15Z,2026-01-02 11:15Z,Expected,departure
5131,CX 138,Cathay Pacific,,HKG,,2026-01-02 11:20Z,2026-01-02 11:20Z,Expected,departure


In [17]:

flights_df.isna().sum()


flight_number               0
airline_name                0
aircraft_registration    2992
origin_iata              2650
destination_iata         2519
scheduled_time              0
actual_time               787
status                      0
flight_type                 0
dtype: int64

In [19]:
flight_rows = []

for airport in AIRPORTS:

    data = fetch_airport_flights(airport)
    if not data:
        continue

    # ---------- ARRIVALS ----------
    for f in data.get("arrivals", []):

        if f.get("codeshareStatus") != "IsOperator":
            continue

        scheduled_arrival = (
            f.get("arrival", {})
             .get("scheduledTime", {})
             .get("utc")
        )

        flight_id = f"{f.get('number')}_{scheduled_arrival}"

        flight_rows.append({
            "flight_id": flight_id,
            "flight_number": f.get("number"),
            "airline_name": f.get("airline", {}).get("name"),
            "aircraft_registration": f.get("aircraft", {}).get("reg"),
            "origin_iata": f.get("departure", {}).get("airport", {}).get("iata"),
            "destination_iata": f.get("movement", {}).get("airport", {}).get("iata"),
            "scheduled_departure": None,
            "actual_departure": None,
            "scheduled_arrival": scheduled_arrival,
            "actual_arrival": (
                f.get("arrival", {}).get("runwayTime", {}).get("utc")
                or f.get("arrival", {}).get("revisedTime", {}).get("utc")
            ),
            "status": f.get("status"),
            "movement_type": "arrival"
        })


    # ---------- DEPARTURES ----------
    for f in data.get("departures", []):

        if f.get("codeshareStatus") != "IsOperator":
            continue

        scheduled_departure = (
            f.get("departure", {})
             .get("scheduledTime", {})
             .get("utc")
        )

        flight_id = f"{f.get('number')}_{scheduled_departure}"

        flight_rows.append({
            "flight_id": flight_id,
            "flight_number": f.get("number"),
            "airline_name": f.get("airline", {}).get("name"),
            "aircraft_registration": f.get("aircraft", {}).get("reg"),
            "origin_iata": f.get("movement", {}).get("airport", {}).get("iata"),
            "destination_iata": f.get("arrival", {}).get("airport", {}).get("iata"),
            "scheduled_departure": scheduled_departure,
            "actual_departure": (
                f.get("departure", {}).get("runwayTime", {}).get("utc")
                or f.get("departure", {}).get("revisedTime", {}).get("utc")
            ),
            "scheduled_arrival": None,
            "actual_arrival": None,
            "status": f.get("status"),
            "movement_type": "departure"
        })


In [21]:
import pandas as pd

flights_df = pd.DataFrame(flight_rows)
flights_df 

Unnamed: 0,flight_id,flight_number,airline_name,aircraft_registration,origin_iata,destination_iata,scheduled_departure,actual_departure,scheduled_arrival,actual_arrival,status,movement_type
0,IX 1252_None,IX 1252,Air India Express,,,VNS,,,,,Unknown,arrival
1,6E 1462_None,6E 1462,IndiGo,,,DXB,,,,,Expected,arrival
2,AI 218_None,AI 218,Air India,VT-CIM,,KTM,,,,,Unknown,arrival
3,AI 2541_None,AI 2541,Air India,,,HYD,,,,,Unknown,arrival
4,IX 1017_None,IX 1017,Air India Express,VT-JPR,,SXR,,,,,Unknown,arrival
...,...,...,...,...,...,...,...,...,...,...,...,...
4689,QF 161_None,QF 161,Qantas,,WLG,,,,,,Expected,departure
4690,JQ 511_None,JQ 511,Jetstar,,MEL,,,,,,Expected,departure
4691,QF 578_None,QF 578,Qantas,,HTI,,,,,,Expected,departure
4692,QF 922_None,QF 922,Qantas,VH-VXJ,CNS,,,,,,Expected,departure


In [22]:
flights_df.isna().sum()

flight_id                   0
flight_number               0
airline_name                0
aircraft_registration    2630
origin_iata              2468
destination_iata         2247
scheduled_departure      4694
actual_departure         4694
scheduled_arrival        4694
actual_arrival           4694
status                      0
movement_type               0
dtype: int64