# Get scheduled flight arrivals based on location

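Retrieve scheduled flight arrivals from [AeroDataBox](https://rapidapi.com/aedbx-aedbx/api/aerodatabox) for the airports near each city stored in the database. The resulting flights table has the columns:
* `arrival_airport_icao`
* `departure_airport_icao`
* `scheduled_arrival_time`
* `flight_number`
* `timestamp_flight` (retrieval time)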

Please ensure you have set up the corresponding tables with `sql/create_database_data_pipeline_example.sql`.

Save your MySQL password in `python/keys.env` as `MYSQL_KEY` (or provide your password by other means) and open your MySQL Workbench. The same file must also contain your RapidAPI key for AeroDataBox as `AERODATABOX_KEY`.

## Import libraries

In [None]:
import pandas as pd
import requests
import sqlalchemy
from pytz import timezone
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os

## Define functions

In [None]:
def create_connection_string():
    """
    Builds a SQLAlchemy-style connection string for a MySQL database.

    Environment:
        Expects a file named 'keys.env' containing:
            MYSQL_KEY=<your_mysql_password>

    Returns:
        str: A connection string of the form
             'mysql+pymysql://user:password@host:port/schema'
             with the password percent-encoded.

    Raises:
        RuntimeError: If MYSQL_KEY is not set after loading 'keys.env'.
    """
    # Function-scope import so this cell does not depend on the import cell changing.
    from urllib.parse import quote

    schema = "data_pipeline_example"
    host = "127.0.0.1"
    user = "root"
    port = 3306

    load_dotenv('keys.env')  # Load credentials from .env file
    password = os.getenv("MYSQL_KEY")
    if password is None:
        # Without this check the literal text "None" would be sent as the password.
        raise RuntimeError("MYSQL_KEY is not set; add it to 'keys.env' or the environment.")

    # Characters such as '@', ':' or '/' in a password would otherwise be read
    # as URL delimiters, so every reserved character is percent-encoded.
    encoded_password = quote(password, safe="")
    return f'mysql+pymysql://{user}:{encoded_password}@{host}:{port}/{schema}'


def fetch_cities_data(connection_string):
    """
    Reads the 'cities' table from the database into a DataFrame.

    Args:
        connection_string (str): Database connection string.

    Returns:
        pandas.DataFrame: The city data, including coordinates.
                          Expected columns: ['city_id', 'latitude', 'longitude']
    """
    cities_table = pd.read_sql("cities", con=connection_string)
    return cities_table


In [121]:
def get_airports(cities_df):
    """
    Searches the AeroDataBox API for airports near each city.

    For every city, airports within a 50 km radius of its coordinates are
    requested (at most 10 per city, only airports with flight information).

    Args:
        cities_df (pandas.DataFrame): City data with the columns
            'city_id', 'latitude' and 'longitude'.

    Environment:
        Expects a file named 'keys.env' containing:
            AERODATABOX_KEY=<your_rapidapi_key>

    Returns:
        pandas.DataFrame: One row per airport found, holding the normalized
            fields of the API response plus the 'city_id' of the searched city.
            If no airport was found at all, an empty DataFrame with the
            columns ['icao', 'name', 'city_id'] is returned.

    Raises:
        RuntimeError: If AERODATABOX_KEY is not set after loading 'keys.env'.
    """
    # Returned when there is nothing to concatenate (pd.concat rejects an empty list).
    empty_result = pd.DataFrame(columns=["icao", "name", "city_id"])
    if cities_df.empty:
        return empty_result

    load_dotenv('keys.env')
    api_key = os.getenv("AERODATABOX_KEY")
    if api_key is None:
        raise RuntimeError("AERODATABOX_KEY is not set; add it to 'keys.env' or the environment.")

    # API headers
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }
    url = "https://aerodatabox.p.rapidapi.com/airports/search/location/"

    # One DataFrame per successfully queried city
    all_airports = []
    for city in cities_df.itertuples(index=False):
        querystring = {
            "lat": city.latitude,
            "lon": city.longitude,
            "radiusKm": "50",
            "limit": "10",
            "withFlightInfoOnly": "true",
        }

        # Make the API request
        response = requests.get(url, headers=headers, params=querystring)

        # Cities whose request fails are skipped; the remaining cities are still processed.
        if response.status_code != 200:
            continue

        airports = pd.json_normalize(response.json().get('items', []))
        airports['city_id'] = city.city_id
        all_airports.append(airports)

    if not all_airports:
        return empty_result
    return pd.concat(all_airports, ignore_index=True)

In [None]:
def _flights_to_frame(flight_items):
    """
    Converts collected flight records into a DataFrame with datetime columns.

    The full set of columns is always present, even when no flights were collected.
    """
    columns = [
        "arrival_airport_icao",
        "departure_airport_icao",
        "scheduled_arrival_time",
        "flight_number",
        "timestamp_flight",
    ]
    flight_df = pd.DataFrame(flight_items, columns=columns)

    # Local times carry a UTC offset suffix that changes with daylight saving
    # (e.g. "+02:00" in summer, "+01:00" in winter); strip whichever is present.
    local_times = flight_df["scheduled_arrival_time"].str.replace(
        r"[+-]\d{2}:\d{2}$", "", regex=True
    )

    # Ensure correct datetime types
    flight_df["scheduled_arrival_time"] = pd.to_datetime(local_times)
    flight_df["timestamp_flight"] = pd.to_datetime(flight_df["timestamp_flight"])
    return flight_df


def get_flights_data(icaos):
    """
    Retrieves tomorrow's scheduled arrivals for airports from the AeroDataBox API.

    "Tomorrow" is determined in the Europe/Berlin timezone. The day is requested
    in two half-day windows (00:00-11:59 and 12:00-23:59) per airport.

    Args:
        icaos (list): ICAO airport codes as strings (any iterable, e.g. a
            pandas Series, is accepted).

    Environment:
        Expects a file named 'keys.env' containing:
            AERODATABOX_KEY=<your_rapidapi_key>

    Returns:
        pandas.DataFrame: DataFrame with scheduled arrivals, containing:
            - arrival_airport_icao
            - departure_airport_icao
            - scheduled_arrival_time
            - flight_number
            - timestamp_flight: retrieval time
        The DataFrame is empty (but has these columns) if no flights were retrieved.

    Raises:
        RuntimeError: If AERODATABOX_KEY is not set after loading 'keys.env'.
    """
    icaos = list(icaos)
    if not icaos:
        # Nothing to query, so no credentials or requests are needed.
        return _flights_to_frame([])

    load_dotenv('keys.env')
    API_KEY = os.getenv("AERODATABOX_KEY")
    if API_KEY is None:
        raise RuntimeError("AERODATABOX_KEY is not set; add it to 'keys.env' or the environment.")

    berlin_timezone = timezone('Europe/Berlin')
    schedule_time = (datetime.now(berlin_timezone) + timedelta(1)).strftime('%Y-%m-%d')
    half_day_windows = (("T00:00", "T11:59"), ("T12:00", "T23:59"))

    querystring = {
        "withLeg": "false",
        "direction": "Arrival",
        "withCancelled": "false",
        "withCodeshared": "false",
        "withCargo": "false",
        "withPrivate": "false",
        "withLocation": "false",
    }
    headers = {
        "X-RapidAPI-Key": API_KEY,
        "x-rapidapi-host": "aerodatabox.p.rapidapi.com"
    }

    flight_items = []
    for icao in icaos:
        for window_start, window_end in half_day_windows:
            # Built on a single line so no whitespace or newlines end up in the path.
            url = (
                "https://aerodatabox.p.rapidapi.com/flights/airports/icao/"
                f"{icao}/{schedule_time}{window_start}/{schedule_time}{window_end}"
            )

            response = requests.get(url, headers=headers, params=querystring)
            if response.status_code != 200:
                # The two windows are independent: a failed or empty window
                # must not discard the other half of the day.
                continue

            flight_data = response.json()
            retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

            for item in flight_data.get("arrivals") or []:
                movement = item.get('movement') or {}
                flight_items.append({
                    "arrival_airport_icao": icao,
                    "departure_airport_icao": (movement.get('airport') or {}).get('icao'),
                    "scheduled_arrival_time": (movement.get('scheduledTime') or {}).get('local'),
                    "flight_number": item.get('number'),
                    "timestamp_flight": retrieval_time,
                })

    return _flights_to_frame(flight_items)

## Fetch data

In [117]:
# Load the cities table from the database; the bare name on the last line
# makes the notebook display the resulting DataFrame.
cities_df = fetch_cities_data(create_connection_string())
cities_df

Unnamed: 0,city_id,city_name,country,country_code,latitude,longitude
0,1,Berlin,Germany,GER,52.52,13.405
1,2,Hamburg,Germany,GER,53.55,10.0
2,3,Munich,Germany,GER,48.138,11.575


In [123]:
# Look up the airports near every city in cities_df and display the result.
all_airports = get_airports(cities_df)
all_airports

Unnamed: 0,icao,iata,name,shortName,municipalityName,countryCode,timeZone,location.lat,location.lon,city_id
0,EDDT,TXL,Berlin -Tegel,-Tegel,Berlin,DE,Europe/Berlin,52.5597,13.287699,1
1,EDDB,BER,Berlin Brandenburg,Brandenburg,Berlin,DE,Europe/Berlin,52.35139,13.493889,1
2,EDDH,HAM,Hamburg,Hamburg,Hamburg,DE,Europe/Berlin,53.6304,9.988229,2
3,EDDM,MUC,Munich,Munich,Munich,DE,Europe/Berlin,48.3538,11.7861,3


In [119]:
# Fetch the scheduled arrivals for every airport ICAO code found above and display them.
flights_df = get_flights_data(all_airports["icao"])
flights_df

Unnamed: 0,arrival_airport_icao,departure_airport_icao,scheduled_arrival_time,flight_number,timestamp_flight
0,EDDB,KIAD,2025-10-18 04:20:00,GAF 933,2025-10-17 15:34:59
1,EDDB,OLBA,2025-10-18 05:40:00,SR 1599,2025-10-17 15:34:59
2,EDDB,LTBJ,2025-10-18 06:00:00,XQ 866,2025-10-17 15:34:59
3,EDDB,LTFM,2025-10-18 06:00:00,TK 1293,2025-10-17 15:34:59
4,EDDB,LTBJ,2025-10-18 06:30:00,XQ 966,2025-10-17 15:34:59
...,...,...,...,...,...
876,EDDM,GCTS,2025-10-18 23:15:00,X3 2197,2025-10-17 15:35:03
877,EDDM,GCLP,2025-10-18 23:20:00,DE 1523,2025-10-17 15:35:03
878,EDDM,GCTS,2025-10-18 23:20:00,DE 1585,2025-10-17 15:35:03
879,EDDM,LEPA,2025-10-18 23:20:00,LH 1799,2025-10-17 15:35:03


## Send to SQL

Drop irrelevant info, rename if needed and send to database

In [127]:
# Keep only the columns that are stored and rename them for the
# `cities_airports` table (presumably matching its column names - verify against the SQL script).
airports_df = all_airports[['icao', 'name', 'city_id']].rename(
    columns={"icao": "icao_code", "name": "airport_name"}
)

In [129]:
# Append the airports to the existing `cities_airports` table; the value
# returned by to_sql is shown as the cell output.
airports_df.to_sql(
    name='cities_airports',
    con=create_connection_string(),
    if_exists='append',
    index=False,
)

4

In [133]:
# Append the flights to the existing `flights` table; the value
# returned by to_sql is shown as the cell output.
flights_df.to_sql(
    name='flights',
    con=create_connection_string(),
    if_exists='append',
    index=False,
)

881