## Import airport_df

In [1]:
import pandas as pd
import sqlalchemy
sys.path.append('..\keys')
import key

schema="gans_db"   # name of the database you want to use here
host="localhost"        # to connect to your local server
user="root"
password=key.SQL_PASSWORD # your password!!!!
port=3306
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'
airport_df = pd.read_sql_table('airport',con=con)

In [2]:
airport_df

Unnamed: 0,city_id,icao,name,type
0,9,AZ-0001,Zabrat Airport,medium_airport
1,15,EDDH,Hamburg Helmut Schmidt Airport,large_airport
2,15,EDHI,Hamburg-Finkenwerder Airport,medium_airport
3,29,EGBB,Birmingham International Airport,large_airport
4,3,EGLC,London City Airport,medium_airport
5,14,EPWA,Warsaw Chopin Airport,large_airport
6,23,LBSF,Sofia Airport,large_airport
7,18,LEBL,Josep Tarradellas Barcelona-El Prat Airport,large_airport
8,6,LECU,Madrid-Cuatro Vientos Airport,medium_airport
9,6,LEGT,Getafe Air Base,medium_airport


## Connect to API and collect Data

In [3]:
RERUN_TIME_IN_HOURS = 24
FLIGHT_FORCAST_TIMEWINDOW = 12

In [4]:
import math
from pytz import timezone
from datetime import datetime, date, timedelta
import requests 
import json

In [8]:
def connect_flight_api(icao_l):
    # Prepare URL inputs
    today = datetime.now().astimezone(timezone('Europe/Berlin')).date()
    rerun_window = math.ceil(RERUN_TIME_IN_HOURS/24) #roundup to bigger integer
    next_run = (today + timedelta(days=rerun_window))
    times = [["00:00","11:59"],["12:00","23:59"]]
    
    flight_df_l = []
    #Loop over all airports
    for icao in icao_l:
        for time in times:
            url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao}/{next_run}T{time[0]}/{next_run}T{time[1]}"
            querystring = {"withLeg":"true","direction":"Arrival","withCancelled":"false","withCodeshared":"true","withCargo":"false","withPrivate":"false","withLocation":"false"}
            headers = {
				"X-RapidAPI-Key": f"{key.Flight_API_key}",
				"X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
			}
            response = requests.request("GET", url, headers=headers, params=querystring)
            response.raise_for_status()
            flights_12h_json = response.json()
            for flight in flights_12h_json["arrivals"]:
                flight_dict = {}
                flight_dict['arrival_icao']         = icao
                flight_dict["arrival_time_local"]   = flight["arrival"].get('scheduledTimeLocal', pd.NaT)
                flight_dict["arrival_terminal"]     = flight['arrival'].get('terminal', "unknown")
                flight_dict["departure_city"]       =  flight["departure"]["airport"].get("name", "unknown")
                flight_dict["departure_icao"]       =  flight["departure"]["airport"].get("icao", "unknown")
                flight_dict["departure_time_local"] = flight["departure"].get("scheduledTimeLocal", pd.NaT)
                flight_dict["airline"]              =  flight["airline"].get("name", "unknown")
                flight_dict["flight_number"]        =  flight.get("number", "unknown")
                flight_dict["data_retrieved_on"]    =  datetime.now().astimezone(timezone('Europe/Berlin')).date()
                flight_df_l.append(flight_dict)
    return flight_df_l

In [10]:
	#TODO: Replace with list
 
airport_id_l = ["EGBB","EPWA"]
flights_df_l = connect_flight_api(airport_id_l)

In [11]:
flights_df = pd.DataFrame(flights_df_l)

In [12]:
len(flights_df)

286

In [13]:
flights_df

Unnamed: 0,arrival_icao,arrival_time_local,arrival_terminal,departure_city,departure_icao,departure_time_local,airline,flight_number,data_retrieved_on
0,EGBB,2023-01-07 07:05+00:00,unknown,Belfast,EGAA,2023-01-07 06:00+00:00,easyJet,U2 191,2023-01-06
1,EGBB,2023-01-07 07:35+00:00,unknown,Sofia,LBSF,2023-01-07 06:05+02:00,Ryanair,FR 6335,2023-01-06
2,EGBB,2023-01-07 07:40+00:00,unknown,Warsaw,EPWA,2023-01-07 05:45+01:00,Wizz Air,W6 1321,2023-01-06
3,EGBB,2023-01-07 07:40+00:00,unknown,Craiova,LRCV,2023-01-07 06:10+02:00,Wizz Air,W6 3765,2023-01-06
4,EGBB,2023-01-07 07:45+00:00,unknown,Dublin,EIDW,2023-01-07 06:40+00:00,British Airways,BA 2102,2023-01-06
...,...,...,...,...,...,...,...,...,...
281,EPWA,2023-01-07 23:05+01:00,unknown,Tel Aviv Yafo,LLBG,2023-01-07 20:05+02:00,LOT - Polish,LO 154,2023-01-06
282,EPWA,2023-01-07 23:10+01:00,unknown,London,EGGW,2023-01-07 19:45+00:00,Wizz Air,W6 1308,2023-01-06
283,EPWA,2023-01-07 23:15+01:00,unknown,Rome,LIRF,2023-01-07 20:50+01:00,Wizz Air,W6 1442,2023-01-06
284,EPWA,2023-01-07 23:20+01:00,unknown,Tenerife Island,GCTS,2023-01-07 16:45+00:00,Wizz Air,W6 1496,2023-01-06


In [14]:
flights_df.arrival_terminal.unique()

array(['unknown'], dtype=object)

## Clean flights_df

In [27]:
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   arrival_icao          286 non-null    object             
 1   arrival_time_local    286 non-null    datetime64[ns, UTC]
 2   arrival_terminal      286 non-null    object             
 3   departure_city        286 non-null    object             
 4   departure_icao        286 non-null    object             
 5   departure_time_local  246 non-null    datetime64[ns, UTC]
 6   airline               286 non-null    object             
 7   flight_number         286 non-null    object             
 8   data_retrieved_on     286 non-null    datetime64[ns]     
dtypes: datetime64[ns, UTC](2), datetime64[ns](1), object(6)
memory usage: 20.2+ KB


In [23]:
flights_df.arrival_time_local = pd.to_datetime(flights_df.arrival_time_local,utc=True)

In [17]:
flights_df.data_retrieved_on = pd.to_datetime(flights_df.data_retrieved_on)

In [94]:
#flights_df.loc[flights_df.departure_time_local == "unknown","departure_time_local"] = pd.NaT

In [19]:
flights_df.loc[flights_df.departure_time_local == pd.NaT]

Unnamed: 0,arrival_icao,arrival_time_local,arrival_terminal,departure_city,departure_icao,departure_time_local,airline,flight_number,data_retrieved_on


In [26]:
flights_df.departure_time_local = pd.to_datetime(flights_df.departure_time_local,utc=True)

In [20]:
flights_df.departure_time_local.sample(10)

110    2023-01-07 20:15:00+00:00
233    2023-01-07 15:35:00+03:00
54                           NaT
162    2023-01-07 08:05:00+04:00
135    2023-01-07 07:00:00+03:00
140    2023-01-07 08:10:00+03:00
227    2023-01-07 14:35:00+01:00
49                           NaT
97     2023-01-07 17:20:00+01:00
257    2023-01-07 19:30:00+01:00
Name: departure_time_local, dtype: object

## Export Final df

In [28]:
flights_df.to_sql('flight',con=con,if_exists='append',index=False)

286

: 