## 1.Export

###  get airport_T_df

In [1]:
import pandas as pd
import sqlalchemy
sys.path.append('..\keys')
import key

schema="gans_db"   # name of the database you want to use here
host="localhost"        # to connect to your local server
user="root"
password=key.SQL_PASSWORD # your password!!!!
port=3306
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'
airport_T_df = pd.read_sql_table('airport',con=con)

In [4]:
airport_T_df.icao[0:2]

0    AZ-0001
1       EDDH
Name: icao, dtype: object

### Connect to API and collect Data

In [9]:
RERUN_TIME_IN_HOURS = 24
FLIGHT_FORCAST_TIMEWINDOW = 12

In [7]:
import math
from pytz import timezone
from datetime import datetime, date, timedelta
import requests 
import json

In [5]:
def connect_flight_api(airport_df):
    # Prepare URL inputs
    today = datetime.now().astimezone(timezone('Europe/Berlin')).date()
    rerun_window = math.ceil(RERUN_TIME_IN_HOURS/24) #roundup to bigger integer
    next_run = (today + timedelta(days=rerun_window))
    times = [["00:00","11:59"],["12:00","23:59"]]
    
    flight_df_l = []
    #Loop over all airports
    for icao in airport_df.icao[1:2]:
        for time in times:
            url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao}/{next_run}T{time[0]}/{next_run}T{time[1]}"
            querystring = {"withLeg":"true","direction":"Arrival","withCancelled":"false","withCodeshared":"true","withCargo":"false","withPrivate":"false","withLocation":"false"}
            headers = {
				"X-RapidAPI-Key": f"{key.Flight_API_key}",
				"X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
			}
            response = requests.request("GET", url, headers=headers, params=querystring)
            response.raise_for_status()
            flights_12h_json = response.json()
            for flight in flights_12h_json["arrivals"]:
                flight_dict = {}
                flight_dict['arrival_icao']         = icao
                flight_dict["arrival_time_local"]   = flight["arrival"].get('scheduledTimeLocal', pd.NaT)
                flight_dict["arrival_terminal"]     = flight['arrival'].get('terminal', "unknown")
                flight_dict["departure_city"]       =  flight["departure"]["airport"].get("name", "unknown")
                flight_dict["departure_icao"]       =  flight["departure"]["airport"].get("icao", "unknown")
                flight_dict["departure_time_local"] = flight["departure"].get("scheduledTimeLocal", pd.NaT)
                flight_dict["airline"]              =  flight["airline"].get("name", "unknown")
                flight_dict["flight_number"]        =  flight.get("number", "unknown")
                flight_dict["data_retrieved_on"]    =  datetime.now().astimezone(timezone('Europe/Berlin')).date()
                flight_df_l.append(flight_dict)
    return flight_df_l

In [10]:
	#TODO: Replace with list
 
#airport_id_l = ["EGBB","EPWA"]
flights_df_l = connect_flight_api(airport_T_df)

In [11]:
flight_df = pd.DataFrame(flights_df_l)

In [12]:
len(flight_df)

90

In [13]:
flight_df

Unnamed: 0,arrival_icao,arrival_time_local,arrival_terminal,departure_city,departure_icao,departure_time_local,airline,flight_number,data_retrieved_on
0,EDDH,2023-01-10 07:35+01:00,2,Frankfurt-am-Main,EDDF,2023-01-10 06:30+01:00,Lufthansa,LH 2,2023-01-09
1,EDDH,2023-01-10 07:45+01:00,1,Stuttgart,EDDS,2023-01-10 06:25+01:00,Eurowings,EW 2044,2023-01-09
2,EDDH,2023-01-10 07:55+01:00,1,Cologne,EDDK,2023-01-10 06:50+01:00,Eurowings,EW 32,2023-01-09
3,EDDH,2023-01-10 08:00+01:00,1,Duesseldorf,EDDL,2023-01-10 07:00+01:00,Eurowings,EW 9030,2023-01-09
4,EDDH,2023-01-10 08:00+01:00,2,Munich,EDDM,2023-01-10 06:40+01:00,Lufthansa,LH 2050,2023-01-09
...,...,...,...,...,...,...,...,...,...
85,EDDH,2023-01-10 22:30+01:00,1,Munich,EDDM,2023-01-10 21:00+01:00,Eurowings,EW 7177,2023-01-09
86,EDDH,2023-01-10 22:35+01:00,2,Munich,EDDM,2023-01-10 21:15+01:00,Lufthansa,LH 2088,2023-01-09
87,EDDH,2023-01-10 22:35+01:00,2,Frankfurt-am-Main,EDDF,2023-01-10 21:30+01:00,Lufthansa,LH 36,2023-01-09
88,EDDH,2023-01-10 18:45+01:00,2,Copenhagen,EKCH,2023-01-10 17:45+01:00,SAS,SK 651,2023-01-09


In [14]:
flight_df.arrival_terminal.unique()

array(['unknown'], dtype=object)

## Transform

### change timestamp type

In [27]:
flight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   arrival_icao          286 non-null    object             
 1   arrival_time_local    286 non-null    datetime64[ns, UTC]
 2   arrival_terminal      286 non-null    object             
 3   departure_city        286 non-null    object             
 4   departure_icao        286 non-null    object             
 5   departure_time_local  246 non-null    datetime64[ns, UTC]
 6   airline               286 non-null    object             
 7   flight_number         286 non-null    object             
 8   data_retrieved_on     286 non-null    datetime64[ns]     
dtypes: datetime64[ns, UTC](2), datetime64[ns](1), object(6)
memory usage: 20.2+ KB


In [23]:
flight_df.arrival_time_local = pd.to_datetime(flight_df.arrival_time_local,utc=True)

In [17]:
flight_df.data_retrieved_on = pd.to_datetime(flight_df.data_retrieved_on)

In [19]:
flight_df.loc[flight_df.departure_time_local == pd.NaT]

Unnamed: 0,arrival_icao,arrival_time_local,arrival_terminal,departure_city,departure_icao,departure_time_local,airline,flight_number,data_retrieved_on


In [26]:
flight_df.departure_time_local = pd.to_datetime(flight_df.departure_time_local,utc=True)

In [20]:
flight_df.departure_time_local.sample(10)

110    2023-01-07 20:15:00+00:00
233    2023-01-07 15:35:00+03:00
54                           NaT
162    2023-01-07 08:05:00+04:00
135    2023-01-07 07:00:00+03:00
140    2023-01-07 08:10:00+03:00
227    2023-01-07 14:35:00+01:00
49                           NaT
97     2023-01-07 17:20:00+01:00
257    2023-01-07 19:30:00+01:00
Name: departure_time_local, dtype: object

## 3.Load

### load the flight info to Local MySQL

In [28]:
flight_df.to_sql('flight',con=con,if_exists='append',index=False)

286

: 

### load the flight info to AWS MySQL

In [None]:
schema="gans_db"   # name of the database you want to use here
host="localhost"        # to connect to your local server
user="root"
password=key.SQL_PASSWORD # your password!!!!
port=3306
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'
airport_T_df = pd.read_sql_table('airport',con=con)