### WBS Coding School
___
# Data Engineering Project

This is a data engineering project, in which I use Python, MySQL and AWS Services to create and automatically update an online database.

It is a learning project, in which I practise several data engineering techniques, such as API calls and AWS Lambda functions.

Our tasks are:
- [x] Collect data
- [ ] Clean data
- [ ] Create a database
- [ ] Update the database with the latest data
- [ ] Move the data pipeline to the Cloud (AWS)

We will perform this project using *Python*, *MySQL* and the AWS Services *RDS*, *Lambda* and *EventBridge*.
___

# Data Collection

This script uses API calls and web-scraping to collect data on cities, airports, flight arrivals and weather forecasts.

Most importantly, it sets up the functions for collecting data, which will later on be used in AWS Lambda functions.

### Table of contents:
- [Get Cities data](#cities_data)
- [Get Weather Data](#weather_data)
- [Get Airports & Arrivals Data](#airports_arrivals_data)
- [Export Data](#export_data)

#### Import Libraries

In [5]:
import os
import pandas as pd
from datetime import datetime, timedelta, date
import re

from bs4 import BeautifulSoup # web-scraping
import requests # API calls
import config_file # contains the OpenWeather and AeroDataBox API keys

### Cities of interest
These cities are the ones I will collect data on during this Data Engineering project.

In [2]:
# Names of the cities to collect data on; this list is passed to the data collection functions below.
cities_of_interest = ["Berlin", "Dresden", "Madrid", "Tokyo", "London", "Shanghai"]

 <a id="cities_data"></a>
 ## Cities data (Web Scraping)
 Collect data on cities from their Wikipedia websites using web-scraping and the package `BeautifulSoup`.

In [3]:
# Read in wikipedia source code, brew soup
def brew_soup(city: str, timeout: float = 30):
    """Download the English Wikipedia page of a city and parse its HTML.

    Args:
        city: Page title as it appears in the Wikipedia URL, e.g. "Berlin".
        timeout: Seconds to wait for Wikipedia to respond before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A BeautifulSoup object built from the page content.

    Raises:
        requests.HTTPError: If Wikipedia answers with a 4xx/5xx status code,
            so that an error page is never parsed as if it were a city page.
    """
    url = f"https://en.wikipedia.org/wiki/{city}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

# Save data from html code in dictionary
def get_city_info(soup):
    """Extract basic facts about a city from its parsed Wikipedia page.

    Args:
        soup: Parsed Wikipedia page of the city (as returned by `brew_soup`).
            The page is expected to contain a settlement infobox
            (`table.infobox.ib-settlement.vcard`).

    Returns:
        A dict with the keys "city_name", "country", "latitude", "longitude",
        "population" and "altitude". Coordinates are returned as the text found
        on the page; "altitude" is an int (meters) or None if no altitude
        could be found.

    Raises:
        AttributeError, IndexError: If one of the mandatory page elements
            (title, infobox, coordinates, population) is missing.
    """
    # All data cells of the settlement infobox; used for the country and the altitude
    infobox_data = soup.select("table.infobox.ib-settlement.vcard > tbody > tr > td.infobox-data")

    city_info = {}
    # City, country, coordinates
    city_info["city_name"] = soup.find('span', {'class': 'mw-page-title-main'}).get_text()
    city_info["country"] = infobox_data[0].get_text()
    city_info["latitude"] = soup.find('span', {'class': 'latitude'}).get_text()
    city_info["longitude"] = soup.select('.longitude')[0].string
    # Population
    population_info = soup.select_one('th.infobox-header:-soup-contains("Population")')
    city_info["population"] = population_info.parent.find_next_sibling().find('td', {'class': 'infobox-data'}).string
    # Altitude: the first infobox cell mentioning 'ft' (feet) is taken as the altitude row
    altitude_text = next((cell.get_text() for cell in infobox_data if "ft" in cell.text), None)
    altitude_in_meters = None
    if altitude_text is not None:
        # First number followed by 'm'; thousands separators ("2,240 m") and decimals are allowed
        match = re.search(r"(\d[\d,]*(?:\.\d+)?)\s*m", altitude_text)
        if match:
            altitude_in_meters = round(float(match.group(1).replace(",", "")))
    city_info["altitude"] = altitude_in_meters
    return city_info

def get_cities_data(cities: list):
    """Scrape the Wikipedia data of several cities into dataframes.

    Args:
        cities: Names of the cities to scrape.

    Returns:
        A tuple `(cities_df, populations_df)`: the first holds one row per city
        with all scraped fields, the second only its "city_name" and
        "population" columns.
    """
    # Web scrape each city's info, keyed by the requested city name
    scraped_info = {city: get_city_info(brew_soup(city)) for city in cities}
    # One row per city; the requested names (dict keys) are replaced by a plain integer index
    cities_df = pd.DataFrame.from_dict(scraped_info, orient='index').reset_index(drop=True)
    populations_df = cities_df[["city_name", "population"]]
    return cities_df, populations_df

In [4]:
# Scrape all cities of interest: the full cities table plus its city_name/population subset
cities_df, populations_df = get_cities_data(cities_of_interest)

 <a id="weather_data"></a>
 ## Weather data (API call)
 Collect the upcoming weather forecast (up to `cnt` forecast timestamps per city) for the `cities_of_interest` from OpenWeatherMap using the `requests` package.

In [6]:
def create_weather_response(city: str, cnt = 50, timeout: float = 30):
    """Request the weather forecast of a city from the OpenWeatherMap forecast API.

    Args:
        city: Name of the city to query.
        cnt: Number of forecast timestamps to request.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The raw `requests` response; the status code is not checked here.
    """
    base_url = "https://api.openweathermap.org/data/2.5/forecast"
    # Passing the query as `params` lets requests URL-encode the values
    # (e.g. city names containing spaces or special characters).
    query = {
        "appid": config_file.OPEN_WEATHER_API_KEY,
        "q": city,
        "cnt": cnt,
        "units": "metric",
        "lang": "en",
    }
    return requests.get(base_url, params=query, timeout=timeout)

def create_weather_dataframe(response: requests.models.Response):
    """Flatten an OpenWeatherMap forecast response into a dataframe.

    Args:
        response: Response whose JSON body contains the keys "list"
            (forecast entries) and "city" (name, coordinates, country).

    Returns:
        A dataframe with the columns city, forecast_time, outlook,
        outlook_description, temperature, feels_like, wind_speed, latitude,
        longitude and country.
    """
    payload = response.json()
    city_json = payload["city"]
    city_name = city_json["name"]

    # Forecast entries: the "weather" column holds a list of dictionaries ...
    forecast_df = pd.json_normalize(payload["list"])
    forecast_df["city"] = city_name
    # ... which is exploded and normalized into its own dataframe ...
    outlook_df = pd.json_normalize(forecast_df.explode("weather")["weather"])
    outlook_df["city"] = city_name
    # NOTE(review): this alignment presumably relies on one weather dict per forecast entry - confirm
    outlook_df["dt"] = forecast_df["dt"]
    # ... and joined back onto the forecast entries.
    merged_df = outlook_df.merge(forecast_df, on=["city", "dt"])

    # Keep and rename the columns of interest
    kept_columns = ["city", "dt_txt", "main", "description", "main.temp", "main.feels_like", "wind.speed"]
    renamed_columns = {
        "dt_txt": "forecast_time",
        "main": "outlook",
        "description": "outlook_description",
        "main.temp": "temperature",
        "main.feels_like": "feels_like",
        "wind.speed": "wind_speed"
    }
    forecast_clean_df = merged_df[kept_columns].rename(columns=renamed_columns)

    # Single-row dataframe with the city's location data
    city_data_df = pd.DataFrame(
        {
            "city": city_name,
            "latitude": city_json['coord']['lat'],
            "longitude": city_json['coord']['lon'],
            "country": city_json['country']
        },
        index=[0],
    )
    # Attach the city data to every forecast row
    return forecast_clean_df.merge(city_data_df, on="city", how="inner")

def get_weather_data(cities: list):
    """Collect the weather forecasts of several cities in one dataframe.

    Args:
        cities: Names of the cities to fetch forecasts for.

    Returns:
        The forecasts of all cities stacked into one dataframe with a fresh
        integer index; an empty dataframe if `cities` is empty.
    """
    # Collect the per-city frames first and concatenate once, instead of
    # growing a dataframe inside the loop.
    city_frames = [
        create_weather_dataframe(create_weather_response(city))
        for city in cities
    ]
    if not city_frames:
        return pd.DataFrame()
    return pd.concat(city_frames).reset_index(drop=True)

In [7]:
weather_df = get_weather_data(cities_of_interest)

 <a id="airports_arrivals_data"></a>
 ## Airports and Arrivals data (API calls)
 Fetch data on airports near my `cities_of_interest` and tomorrow's arriving flights from the AeroDataBox API on RapidAPI.

#### Airports

In [8]:
def get_airport_data(latitudes: list, longitudes: list, radius=50, limit=10, timeout=30):
    """Search the aerodatabox API for airports around the given coordinates.

    Args:
        latitudes: Latitudes of the search locations.
        longitudes: Longitudes of the search locations, paired with
            `latitudes` by position.
        radius: Search radius, sent as `radiusKm`.
        limit: Maximum number of airports per location, sent as `limit`.
        timeout: Seconds to wait for each API call before giving up.

    Returns:
        One dataframe with the airports found for all locations; an empty
        dataframe if no coordinates are given.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
    """
    coordinates = list(zip(latitudes, longitudes))
    if not coordinates:
        # pd.concat would raise on an empty list of dataframes
        return pd.DataFrame()

    # URL and headers are identical for every location, so build them once
    url = "https://aerodatabox.p.rapidapi.com/airports/search/location"
    headers = {
        "X-RapidAPI-Key": config_file.AERODATABOX_API_KEY,
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    list_of_dfs = []
    for lat, lon in coordinates:
        querystring = {"lat": lat, "lon": lon, "radiusKm": radius, "limit": limit, "withFlightInfoOnly": "true"}
        response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
        # Fail with a clear HTTP error rather than a KeyError on the missing 'items' key
        response.raise_for_status()
        list_of_dfs.append(pd.json_normalize(response.json()['items']))
    return pd.concat(list_of_dfs, ignore_index=True)

def convert_coordinates(coordinate_dms):
    """Convert a coordinate from degrees/minutes/seconds to decimal degrees.

    Args:
        coordinate_dms: Coordinate such as "52°31′12″N". Minutes and seconds
            are optional ("51°03′N", "33°S"); the trailing letter gives the
            hemisphere.

    Returns:
        The coordinate in decimal degrees, rounded to 4 decimal places.
        Western and southern coordinates are negative.

    Raises:
        ValueError: If the string has no degree part, has more than three
            numeric parts, or a part is not a number.
    """
    # Everything before the last separator is numeric, the rest is the hemisphere
    *components, direction = re.split(r"[°′″]", coordinate_dms)
    if not 1 <= len(components) <= 3:
        raise ValueError(f"Cannot parse coordinate: {coordinate_dms!r}")
    # Missing minutes and/or seconds count as zero
    deg, minutes, seconds = (components + ["0", "0"])[:3]
    sign = -1 if direction.strip().upper() in ("W", "S") else 1
    coordinate_decimal = (float(deg) + float(minutes) / 60 + float(seconds) / (60 * 60)) * sign
    return round(coordinate_decimal, 4)

In [9]:
# Convert the coordinates of our cities from degrees, minutes, seconds to decimal degrees
for coordinate_column in ("latitude", "longitude"):
    for row in cities_df.index:
        cities_df.loc[row, coordinate_column] = convert_coordinates(cities_df.loc[row, coordinate_column])

In [10]:
# Coordinates of our cities, in the same row order, so that they pair up by position
latitudes = cities_df["latitude"].to_list()
longitudes = cities_df["longitude"].to_list()

# Search for airports around every city (default radius and limit)
airports_df = get_airport_data(latitudes, longitudes)

#### Arrivals

In [11]:
def create_date_range(days_ahead: int = 1):
    """Build the two 12 h time windows that together cover one whole day.

    The aerodatabox API returns flights data for 12 h intervals, hence a day
    is split into a "morning" (00:00-12:00) and an "evening" (12:00-24:00)
    window.

    Args:
        days_ahead: Offset of the covered day from today; the default 1
            covers tomorrow.

    Returns:
        A dict with the keys "morning" and "evening", each holding a
        `[start, end]` list of local timestamps formatted as
        '%Y-%m-%dT%H:%M'.
    """
    time_format = '%Y-%m-%dT%H:%M'
    # Read today's date only once so that all timestamps share the same base
    day_start = datetime.combine(date.today(), datetime.min.time()) + timedelta(days=days_ahead)
    midday = day_start + timedelta(hours=12)
    day_end = day_start + timedelta(days=1)
    return {
        "morning": [day_start.strftime(time_format), midday.strftime(time_format)],
        "evening": [midday.strftime(time_format), day_end.strftime(time_format)],
    }


def get_arrivals_response(icao_code: str, date_range: dict, day_time: str, timeout: float = 30):
    """Request the arrivals of one airport for one time window from aerodatabox.

    Args:
        icao_code: ICAO code of the airport.
        date_range: Dict mapping a window name to `[start, end]` timestamps
            (as returned by `create_date_range`).
        day_time: Key of the window to query, e.g. "morning" or "evening".
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The raw `requests` response; the status code is left for the caller
        to check.

    Raises:
        KeyError: If `day_time` is not a key of `date_range`.
    """
    time_window = date_range[day_time]
    url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao_code}/{time_window[0]}/{time_window[1]}"
    querystring = {
        "withLeg": "false", "direction": "Arrival",
        "withCancelled": "false", "withCodeshared": "false",
        "withCargo": "false", "withPrivate": "false", "withLocation": "false"
    }
    headers = {
        "X-RapidAPI-Key": config_file.AERODATABOX_API_KEY,
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }
    return requests.get(url, headers=headers, params=querystring, timeout=timeout)


def create_arrivals_df(response: requests.models.Response, icao_code: str):
    """Turn an aerodatabox arrivals response into a tidy dataframe.

    Args:
        response: Response whose JSON body contains the key "arrivals".
        icao_code: ICAO code of the arrival airport, stored in column "icao".

    Returns:
        A dataframe with the columns icao, flight_number, arrival_time_utc,
        arrival_time_local, airline, departing_airport and
        departing_airport_icao. Fields missing from the response are filled
        with NaN; an empty arrivals list yields an empty dataframe with these
        columns.
    """
    arrivals_json = response.json()["arrivals"]
    arrivals_df = pd.json_normalize(arrivals_json)
    arrivals_df["icao"] = icao_code
    new_columns_dict = {
        "icao": "icao",
        "number": "flight_number",
        "movement.scheduledTime.utc": "arrival_time_utc",
        "movement.scheduledTime.local": "arrival_time_local",
        "airline.name": "airline",
        "movement.airport.name": "departing_airport",
        "movement.airport.icao": "departing_airport_icao"
    }
    old_columns = list(new_columns_dict)
    # reindex (instead of plain column selection) tolerates fields that are absent
    # from the response, e.g. when the arrivals list is empty
    arrivals_df_clean = arrivals_df.reindex(columns=old_columns).rename(columns=new_columns_dict)
    return arrivals_df_clean


def get_arrivals_data(icao_codes: list):
    """Collect the arrivals of several airports in one dataframe.

    Args:
        icao_codes: ICAO codes of the airports to query.

    Returns:
        The arrivals of all airports and both time windows stacked into one
        dataframe with a fresh integer index; an empty dataframe if no codes
        are given or no airport returned data.
    """
    icao_codes = list(icao_codes)
    if not icao_codes:
        return pd.DataFrame()

    # Get start & end date
    date_range = create_date_range()

    arrivals_list = []
    for icao in icao_codes:
        for day_time in ['morning', 'evening']:
            response = get_arrivals_response(icao_code=icao, date_range=date_range, day_time=day_time)
            if response.status_code != 200: # for some airports, there is no data in the aerodatabox
                continue
            arrivals_list.append(create_arrivals_df(response, icao_code=icao))

    # Concatenate once, after all airports were queried; pd.concat raises on an empty list
    if not arrivals_list:
        return pd.DataFrame()
    return pd.concat(arrivals_list).reset_index(drop=True)

In [12]:
# Get ICAO codes
# (ICAO codes are unique airport identifiers)
icao_codes = airports_df["icao"].to_list()

In [13]:
# Fetch the arrivals for every airport found above
arrivals_df = get_arrivals_data(icao_codes)

<a id="export_data"></a>
## Export Data

In [15]:
# Create the output directory (no error if it already exists)
os.makedirs('dataframes/initial', exist_ok=True)

# Export each dataframe to its own CSV file, without the index column
exports = [
    (cities_df, "dataframes/initial/cities_df.csv"),
    (populations_df, "dataframes/initial/populations_df.csv"),
    (weather_df, "dataframes/initial/weather_df.csv"),
    (airports_df, "dataframes/initial/airports_df.csv"),
    (arrivals_df, "dataframes/initial/arrivals_df.csv"),
]
for dataframe, csv_path in exports:
    dataframe.to_csv(csv_path, index=False)