# 📨 Data Sourcing.

### In this Notebook, we'll Source our Data from OpenMeteo API and see how the data is formatted and how it can be represented in Tabular Format.

### Importing Necessary Modules

In [1]:
import requests
import wget
import gzip
import json

import time

import pandas as pd

import os
from dotenv import load_dotenv

load_dotenv("../.env")

import sys

sys.path.append("../Scripts/")
sys.path.append("../")

import paths
import config

### In order to obtain all the Raw Data we want, we need to fetch both Hourly and historical (Hourly) data for Weather and Air Quality from OpenMeteo.com

### Let's First import the list of Cities from our config.py file.

In [2]:
Cities = config.Cities

print([x["CityName"] for x in Cities])

['Aosta', 'Torino', 'Trento', 'Milano', 'Venezia', 'Trieste', 'Genova', 'Firenze', 'Bologna', 'Ancona', 'Perugia', 'Roma', 'Napoli', "L'Aquila", 'Campobasso', 'Bari', 'Potenza', 'Catanzaro', 'Palermo', 'Cagliari']


### Now, Let's fetch a sample of current Data of both AirQuality and WeatherData from OpenMeteo.com API.

In [3]:
#Fetching Current Air Quality Hourly Data

url = "https://air-quality-api.open-meteo.com/v1/air-quality"

params = {
    "latitude": 52.52,
    "longitude": 13.41,
    "current": "european_aqi",
    #"hourly": ["pm10", "pm2_5", "carbon_monoxide", "nitrogen_dioxide", "sulphur_dioxide", "ozone", "dust", "uv_index", "european_aqi"],
    "timezone": "Europe/Berlin"
}

response = requests.get(url, params=params)

json.loads(response.text)

{'latitude': 52.549995,
 'longitude': 13.450001,
 'generationtime_ms': 0.10097026824951172,
 'utc_offset_seconds': 3600,
 'timezone': 'Europe/Berlin',
 'timezone_abbreviation': 'CET',
 'elevation': 38.0,
 'current_units': {'time': 'iso8601',
  'interval': 'seconds',
  'european_aqi': 'EAQI'},
 'current': {'time': '2024-03-08T23:00', 'interval': 3600, 'european_aqi': 34}}

In [4]:
#Fetching Current Conditions

url = "https://api.open-meteo.com/v1/forecast"

params = {
    "latitude": 52.52,
    "longitude": 13.41,
    "current": "temperature_2m",
    "timezone": "Europe/Berlin",
}

response = requests.get(url, params=params)

json.loads(response.text)

{'latitude': 52.52,
 'longitude': 13.419998,
 'generationtime_ms': 0.02491474151611328,
 'utc_offset_seconds': 3600,
 'timezone': 'Europe/Berlin',
 'timezone_abbreviation': 'CET',
 'elevation': 38.0,
 'current_units': {'time': 'iso8601',
  'interval': 'seconds',
  'temperature_2m': '°C'},
 'current': {'time': '2024-03-08T23:15',
  'interval': 900,
  'temperature_2m': 2.7}}

In [5]:
#Fetching on a Fixed Date and Time

url = "https://api.open-meteo.com/v1/forecast"

params = {
    "latitude": 52.52,
    "longitude": 13.41,
    "hourly": "temperature_2m",
    "timezone": "Europe/Berlin",
    "start_hour": "2024-03-05T10:00",
    "end_hour": "2024-03-05T10:00"
}

response = requests.get(url, params=params)

json.loads(response.text)

{'latitude': 52.52,
 'longitude': 13.419998,
 'generationtime_ms': 0.030994415283203125,
 'utc_offset_seconds': 3600,
 'timezone': 'Europe/Berlin',
 'timezone_abbreviation': 'CET',
 'elevation': 38.0,
 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C'},
 'hourly': {'time': ['2024-03-05T10:00'], 'temperature_2m': [4.2]}}

### As you can see OpenMeteo.com provides a super efficient API (They also provide a python sdk too!!) from which we can fetch the data we're interested into.

### Now, let's Fetch a sample of Historical Hourly Data for the past year.

In [6]:
url = "https://air-quality-api.open-meteo.com/v1/air-quality"

params = {
    "latitude": 43.615849, 
    "longitude": 13.518740,
    "hourly": "european_aqi",
    "timezone": "Europe/Berlin",
    "start_date": "2022-09-01",
    "end_date": "2024-02-29"
}

response = requests.get(url, params=params)

json.loads(response.text)

{'latitude': 43.649994,
 'longitude': 13.549999,
 'generationtime_ms': 11.303067207336426,
 'utc_offset_seconds': 3600,
 'timezone': 'Europe/Berlin',
 'timezone_abbreviation': 'CET',
 'elevation': 33.0,
 'hourly_units': {'time': 'iso8601', 'european_aqi': 'EAQI'},
 'hourly': {'time': ['2022-09-01T00:00',
   '2022-09-01T01:00',
   '2022-09-01T02:00',
   '2022-09-01T03:00',
   '2022-09-01T04:00',
   '2022-09-01T05:00',
   '2022-09-01T06:00',
   '2022-09-01T07:00',
   '2022-09-01T08:00',
   '2022-09-01T09:00',
   '2022-09-01T10:00',
   '2022-09-01T11:00',
   '2022-09-01T12:00',
   '2022-09-01T13:00',
   '2022-09-01T14:00',
   '2022-09-01T15:00',
   '2022-09-01T16:00',
   '2022-09-01T17:00',
   '2022-09-01T18:00',
   '2022-09-01T19:00',
   '2022-09-01T20:00',
   '2022-09-01T21:00',
   '2022-09-01T22:00',
   '2022-09-01T23:00',
   '2022-09-02T00:00',
   '2022-09-02T01:00',
   '2022-09-02T02:00',
   '2022-09-02T03:00',
   '2022-09-02T04:00',
   '2022-09-02T05:00',
   '2022-09-02T06:00',
   '

In [7]:
url = "https://archive-api.open-meteo.com/v1/archive"

params = {
    "latitude": 44.494888,
    "longitude": 11.342616,
    "hourly": "temperature_2m",
    "timezone": "Europe/Berlin",
    "start_date": "2022-09-01",
    "end_date": "2024-02-29"
}

response = requests.get(url, params=params)

json.loads(response.text)

{'latitude': 44.46397,
 'longitude': 11.319017,
 'generationtime_ms': 3.5649538040161133,
 'utc_offset_seconds': 3600,
 'timezone': 'Europe/Berlin',
 'timezone_abbreviation': 'CET',
 'elevation': 71.0,
 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C'},
 'hourly': {'time': ['2022-09-01T00:00',
   '2022-09-01T01:00',
   '2022-09-01T02:00',
   '2022-09-01T03:00',
   '2022-09-01T04:00',
   '2022-09-01T05:00',
   '2022-09-01T06:00',
   '2022-09-01T07:00',
   '2022-09-01T08:00',
   '2022-09-01T09:00',
   '2022-09-01T10:00',
   '2022-09-01T11:00',
   '2022-09-01T12:00',
   '2022-09-01T13:00',
   '2022-09-01T14:00',
   '2022-09-01T15:00',
   '2022-09-01T16:00',
   '2022-09-01T17:00',
   '2022-09-01T18:00',
   '2022-09-01T19:00',
   '2022-09-01T20:00',
   '2022-09-01T21:00',
   '2022-09-01T22:00',
   '2022-09-01T23:00',
   '2022-09-02T00:00',
   '2022-09-02T01:00',
   '2022-09-02T02:00',
   '2022-09-02T03:00',
   '2022-09-02T04:00',
   '2022-09-02T05:00',
   '2022-09-02T06:00',
   '2

In [8]:
#Here we can see all the Keys in our Returned Dict(Json) values.

dict(json.loads(response.text))["hourly"].keys()

dict_keys(['time', 'temperature_2m'])

# ⚠️ NOTE: Make sure the timezone parameter is corretly set in GMT+1 (or Europe/Berlin).
# Also, Temperature is in Celsius, Wind Speed in km/h and Precipitations in Millimeters, while the European Air Quality Index (AQI) ranges from 0-20 (good), 20-40 (fair), 40-60 (moderate), 60-80 (poor), 80-100 (very poor) and exceeds 100 for extremely poor conditions.

### Now that we have played a bit with the API Syntax, let's move on and fetch Historical Hourly Data for Weather and Air Quality for each City in our List, and then dump them to disk as a .parquet file.

In [9]:
#First of all, for each City, we would need to make sure that the coordinates (long, lat) point to the center of the city, and then, fetch Historical Data by Coordinates.
#Code here is a bit dummy, I want it to be understandable here and not efficient, when refactored for deployment will be more efficient than understandable :D

for city in Cities:
    
    #You can check much better to see if it actually is an older file too!!
    #A basic check will work fine for now though for this example
    if os.path.exists(paths.RAW_DATA_DIR / f'{city["CityName"]}_HistoricalData_01092022_29022024.parquet'):
        print("-----------------------------------------------------")   
        print(f'Skipping for {city["CityID"]} - {city["CityName"]} as already into Disk')
        print("-----------------------------------------------------")   
        #If present on Disk, we simply move on.
        continue
    
    print("-----------------------------------------------------")   
    print(f'Fetching Data for {city["CityID"]} - {city["CityName"]}')
    print("-----------------------------------------------------")   
    
    lat = city["Latitude"]
    long = city["Longitude"]
    
    #This will be refactored as a Function obviously, let's just keep the code clean and understandable, leveraging JupyterLab's Cells
    
    #Historical Air Quality Data from September 1st 2022 to February 29th 2024
    
    airquality_url = "https://air-quality-api.open-meteo.com/v1/air-quality"

    params = {
        "latitude": lat,
        "longitude": long,
        "hourly": "european_aqi",
        "timezone": "Europe/Berlin",
        "start_date": "2022-09-01",
        "end_date": "2024-02-29"
    }

    response = requests.get(airquality_url, params=params)

    AQIdict = dict(json.loads(response.text))
    
    #zip(AQIdict["hourly"]["time"], AQIdict["hourly"]["european_aqi"]) equivalent to zipping unpacked(*) values from dictionary
    AQI_df = pd.DataFrame(data = zip(*AQIdict["hourly"].values()), columns = ["Date_GMT+1_Europe/Berlin", "EuropeanAQI"])         

    #Quick Check that Data didn't got messed up during zip operation, as order of values may change
    #Simply put, we create an array where we have True (True is equal to 1) when the DF data is equal to the Raw Outputted data, otherwise false.
    #If we sum up all the Trues, we end up with the total length if every value is in place, otherwise, we end up with a lower number and throw an error.
    #It's a bit mind buzzing, but the logic is simple to grasp:
    #We zip togheter in pairs the keys of the raw data dict and columns of df
    #For each pair then we check that the values are equal as above.
    
    dict_keys = list(AQIdict["hourly"].keys())
    for dictkey, dfcol in zip(dict_keys, AQI_df.columns):
        print(f'checking Air Quality Data Integrity of {city["CityName"]} for: {dictkey, dfcol}')
        
        totTrues = sum(AQI_df[dfcol].values == AQIdict["hourly"][dictkey]) == len(AQI_df)
        
        if not totTrues:
            #You can Actually Raise an Exception Error here
            print(f'An error has occurred, the length of {dictkey} Raw Data Dict Key and {dfcol} DataFrame Column are different or the values are not the exact same, check failed!!')
            break
      
    print("-----------------------------------------------------")
    print(f'Air Quality Data Integrity Check for {city["CityName"]} Passed!!')
    print("-----------------------------------------------------")
        
    #Historical Weather Data from September 1st 2022 to February 29th 2024

    weather_url = "https://archive-api.open-meteo.com/v1/archive"

    params = {
        "latitude": lat,
        "longitude": long,
        "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", 
                   "precipitation", "pressure_msl", "surface_pressure", "cloud_cover", 
                   "wind_speed_10m", "wind_speed_100m", "wind_direction_10m", "wind_direction_100m", 
                   "soil_temperature_0_to_7cm", "soil_temperature_7_to_28cm", "soil_temperature_28_to_100cm", 
                   "soil_temperature_100_to_255cm", "soil_moisture_0_to_7cm", "soil_moisture_7_to_28cm", "soil_moisture_28_to_100cm"],        
        "timezone": "Europe/Berlin",
        "start_date": "2022-09-01",
        "end_date": "2024-02-29"
    }

    response = requests.get(weather_url, params=params)

    Weatherdict = dict(json.loads(response.text))
    
    Weathercolumns = ["Date_GMT+1_Europe/Berlin",
                      "Temperature_2m", "Relative_Humidity_2m", "Dew_Point_2m", 
                      "Precipitation", "Pressure_msl", "Surface_Pressure", "Cloud_Cover", 
                      "Wind_Speed_10m", "Wind_Speed_100m", "Wind_Wirection_10m", "Wind_Direction_100m", 
                      "Soil_Temperature_0-7cm", "Soil_Temperature_7-28cm", "Soil_Temperature_28-100cm", "Soil_Temperature_100-255cm", 
                      "Soil_Moisture_0-7cm", "Soil_Moisture_7-28cm", "Soil_Moisture_28-100cm"]
    
    Weather_df = pd.DataFrame(data = zip(*Weatherdict["hourly"].values()), columns = Weathercolumns)
    
    #Same Trick as Before
    dict_keys = list(Weatherdict["hourly"].keys())
    for dictkey, dfcol in zip(dict_keys, Weather_df.columns):
        print(f'checking Weather Data Integrity of {city["CityName"]} for: {dictkey, dfcol}')
        
        totTrues = sum(Weather_df[dfcol].values == Weatherdict["hourly"][dictkey]) == len(Weather_df)
        
        if not totTrues:
            #You can Actually Raise an Exception Error here
            print(f'An error has occurred, the length of {dictkey} Raw Data Dict Key and {dfcol} DataFrame Column are different or the values are not the exact same, check failed!!')
            break
            
    print("-----------------------------------------------------")        
    print(f'Weather Data Integrity Check for {city["CityName"]} Passed!!')
    print("-----------------------------------------------------")

    #print("-----------------------------------------------------")        
    print("Merging all Data Together...")
    print("-----------------------------------------------------")
    
    Merged_df = pd.merge(Weather_df, AQI_df, on = ["Date_GMT+1_Europe/Berlin"])

    #A Quick check that the length of the df is untouched as we want an inner join on Date Columns.
    
    if len(Weather_df) == len(AQI_df) == len(Merged_df):
        #print("-----------------------------------------------------")        
        print("Data Correctly Merged!!")
        print("-----------------------------------------------------")
    
    #print("-----------------------------------------------------")        
    print("Dumping Data to Disk!!")
    #print("-----------------------------------------------------")
    
    Merged_df.to_parquet(paths.RAW_DATA_DIR / f'{city["CityName"]}_HistoricalData_01092022_29022024.parquet')
    
    #Required so we won't ping the OpenMeteo API too much!!
    time.sleep(5)

-----------------------------------------------------
Skipping for AO - Aosta as already into Disk
-----------------------------------------------------
-----------------------------------------------------
Skipping for TO - Torino as already into Disk
-----------------------------------------------------
-----------------------------------------------------
Skipping for TN - Trento as already into Disk
-----------------------------------------------------
-----------------------------------------------------
Skipping for MI - Milano as already into Disk
-----------------------------------------------------
-----------------------------------------------------
Skipping for VE - Venezia as already into Disk
-----------------------------------------------------
-----------------------------------------------------
Skipping for TS - Trieste as already into Disk
-----------------------------------------------------
-----------------------------------------------------
Skipping for GE - Gen