In [1]:
# Importing libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv
import json
import pprint

# Task 1: Data Collection via REST APIs
In this initial phase, I implemented the programmatic retrieval of environmental data for Rome (Villa Ada) covering a six-month period from July 2025 to January 2026. I interfaced with the Open-Meteo Archive API for meteorological variables and the OpenAQ v3 API for air quality metrics, managing credentials via a .env file. A key step involved utilizing the OpenAQ v3 /sensors/{id}/days endpoint to obtain daily-aggregated data for NO2​, O3​, and CO. All raw responses were stored locally as JSON files to ensure data persistence. By carefully aligning the query parameters and setting an appropriate record limit, I ensured that both datasets covered the same temporal window, establishing a solid foundation for the subsequent integration.

In [2]:
# Quering weather data from open-meteo API

# Querying the data
weather_url = "https://archive-api.open-meteo.com/v1/archive"
weather_params = {
	"latitude": 41.89193,
	"longitude": 12.51133,
	"start_date": "2025-07-08",
	"end_date": "2026-01-08",
	"daily": [
        "temperature_2m_mean", 
        "wind_speed_10m_max", 
        "precipitation_sum", 
        "relative_humidity_2m_mean"
    ],
	"timezone": "Europe/Rome"
}
weather_response = requests.get(
	weather_url, 
	params=weather_params
)

# Checking the response status
print(weather_response.status_code)

# Saving the data to a file
weather_data = weather_response.json()
with open('data/weather_data.json', 'w') as f:
    json.dump(weather_data, f, indent=4)

200


In [3]:
# Quering air quality data from openaq API using the official wrapper

# Loading the API key
load_dotenv()
api_key = os.getenv("OPENAQ_API_KEY")
headers = {"X-API-Key": api_key}

# We have to get sensor ids from the API
locations_params = {
    "coordinates": "41.89193, 12.51133",
    "radius": 5000,
    "limit": 1000,
    "iso": "IT",
}

locations_response = requests.get("https://api.openaq.org/v3/locations", params=locations_params, headers=headers, timeout=30)
print(locations_response.status_code)

# Printing json file to get sensor ids
pprint.pp(locations_response.json())

200
{'meta': {'name': 'openaq-api',
          'website': '/',
          'page': 1,
          'limit': 1000,
          'found': 12},
 'results': [{'id': 7527,
              'name': 'L.GO MAGNA GRECIA',
              'locality': 'LARGO MAGNA GRECIA - Roma (RM)',
              'timezone': 'Europe/Rome',
              'country': {'id': 91, 'code': 'IT', 'name': 'Italy'},
              'owner': {'id': 4, 'name': 'Unknown Governmental Organization'},
              'provider': {'id': 70, 'name': 'EEA'},
              'isMobile': False,
              'isMonitor': True,
              'instruments': [{'id': 2, 'name': 'Government Monitor'}],
              'sensors': [{'id': 4272966,
                           'name': 'no µg/m³',
                           'parameter': {'id': 19843,
                                         'name': 'no',
                                         'units': 'µg/m³',
                                         'displayName': 'NO mass'}},
                          {'id': 2

In [4]:
# We will use Villa Ada sensors
sensor_ids = ["21950", "21870", "21955"]
pollutants = ["no2", "o3", "co"]

# Quering the API for each sensor
for id, pollutant in zip(sensor_ids, pollutants):
    aq_url = f"https://api.openaq.org/v3/sensors/{id}/days"
    aq_params = {
        "date_from": "2025-07-08T00:00:00",
        "date_to": "2026-01-08T23:59:59",
        "limit": 1000,
        "group_by": "day" 
    }
    aq_response = requests.get(
        aq_url,
        params=aq_params,
        headers=headers
    )

    # Checking the response status
    print(aq_response.status_code)

    # Saving the data to a file
    aq_data = aq_response.json()
    with open(f'data/{pollutant}_data.json', 'w') as f:
        json.dump(aq_data, f, indent=4)

200
200
200


# Task 2: Data Cleaning and Integration
Task 2 focused on cleaning and integrating the raw JSON datasets into a single cohesive structure. The primary challenge was aligning conflicting temporal formats: while Open-Meteo provided 'naive' dates, OpenAQ timestamps included UTC offsets. I developed a processing pipeline that standardizes these by slicing the strings to a YYYY-MM-DD format and normalizing the Pandas DatetimeIndex to a daily resolution. I also implemented a pivoting operation to transform the air quality data from a 'long' format to a 'wide' format. The final result is a unified DataFrame offering a synchronized daily view of the environment, where weather variables and atmospheric pollutants are merged via a horizontal join, creating a clean dataset ready for statistical analysis.

In [5]:
# Loading weather data
with open('data/weather_data.json', 'r') as f:
    w_json = json.load(f)

# Creazione DataFrame con conversione datetime esplicita
df_weather = pd.DataFrame(w_json['daily'], index=pd.to_datetime(w_json['daily']['time'])).drop(columns=['time'])
df_weather.index = pd.to_datetime(df_weather.index).normalize()
df_weather.head()

Unnamed: 0,temperature_2m_mean,wind_speed_10m_max,precipitation_sum,relative_humidity_2m_mean
2025-07-08,26.3,15.7,0.0,56
2025-07-09,24.0,18.8,0.0,49
2025-07-10,23.7,14.3,0.0,48
2025-07-11,24.9,15.2,0.0,46
2025-07-12,25.8,15.8,0.0,46


In [6]:
# Creating a function to wrangle each air quality data
def process_openaq_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Exctracting results
    results = data.get('results', [])
    
    # Exctracting useful data
    rows = []
    for item in results:
        rows.append({
            'datetime': item.get('period', {}).get('datetimeTo', {}).get('local'),
            'parameter': item.get('parameter', {}).get('name'),
            'value': item.get('value')
        })
    
    # Creating data frames
    df = pd.DataFrame(rows)
    
    # Converting datetime object
    df['datetime'] = pd.to_datetime(df['datetime'].str.slice(0, 10), errors='coerce')
    df = df.dropna(subset=['datetime'])
    
    # Transforming df
    df = df.pivot_table(index='datetime', columns='parameter', values='value')
    return df

# Files to wrangle
files = ['data/co_data.json', 'data/no2_data.json', 'data/o3_data.json']

# Applying the function to each file
dfs = [process_openaq_json(f) for f in files]

# Joining all data frames
df_aq = pd.concat(dfs, axis=1).sort_index()
df_aq.head()

parameter,co,no2,o3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-09,113.0,4.38,77.9
2025-07-10,125.0,7.61,77.1
2025-07-11,108.0,11.3,83.1
2025-07-12,163.0,19.3,83.2
2025-07-13,188.0,20.2,79.5


In [15]:
# Joining air quality data and weather ones
df = pd.concat([df_aq, df_weather], axis=1).sort_index()

# Removing rows with missing values
df.dropna(inplace=True)

# Saving the data to a csv file
df.to_csv('data/combined_data.csv')

df.head()

Unnamed: 0,co,no2,o3,temperature_2m_mean,wind_speed_10m_max,precipitation_sum,relative_humidity_2m_mean
2025-07-09,113.0,4.38,77.9,24.0,18.8,0.0,49.0
2025-07-10,125.0,7.61,77.1,23.7,14.3,0.0,48.0
2025-07-11,108.0,11.3,83.1,24.9,15.2,0.0,46.0
2025-07-12,163.0,19.3,83.2,25.8,15.8,0.0,46.0
2025-07-13,188.0,20.2,79.5,23.9,21.1,8.8,63.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 183 entries, 2025-07-09 to 2026-01-08
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   co                         183 non-null    float64
 1   no2                        183 non-null    float64
 2   o3                         183 non-null    float64
 3   temperature_2m_mean        183 non-null    float64
 4   wind_speed_10m_max         183 non-null    float64
 5   precipitation_sum          183 non-null    float64
 6   relative_humidity_2m_mean  183 non-null    float64
dtypes: float64(7)
memory usage: 11.4 KB


# Task 3: Exploratory Data Analysis

# Task 4: Machine Learning

# Task 5: Evaluation and Interpretation