In [1]:
# Importing libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv
import json
import pprint

# Task 1: Data Collection via REST APIs
In this initial phase, I implemented the programmatic retrieval of environmental data for Rome (specifically the Villa Ada area) spanning a six-month period. Interfacing with the Open-Meteo and OpenAQ APIs required managing sensitive credentials via .env files and overcoming several technical hurdles. Notably, OpenAQ’s transition to version 3 of their infrastructure necessitated a significant refactoring of the request logic to resolve persistent 410 (Gone) and 404 (Not Found) errors. During collection, a critical challenge emerged regarding the 1,000-record default limit per query, which initially created a temporal discrepancy compared to the meteorological data. To ensure dataset integrity, I developed a local persistence system to store every response as a JSON file, establishing the framework for a scalable data acquisition strategy using pagination logic to bypass API response constraints.

In [2]:
# Quering weather data from open-meteo API

# Querying the data
weather_url = "https://archive-api.open-meteo.com/v1/archive"
weather_params = {
	"latitude": 41.89193,
	"longitude": 12.51133,
	"start_date": "2025-07-08",
	"end_date": "2026-01-08",
	"hourly": ["temperature_2m", "wind_speed_10m", "precipitation", "relative_humidity_2m"],
}
weather_response = requests.get(
	weather_url, 
	params=weather_params
)

# Checking the response status
print(weather_response.status_code)

# Saving the data to a file
weather_data = weather_response.json()
with open('data/weather_data.json', 'w') as f:
    json.dump(weather_data, f, indent=4)

200


In [3]:
# Quering air quality data from openaq API using the official wrapper

# Loading the API key
load_dotenv()
api_key = os.getenv("OPENAQ_API_KEY")
headers = {"X-API-Key": api_key}

# We have to get sensor ids from the API
locations_params = {
    "coordinates": "41.89193, 12.51133",
    "radius": 5000,
    "limit": 1000,
    "iso": "IT",
}

locations_response = requests.get("https://api.openaq.org/v3/locations", params=locations_params, headers=headers, timeout=30)
print(locations_response.status_code)

# Printing json file to get sensor ids
pprint.pp(locations_response.json())

200
{'meta': {'name': 'openaq-api',
          'website': '/',
          'page': 1,
          'limit': 1000,
          'found': 12},
 'results': [{'id': 7527,
              'name': 'L.GO MAGNA GRECIA',
              'locality': 'LARGO MAGNA GRECIA - Roma (RM)',
              'timezone': 'Europe/Rome',
              'country': {'id': 91, 'code': 'IT', 'name': 'Italy'},
              'owner': {'id': 4, 'name': 'Unknown Governmental Organization'},
              'provider': {'id': 70, 'name': 'EEA'},
              'isMobile': False,
              'isMonitor': True,
              'instruments': [{'id': 2, 'name': 'Government Monitor'}],
              'sensors': [{'id': 4272966,
                           'name': 'no µg/m³',
                           'parameter': {'id': 19843,
                                         'name': 'no',
                                         'units': 'µg/m³',
                                         'displayName': 'NO mass'}},
                          {'id': 2

In [4]:
# We will use Villa Ada sensors
sensor_ids = ["21950", "21870", "21955"]
pollutants = ["no2", "o3", "co"]

# Quering the API for each sensor
for id, pollutant in zip(sensor_ids, pollutants):
    aq_url = f"https://api.openaq.org/v3/sensors/{id}/measurements/hourly"
    aq_params={
        "datetime_from": "2025-07-08",
        "datetime_to": "2026-01-08",
        "limit": 1000
    }
    aq_response = requests.get(
        aq_url,
        params=aq_params,
        headers=headers
    )

    # Checking the response status
    print(aq_response.status_code)

    # Saving the data to a file
    aq_data = aq_response.json()
    with open(f'data/{pollutant}_data.json', 'w') as f:
        json.dump(aq_data, f, indent=4)

200
200
200


# Task 2: Data Cleaning and Integration
Task 2 focused on the cleaning and integration (wrangling) of the raw JSON datasets. The primary challenge involved aligning data from different sources with conflicting temporal structures. While the meteorological data utilized a 'naive' ISO 8601 format, the OpenAQ data included a UTC offset (+02:00), which prevented direct synchronization. To enable a coherent join, I developed a custom Pandas processing function that standardizes timestamps by removing timezone localization and transforms the data from a 'long' to a 'wide' format through pivoting operations. The resulting unified DataFrame provides a synchronized hourly view of the environment, integrating weather variables (temperature, humidity, wind) with atmospheric pollutants (CO,NO2​,O3​). This process was essential for identifying systemic gaps (missing values) caused by API query limits, which serve as the starting point for subsequent statistical analysis.

In [5]:
# Loading weather data
with open('data/weather_data.json', 'r') as f:
    w_json = json.load(f)

# Creazione DataFrame con conversione datetime esplicita
df_weather = pd.DataFrame(w_json['hourly'], index=pd.to_datetime(w_json['hourly']['time'])).drop(columns=['time'])
df_weather.index.name = 'datetime'
df_weather.head()

Unnamed: 0_level_0,temperature_2m,wind_speed_10m,precipitation,relative_humidity_2m
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-07-08 00:00:00,24.1,8.5,0.0,74
2025-07-08 01:00:00,24.6,11.5,0.0,65
2025-07-08 02:00:00,24.4,10.9,0.0,66
2025-07-08 03:00:00,24.5,12.0,0.0,65
2025-07-08 04:00:00,24.4,13.1,0.0,68


In [6]:
# Creating a function to wrangle each air quality data
def process_openaq_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Exctracting results
    results = data.get('results', [])
    
    # Exctracting useful data
    rows = []
    for item in results:
        rows.append({
            'datetime': item['period']['datetimeTo']['local'],
            'parameter': item['parameter']['name'],
            'value': item['value']
        })
    
    # Creating data frames
    df = pd.DataFrame(rows)
    
    # Converting datetime object
    df['datetime'] = pd.to_datetime(df['datetime']).dt.tz_localize(None)
    
    # Transforming df
    df = df.pivot(index='datetime', columns='parameter', values='value')
    return df

# Files to wrangle
files = ['data/co_data.json', 'data/no2_data.json', 'data/o3_data.json']

# Applying the function to each file
dfs = [process_openaq_json(f) for f in files]

# Joining all data frames
df_aq = pd.concat(dfs, axis=1).sort_index()
df_aq.head()

parameter,co,no2,o3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-08 01:00:00,100.0,6.0,75.0
2025-07-08 02:00:00,100.0,4.0,73.0
2025-07-08 03:00:00,100.0,4.0,76.0
2025-07-08 04:00:00,200.0,4.0,77.0
2025-07-08 05:00:00,200.0,4.0,77.0


In [7]:
# Joining air quality data and weather ones
df = pd.concat([df_aq, df_weather], axis=1).sort_index()

# Saving the data to a csv file
df.to_csv('data/combined_data.csv')

print(df.head(), df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4440 entries, 2025-07-08 00:00:00 to 2026-01-08 23:00:00
Freq: h
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   co                    1000 non-null   float64
 1   no2                   1000 non-null   float64
 2   o3                    1000 non-null   float64
 3   temperature_2m        4440 non-null   float64
 4   wind_speed_10m        4440 non-null   float64
 5   precipitation         4440 non-null   float64
 6   relative_humidity_2m  4440 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 277.5 KB
                        co  no2    o3  temperature_2m  wind_speed_10m  \
datetime                                                                
2025-07-08 00:00:00    NaN  NaN   NaN            24.1             8.5   
2025-07-08 01:00:00  100.0  6.0  75.0            24.6            11.5   
2025-07-08 02:00:00  100.0  4.0  73.0          

# Task 3: Exploratory Data Analysis

# Task 4: Machine Learning

# Task 5: Evaluation and Interpretation