This script scrapes the air pollutiondata corresponsind to fixed sensors in Dublin.
- To get sensor data from https://data.smartdublin.ie/sonitus-api
- Map that displays the data : https://dublincityairandnoise.ie/
- Page : https://data.gov.ie/dataset/sonitus/resource/38a117c9-79b5-4e1c-9080-ed862bbe689d


In [2]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import os

Now, let's fetch the data from all monitors.

In [3]:
# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/monitors'

# Set the parameters for the POST request
params = {
    'username': 'dublincityapi',
    'password': 'Xpa5vAQ9ki'
}

# Set headers
headers = {
    'accept': '*/*'
}

# Send the POST request
response = requests.post(url, headers=headers, params=params)

# Check if the request was successful
if response.status_code == 200:
    print("Data fetched successfully!")
    # Get JSON response data
    monitors_data = response.json()
    
    # Creating a list to store each monitor's data
    monitors_list = []
    
    # Assume each entry in the JSON data is a monitor
    for monitor in monitors_data:
        # Parse each monitor's data into a structured dictionary
        monitor_data = {
            'serial_number': monitor['serial_number'],  # Assuming 'id' is part of the monitor data
            'label': monitor['label'],  # Assuming 'location' details
            'location': monitor['location'],  # Assuming 'status' indicates if the monitor is active
            'latitude': monitor['latitude'],  # Assuming latitude info
            'longitude': monitor['longitude'],  # Assuming longitude info
            'last_calibrated': monitor['last_calibrated'],
            # Add additional fields as per your API response structure
            #'Current Rating' : monitor['current_rating']
        }
        monitors_list.append(monitor_data)
    
    # Convert the list of dictionaries to a pandas DataFrame
    monitors_df = pd.DataFrame(monitors_list)
    
    # Print the DataFrame
    print(monitors_df)
else:
    print("Failed to fetch data:")
    print("Status Code:", response.status_code)
    print("Response Body:", response.text)


Data fetched successfully!
         serial_number               label  \
0             10.1.1.1             Noise 1   
1                01749             Noise 2   
2                01508             Noise 3   
3                10118             Noise 4   
4                01548             Noise 5   
5                10115             Noise 6   
6             10.1.1.7             Noise 7   
7                01870             Noise 8   
8                01575             Noise 9   
9                01737            Noise 10   
10           10.1.1.11            Noise 11   
11           10.1.1.12            Noise 12   
12               01550            Noise 13   
13               01534            Noise 14   
14               01535            Noise 16   
15               01509            Noise 17   
16               01529            Noise 18   
17               01530      Noise 19 Spare   
18               01528            Noise 20   
19             DCC-AQ1      National Air 1   
20     

The dataset both contains monitors that collect noise pollution and ones that collect air pollution. The next code only selects air pollution ones. 

In [4]:
# Filtering to create a new DataFrame where 'description' contains "Air"
monitors_air = monitors_df[monitors_df['label'].str.contains("Air")]

selected_columns = ['serial_number', 'latitude', 'longitude']
air_monitor_df = monitors_air[selected_columns]

# then we loop over the serial number to get the hourly averages. 
serial_numbers = monitors_air['serial_number'].tolist() # get a list of serial numbers
serial_numbers



['DCC-AQ1',
 'DCC-AQ2',
 'DCC-AQ3',
 'DCC-AQ4',
 'DCC-AQ5',
 'DCC-AQ6',
 'DCC-AQ7',
 'DCC-AQ8',
 'DCC-AQ9',
 'DCC-AQ10',
 'TNT1088',
 'TNT1138',
 'TNT1296',
 'TNO2161',
 'TNO2162',
 'DCC-AQ13',
 'DCC-AQ17',
 'DCC-AQ22',
 'DCC-AQ52',
 'DCC-AQ69',
 'TNO4435',
 'TNO4438',
 'TNO4488',
 'TNO4390',
 'TNO4324',
 'TNO4323',
 'TNO4325',
 'TNO4437',
 '0110-000157-000000',
 '0110-000180-000000',
 '0110-000141-000000',
 'DM30-00530',
 'DM30-00531',
 'DCC-AQ91']

In [4]:
# running time 12 minutes 

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/hourly-averages'

# Set headers
headers = {
    'accept': 'application/json'
}

# Define your credentials
username = 'dublincityapi'
password = 'Xpa5vAQ9ki'

# Convert date string to unix timestamp
def date_to_unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())

# Generate date ranges for the period, 29 days at a time
def generate_date_ranges(start_date, end_date):
    current_date = start_date
    while current_date < end_date:
        yield current_date, min(end_date, current_date + timedelta(days=15))
        current_date += timedelta(days=30)

# Monitor serial numbers and date range
serial_numbers = serial_numbers 
start_date = datetime(2021, 5, 1)
end_date = datetime(2022, 8, 31)

# Directory for all CSVs
base_directory = "Air_Pollution_Data"
os.makedirs(base_directory, exist_ok=True)

# Data container for all monitors
all_data = []

# Loop through each serial number and fetch data
for serial in serial_numbers:
    # Create directory for each monitor
    monitor_directory = os.path.join(base_directory, serial)
    os.makedirs(monitor_directory, exist_ok=True)
    
    monitor_data_list = []
    
    # Generate data for each time period
    for start, end in generate_date_ranges(start_date, end_date):
        # Set parameters for POST request
        params = {
            'username': username,
            'password': password,
            'monitor': serial,
            'start': date_to_unix(start.strftime('%Y-%m-%d')),
            'end': date_to_unix(end.strftime('%Y-%m-%d'))
        }
        
        # Send the POST request
        response = requests.post(url, headers=headers, params=params)
        
        # Check if the request was successful
        if response.status_code == 200:
            if response.text:  # Check if there is any text in the response
                data = response.json()
                
                # Add serial number to each record
                for record in data:
                    record['serial_number'] = serial
                    monitor_data_list.append(record)
            else:
                print(f"No data returned for {serial} from {start} to {end}")
        else:
            print(f"Failed to fetch data for {serial} from {start} to {end}")
            print("Status Code:", response.status_code)
            print("Response Body:", response.text)
        
    # Convert the list of dictionaries to a pandas DataFrame
    monitor_df = pd.DataFrame(monitor_data_list)
    
    # Save to CSV in its respective directory
    csv_path = os.path.join(monitor_directory, f"{serial}.csv")
    monitor_df.to_csv(csv_path, index=False)
    
    # Collect data for final aggregation
    all_data.append(monitor_df)

# Combine all data into one DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Save the combined data to a CSV
combined_csv_path = os.path.join(base_directory, "Air_pollution_Sonitus.csv")
combined_df.to_csv(combined_csv_path, index=False)

print("All data has been fetched and saved successfully.")




No data returned for DCC-AQ7 from 2021-05-01 00:00:00 to 2021-05-16 00:00:00
No data returned for DCC-AQ7 from 2021-05-31 00:00:00 to 2021-06-15 00:00:00
No data returned for DCC-AQ7 from 2021-06-30 00:00:00 to 2021-07-15 00:00:00
No data returned for DCC-AQ7 from 2021-07-30 00:00:00 to 2021-08-14 00:00:00
No data returned for DCC-AQ7 from 2021-08-29 00:00:00 to 2021-09-13 00:00:00
No data returned for DCC-AQ7 from 2021-09-28 00:00:00 to 2021-10-13 00:00:00
No data returned for DCC-AQ7 from 2021-10-28 00:00:00 to 2021-11-12 00:00:00
No data returned for DCC-AQ7 from 2021-11-27 00:00:00 to 2021-12-12 00:00:00
No data returned for DCC-AQ7 from 2021-12-27 00:00:00 to 2022-01-11 00:00:00
No data returned for DCC-AQ7 from 2022-01-26 00:00:00 to 2022-02-10 00:00:00
No data returned for DCC-AQ7 from 2022-02-25 00:00:00 to 2022-03-12 00:00:00
No data returned for DCC-AQ7 from 2022-03-27 00:00:00 to 2022-04-11 00:00:00
No data returned for DCC-AQ7 from 2022-04-26 00:00:00 to 2022-05-11 00:00:00

the next code adds latitude and longitude to the Air_pollution_Sonitus csv file. In an improved version of this script, lat an long are added in the loop above already

In [13]:

import pandas as pd


sonitus_df = pd.read_csv("C:/Users/Giulia Maria/Documents/GitHub/ML Proj Dublin/Hyperlocal-Air-Quality-Prediction-in-Dublin\Weather-Air_pollution/Air_pollution_Sonitus.csv")
# Merge the DataFrames based on the 'serial_numbers' column
Air_Pollution_Sensors = pd.merge(sonitus_df, air_monitor_df, on='serial_number', how='left')

# Display the merged DataFrame
#print(Air_Pollution_Sensors)
Air_Pollution_Sensors.head(5)

# save as csv file
#Air_Pollution_Sensors.to_csv('Air_Pollution_Sensors.csv', index=False)




Unnamed: 0,datetime,co,no,no2,so2,serial_number,pm1,pm10,pm2_5,pm4,tsp,o3,latitude_x,longitude_x,latitude_y,longitude_y
0,2021-04-30 22:00:00,0.47,6.54,68.0,0.73,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
1,2021-04-30 23:00:00,0.55,8.21,60.76,1.05,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
2,2021-05-01 00:00:00,0.49,2.19,52.77,0.99,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
3,2021-05-01 01:00:00,0.47,5.92,51.68,0.95,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
4,2021-05-01 02:00:00,0.39,2.32,42.85,0.69,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525


Missing information

In [10]:
import pandas as pd

# Assuming Air_Pollution_Sensors is your DataFrame

# Check for missing values
missing_info = Air_Pollution_Sensors.isnull().sum()

# Displaying missing information
print("Missing Information:")
print(missing_info)

Air_Pollution_Sensors.head(5)

Missing Information:
datetime              0
co               116641
no                92731
no2               76796
so2               99841
serial_number         0
pm1               18314
pm10              11822
pm2_5             10212
pm4               68295
tsp               73017
o3               111101
latitude_x            0
longitude_x           0
latitude_y            0
longitude_y           0
dtype: int64


Unnamed: 0,datetime,co,no,no2,so2,serial_number,pm1,pm10,pm2_5,pm4,tsp,o3,latitude_x,longitude_x,latitude_y,longitude_y
0,2021-04-30 22:00:00,0.47,6.54,68.0,0.73,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
1,2021-04-30 23:00:00,0.55,8.21,60.76,1.05,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
2,2021-05-01 00:00:00,0.49,2.19,52.77,0.99,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
3,2021-05-01 01:00:00,0.47,5.92,51.68,0.95,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525
4,2021-05-01 02:00:00,0.39,2.32,42.85,0.69,DCC-AQ1,,,,,,,53.344239,-6.271525,53.3442389,-6.271525


In [53]:
# number of unique observation in datetime column
print(Air_Pollution_Sensors['datetime'].nunique())


5943


In [None]:

# check if there is missing information for days or hours, besides the one clearly labeled as no data returned. 
        # basically, check if the data period in air pollution sonitus and the lacking data add up 

# put code stuff in folders so the gh is better 
# make your code prettier in both cases, hourly weather data and sonitus.api 


# done 
# insert every sensor in the serial number for loop thing

# add lat and long to each sensor
       # instead of a list of serial numbers, need to get a dataframe with serial number, latitude and longitude. 
        # then, adapt the code next so that it can only loop over the serial umbers while appending lat and lon to every row  


The next code only scrapes the deets for monitor DCC-AQ1 and produces the csv file Air_Pollution_data_DCC-AQ1

In [5]:

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/hourly-averages'

# Set headers
headers = {
    'accept': 'application/json'
}

# Define your credentials
username = 'dublincityapi'
password = 'Xpa5vAQ9ki'

# Convert date string to unix timestamp
def date_to_unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())

# Generate date ranges for the period, 29 days at a time
def generate_date_ranges(start_date, end_date):
    current_date = start_date
    while current_date < end_date:
        yield current_date, min(end_date, current_date + timedelta(days=15))
        current_date += timedelta(days=30)

# Monitor serial numbers and date range
serial_numbers = ["DCC-AQ1"]
start_date = datetime(2021, 5, 1)
end_date = datetime(2022, 8, 31)

# Directory for all CSVs
base_directory = "Air_Pollution_Data"
os.makedirs(base_directory, exist_ok=True)

# Data container for all monitors
all_data = []

# Loop through each serial number and fetch data
for serial in serial_numbers:
    # Create directory for each monitor
    monitor_directory = os.path.join(base_directory, serial)
    os.makedirs(monitor_directory, exist_ok=True)
    
    monitor_data_list = []
    
    # Generate data for each time period
    for start, end in generate_date_ranges(start_date, end_date):
        # Set parameters for POST request
        params = {
            'username': username,
            'password': password,
            'monitor': serial,
            'start': date_to_unix(start.strftime('%Y-%m-%d')),
            'end': date_to_unix(end.strftime('%Y-%m-%d'))
        }
        
        # Send the POST request
        response = requests.post(url, headers=headers, params=params)
        
        # Check if the request was successful
        if response.status_code == 200:
            if response.text:  # Check if there is any text in the response
                data = response.json()
                
                # Add serial number to each record
                for record in data:
                    record['serial_number'] = serial
                    monitor_data_list.append(record)
            else:
                print(f"No data returned for {serial} from {start} to {end}")
        else:
            print(f"Failed to fetch data for {serial} from {start} to {end}")
            print("Status Code:", response.status_code)
            print("Response Body:", response.text)
        
    # Convert the list of dictionaries to a pandas DataFrame
    monitor_df = pd.DataFrame(monitor_data_list)
    
    # Save to CSV in its respective directory
    csv_path = os.path.join(monitor_directory, f"{serial}.csv")
    monitor_df.to_csv(csv_path, index=False)
    
    # Collect data for final aggregation
    all_data.append(monitor_df)

# Combine all data into one DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Save the combined data to a CSV
combined_csv_path = os.path.join(base_directory, "Air_Pollution_data_DCC-AQ1.csv")
combined_df.to_csv(combined_csv_path, index=False)

print("All data has been fetched and saved successfully.")




All data has been fetched and saved successfully.
