In [1]:
# get sensor data from https://data.smartdublin.ie/sonitus-api
# map that displays the data : https://dublincityairandnoise.ie/
# page : https://data.gov.ie/dataset/sonitus/resource/38a117c9-79b5-4e1c-9080-ed862bbe689d

import requests
import pandas as pd

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/monitors'

# Set the parameters for the POST request
params = {
    'username': 'dublincityapi',
    'password': 'Xpa5vAQ9ki'
}

# Set headers
headers = {
    'accept': '*/*'
}

# Send the POST request
response = requests.post(url, headers=headers, params=params)

# Check if the request was successful
if response.status_code == 200:
    print("Data fetched successfully!")
    # Get JSON response data
    monitors_data = response.json()
    
    # Creating a list to store each monitor's data
    monitors_list = []
    
    # Assume each entry in the JSON data is a monitor
    for monitor in monitors_data:
        # Parse each monitor's data into a structured dictionary
        monitor_data = {
            'serial_number': monitor['serial_number'],  # Assuming 'id' is part of the monitor data
            'label': monitor['label'],  # Assuming 'location' details
            'location': monitor['location'],  # Assuming 'status' indicates if the monitor is active
            'latitude': monitor['latitude'],  # Assuming latitude info
            'longitude': monitor['longitude'],  # Assuming longitude info
            'last_calibrated': monitor['last_calibrated'],
            # Add additional fields as per your API response structure
            #'Current Rating' : monitor['current_rating']
        }
        monitors_list.append(monitor_data)
    
    # Convert the list of dictionaries to a pandas DataFrame
    monitors_df = pd.DataFrame(monitors_list)
    
    # Print the DataFrame
    print(monitors_df)
else:
    print("Failed to fetch data:")
    print("Status Code:", response.status_code)
    print("Response Body:", response.text)


Data fetched successfully!
         serial_number               label  \
0             10.1.1.1             Noise 1   
1                01749             Noise 2   
2                01508             Noise 3   
3                10118             Noise 4   
4                01548             Noise 5   
5                10115             Noise 6   
6             10.1.1.7             Noise 7   
7                01870             Noise 8   
8                01575             Noise 9   
9                01737            Noise 10   
10           10.1.1.11            Noise 11   
11           10.1.1.12            Noise 12   
12               01550            Noise 13   
13               01534            Noise 14   
14               01535            Noise 16   
15               01509            Noise 17   
16               01529            Noise 18   
17               01530      Noise 19 Spare   
18               01528            Noise 20   
19             DCC-AQ1      National Air 1   
20     

#### this code selects only the air data monitors and fetches their serial numbers

In [19]:
# Take the df created above and only save the rows where air appears. 
# This is because we are only interested in the air quality sensors.

# Filtering to create a new DataFrame where 'description' contains "Air"
monitors_air = monitors_df[monitors_df['label'].str.contains("Air")]

# Print the new DataFrame
#print(monitors_air)
# These are the sensors we care about

# then we loop over the serial number to get the hourly averages. 
serial_numbers = monitors_air['serial_number'].tolist() # get a list of serial numbers
serial_numbers
#len(serial_numbers) # 34 sensors for air quality
# use this list for the next code 


# limitation, no last calibration data for the pollution data so we 
# have to acknowledge that the data might have been calibrated last before our period of analysis 

['DCC-AQ1',
 'DCC-AQ2',
 'DCC-AQ3',
 'DCC-AQ4',
 'DCC-AQ5',
 'DCC-AQ6',
 'DCC-AQ7',
 'DCC-AQ8',
 'DCC-AQ9',
 'DCC-AQ10',
 'TNT1088',
 'TNT1138',
 'TNT1296',
 'TNO2161',
 'TNO2162',
 'DCC-AQ13',
 'DCC-AQ17',
 'DCC-AQ22',
 'DCC-AQ52',
 'DCC-AQ69',
 'TNO4435',
 'TNO4438',
 'TNO4488',
 'TNO4390',
 'TNO4324',
 'TNO4323',
 'TNO4325',
 'TNO4437',
 '0110-000157-000000',
 '0110-000180-000000',
 '0110-000141-000000',
 'DM30-00530',
 'DM30-00531',
 'DCC-AQ91']

#### This code generates a list containing Unix timestamps for each day from May 1, 2021, to August 31, 2022.

In [14]:

from datetime import datetime, timedelta

# Function to convert date to Unix timestamp
def unix_timestamp(date):
    return int(date.timestamp())

# Generate dates from May 1, 2021, to August 31, 2022
start_date = datetime(2021, 5, 1)
end_date = datetime(2022, 8, 31)

dates = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

# Convert dates to Unix timestamps
unix_timestamps = [unix_timestamp(date) for date in dates]

# Store Unix timestamps in a list
timestamp_list = unix_timestamps

# Print the list
print(timestamp_list)

len(timestamp_list) # 488 


[1619820000, 1619906400, 1619992800, 1620079200, 1620165600, 1620252000, 1620338400, 1620424800, 1620511200, 1620597600, 1620684000, 1620770400, 1620856800, 1620943200, 1621029600, 1621116000, 1621202400, 1621288800, 1621375200, 1621461600, 1621548000, 1621634400, 1621720800, 1621807200, 1621893600, 1621980000, 1622066400, 1622152800, 1622239200, 1622325600, 1622412000, 1622498400, 1622584800, 1622671200, 1622757600, 1622844000, 1622930400, 1623016800, 1623103200, 1623189600, 1623276000, 1623362400, 1623448800, 1623535200, 1623621600, 1623708000, 1623794400, 1623880800, 1623967200, 1624053600, 1624140000, 1624226400, 1624312800, 1624399200, 1624485600, 1624572000, 1624658400, 1624744800, 1624831200, 1624917600, 1625004000, 1625090400, 1625176800, 1625263200, 1625349600, 1625436000, 1625522400, 1625608800, 1625695200, 1625781600, 1625868000, 1625954400, 1626040800, 1626127200, 1626213600, 1626300000, 1626386400, 1626472800, 1626559200, 1626645600, 1626732000, 1626818400, 1626904800, 162

1622239200

In [30]:
# here you get timestampt for singular days 
from datetime import datetime

# Create a datetime object for May 30, 2021
date = datetime(2021, 5, 29)

# Convert the datetime object to a Unix timestamp
unix_timestamp = int(date.timestamp())

# Print the Unix timestamp
print(unix_timestamp)


1622239200


#### My chatgpt prompt

Ok, we have a list of serial numbers and we have a list of unix_timestamps for every day from 2021-05-01 to 2022-08-31. The api allows me to fetch 29 days at the time. Need a code that updates the params so that for every monitor in the list, the data for the period from 2021-05-01 to 2022-08-31 is scraped 29 days at the time. the last scraping batch does not have to be 29 days and can be the remaining days to be scraped.  there must be a column with the serial_number corresponding to the row scraped. 

I would like all the air pollution data for every monitor to be saved in different csvs in different folders. In the end, I want to have as many folders as the number of air monitors (34 i believe). Then, would like all these monitor data to be merged in a big dataset called "Air_pollution_Sonitus". This is the code to start from: 

In [39]:
import requests
import pandas as pd

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/hourly-averages'

# Set the parameters for the POST request
params = {
    'username': 'dublincityapi',
    'password': 'Xpa5vAQ9ki',
    'monitor': 'TNO4435',  # air monitor serial numbers 
    'start': '1619827200',  # 2021-05-01 00:00:00
    'end': '1622239200'     # 2021-05-29 00:00:00
}

# Set headers
headers = {
    'accept': 'application/json'
}

# Send the POST request
response = requests.post(url, headers=headers, params=params)

# Check if the request was successful
if response.status_code == 200:
    print("Data fetched successfully!")
    # Get JSON response data
    data = response.json()
    
    # Creating a list to store each record's data
    records_list = []
    for record in data:
        # Extracting details for each record
        record_data = {
            'datetime': record['datetime'],
            'pm1': record.get('pm1', 0),        # Defaulting to 0 if any key is missing
            'pm10': record.get('pm10', 0),
            'pm2_5': record.get('pm2_5', 0),
            'tsp': record.get('tsp', 0)
        }
        records_list.append(record_data)
    
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(records_list)
    
    # Print the DataFrame
    print(df)
else:
    print("Failed to fetch data:")
    print("Status Code:", response.status_code)
    print("Response Body:", response.text)



    # things missing in this code: 
    # must add the monitor id 
    # then put the longitude and latitude 
    # must figure out a way that has the code scrap 29 days at the time. 
           # figure out if date format can be produced through code otherwise ask chat to provide a list for the time span 


# question,
# relevant even if the longitudes and latitudes are not the same with the intersection ones? 

Data fetched successfully!
                datetime   pm1   pm10  pm2_5    tsp
0    2021-05-01 00:00:00  2.88  10.87   5.62  17.40
1    2021-05-01 01:00:00  2.27   7.55   4.25  10.92
2    2021-05-01 02:00:00  2.44   8.32   4.55  12.33
3    2021-05-01 03:00:00  2.73   8.85   4.99  12.67
4    2021-05-01 04:00:00  2.44   7.83   4.43  10.82
..                   ...   ...    ...    ...    ...
666  2021-05-28 18:00:00  3.63  11.47   7.34  14.95
667  2021-05-28 19:00:00  3.57  10.90   7.09  14.02
668  2021-05-28 20:00:00  2.87   8.90   5.76  11.60
669  2021-05-28 21:00:00  2.52   7.73   4.91  10.15
670  2021-05-28 22:00:00  2.53   7.77   4.79  10.32

[671 rows x 5 columns]


In [37]:
# test 
import requests
import pandas as pd
import os
from datetime import datetime, timedelta

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/hourly-averages'

# Set headers
headers = {
    'accept': 'application/json'
}

# Define your credentials
username = 'dublincityapi'
password = 'Xpa5vAQ9ki'

# Convert date string to unix timestamp
def date_to_unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())

# Generate date ranges for the period, 29 days at a time
def generate_date_ranges(start_date, end_date):
    current_date = start_date
    while current_date < end_date:
        yield current_date, min(end_date, current_date + timedelta(days=15))
        current_date += timedelta(days=30)

# Monitor serial numbers and date range
serial_numbers = serial_numbers  
start_date = datetime(2021, 5, 1)
end_date = datetime(2022, 8, 31)

# Directory for all CSVs
base_directory = "Air_Pollution_Data"
os.makedirs(base_directory, exist_ok=True)

# Data container for all monitors
all_data = []

# Loop through each serial number and fetch data
for serial in serial_numbers:
    # Create directory for each monitor
    monitor_directory = os.path.join(base_directory, serial)
    os.makedirs(monitor_directory, exist_ok=True)
    
    monitor_data_list = []
    
    # Generate data for each time period
    for start, end in generate_date_ranges(start_date, end_date):
        # Set parameters for POST request
        params = {
            'username': username,
            'password': password,
            'monitor': serial,
            'start': date_to_unix(start.strftime('%Y-%m-%d')),
            'end': date_to_unix(end.strftime('%Y-%m-%d'))
        }
        
        # Send the POST request
        response = requests.post(url, headers=headers, params=params)
        
        # Check if the request was successful
        if response.status_code == 200:
            data = response.json()
            
            # Add serial number to each record
            for record in data:
                record['serial_number'] = serial
                monitor_data_list.append(record)
                
        else:
            print(f"Failed to fetch data for {serial} from {start} to {end}")
            print("Status Code:", response.status_code)
            print("Response Body:", response.text)
        
    # Convert the list of dictionaries to a pandas DataFrame
    monitor_df = pd.DataFrame(monitor_data_list)
    
    # Save to CSV in its respective directory
    csv_path = os.path.join(monitor_directory, f"{serial}.csv")
    monitor_df.to_csv(csv_path, index=False)
    
    # Collect data for final aggregation
    all_data.append(monitor_df)

# Combine all data into one DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Save the combined data to a CSV
combined_csv_path = os.path.join(base_directory, "Air_pollution_Sonitus.csv")
combined_df.to_csv(combined_csv_path, index=False)

print("All data has been fetched and saved successfully.")


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# test 
import requests
import pandas as pd
import os
from datetime import datetime, timedelta

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/hourly-averages'

# Set headers
headers = {
    'accept': 'application/json'
}

# Define your credentials
username = 'dublincityapi'
password = 'Xpa5vAQ9ki'

# Convert date string to unix timestamp
def date_to_unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())

# Generate date ranges for the period, 29 days at a time
def generate_date_ranges(start_date, end_date):
    current_date = start_date
    while current_date < end_date:
        yield current_date, min(end_date, current_date + timedelta(days=15))
        current_date += timedelta(days=30)

# Monitor serial numbers and date range
serial_numbers = serial_numbers  
start_date = datetime(2021, 5, 1)
end_date = datetime(2022, 8, 31)

# Directory for all CSVs
base_directory = "Air_Pollution_Data"
os.makedirs(base_directory, exist_ok=True)

# Data container for all monitors
all_data = []

# Loop through each serial number and fetch data
for serial in serial_numbers:
    # Create directory for each monitor
    monitor_directory = os.path.join(base_directory, serial)
    os.makedirs(monitor_directory, exist_ok=True)
    
    monitor_data_list = []
    
    # Generate data for each time period
    for start, end in generate_date_ranges(start_date, end_date):
        # Set parameters for POST request
        params = {
            'username': username,
            'password': password,
            'monitor': serial,
            'start': date_to_unix(start.strftime('%Y-%m-%d')),
            'end': date_to_unix(end.strftime('%Y-%m-%d'))
        }
        
        # Send the POST request
        response = requests.post(url, headers=headers, params=params)
        
        # Check if the request was successful
        if response.status_code == 200:
            data = response.json()
            
            # Add serial number to each record
            for record in data:
                record['serial_number'] = serial
                monitor_data_list.append(record)
                
        else:
            print(f"Failed to fetch data for {serial} from {start} to {end}")
            print("Status Code:", response.status_code)
            print("Response Body:", response.text)
        
    # Convert the list of dictionaries to a pandas DataFrame
    monitor_df = pd.DataFrame(monitor_data_list)
    
    # Save to CSV in its respective directory
    csv_path = os.path.join(monitor_directory, f"{serial}.csv")
    monitor_df.to_csv(csv_path, index=False)
    
    # Collect data for final aggregation
    all_data.append(monitor_df)

# Combine all data into one DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Save the combined data to a CSV
combined_csv_path = os.path.join(base_directory, "Air_pollution_Sonitus.csv")
combined_df.to_csv(combined_csv_path, index=False)

print("All data has been fetched and saved successfully.")


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [18]:
serial_numbers[:]

['TNO4435', 'TNO4438', 'TNO4488']

seeing what the response looks like 

In [38]:
data = response.json()
print(data)  # See what the data actually looks like
type(data)  # Check the data type

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

troubleshooting 

looks like it works, now i modify serial number list and check that it all makes sense. 
check that it does not create a df every time i run this code but that it only updates the one i have 

In [7]:
# this works 

import requests
import pandas as pd
import os
from datetime import datetime, timedelta

# Define the API endpoint
url = 'https://data.smartdublin.ie/sonitus-api/api/hourly-averages'

# Set headers
headers = {
    'accept': 'application/json'
}

# Define your credentials
username = 'dublincityapi'
password = 'Xpa5vAQ9ki'

# Convert date string to unix timestamp
def date_to_unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())

# Generate date ranges for the period, 29 days at a time
def generate_date_ranges(start_date, end_date):
    current_date = start_date
    while current_date < end_date:
        yield current_date, min(end_date, current_date + timedelta(days=15))
        current_date += timedelta(days=30)

# Monitor serial numbers and date range
serial_numbers = serial_numbers 
start_date = datetime(2021, 5, 1)
end_date = datetime(2022, 8, 31)

# Directory for all CSVs
base_directory = "Air_Pollution_Data"
os.makedirs(base_directory, exist_ok=True)

# Data container for all monitors
all_data = []

# Loop through each serial number and fetch data
for serial in serial_numbers:
    # Create directory for each monitor
    monitor_directory = os.path.join(base_directory, serial)
    os.makedirs(monitor_directory, exist_ok=True)
    
    monitor_data_list = []
    
    # Generate data for each time period
    for start, end in generate_date_ranges(start_date, end_date):
        # Set parameters for POST request
        params = {
            'username': username,
            'password': password,
            'monitor': serial,
            'start': date_to_unix(start.strftime('%Y-%m-%d')),
            'end': date_to_unix(end.strftime('%Y-%m-%d'))
        }
        
        # Send the POST request
        response = requests.post(url, headers=headers, params=params)
        
        # Check if the request was successful
        if response.status_code == 200:
            if response.text:  # Check if there is any text in the response
                data = response.json()
                
                # Add serial number to each record
                for record in data:
                    record['serial_number'] = serial
                    monitor_data_list.append(record)
            else:
                print(f"No data returned for {serial} from {start} to {end}")
        else:
            print(f"Failed to fetch data for {serial} from {start} to {end}")
            print("Status Code:", response.status_code)
            print("Response Body:", response.text)
        
    # Convert the list of dictionaries to a pandas DataFrame
    monitor_df = pd.DataFrame(monitor_data_list)
    
    # Save to CSV in its respective directory
    csv_path = os.path.join(monitor_directory, f"{serial}.csv")
    monitor_df.to_csv(csv_path, index=False)
    
    # Collect data for final aggregation
    all_data.append(monitor_df)

# Combine all data into one DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Save the combined data to a CSV
combined_csv_path = os.path.join(base_directory, "Air_pollution_Sonitus.csv")
combined_df.to_csv(combined_csv_path, index=False)

print("All data has been fetched and saved successfully.")


# add lat and long to each sensor 
# insert every sensor in the serial number for loop thing
# check if there is missing information for days or hours 
# put code stuff in folders so the gh is better 
# make your code prettier in both cases, hourly weather data and sonitus.api 


All data has been fetched and saved successfully.


In [20]:
import pandas as pd

sonitus_data = pd.read_csv("C:/Users/Giulia Maria/Documents/GitHub/ML Proj Dublin/Hyperlocal-Air-Quality-Prediction-in-Dublin/Air_Pollution_Data/Air_pollution_Sonitus.csv")
sonitus_data.head()

# # Check the unique entries in the 'serial_number' column
unique_serial_numbers = sonitus_data['serial_number'].unique()

# # Print the unique serial numbers
print("Unique Serial Numbers in the Dataset:")
print(unique_serial_numbers)

Unique Serial Numbers in the Dataset:
['TNO4435' 'TNO4438' 'TNO4488']
