In [1]:
✅ Final Suggested Schema for Your Dataset


Column	Type	Required	Purpose
latitude	float	✅ Yes	Needed for spatial interpolation
longitude	float	✅ Yes	Needed for spatial interpolation
timestamp	datetime	✅ Yes	For extracting temporal patterns
co2	float	✅ Yes	Pollution indicator (can be your main target)
pm2_5	float	✅ Yes	Fine particles, another pollution target
pm10	float	✅ Yes	Coarser particles, also important
humidity	float	Optional	Affects particle suspension and dispersion
temperature	float	Optional	Influences chemical reactions and pollutant behavior
wind_speed	float	Optional	Strong wind → disperses pollutants
pressure	float	Optional	Can impact air quality readings
traffic_index	float/int	Optional	Strong signal for urban pollution levels

SyntaxError: invalid character '✅' (U+2705) (552190771.py, line 1)

In [2]:
latitude | longitude | timestamp | co2 | pm2_5 | pm10 | humidity | temperature | wind_speed | pressure | traffic_index
44.4328 | 26.1043 | 2023-04-12 09:15:00 | 410 | 12.3 | 25.6 | 55 | 21.0°C | 2.4 | 1012 | 73

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (3733562308.py, line 2)

In [24]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import os

API_KEY = "41a32c6d87e64eb16c24f687a699b887a98aad1786c3f6f0376aad1da0fdca86"  # 🔐 Paste your OpenAQ API key here

HEADERS = {
    "X-API-Key": API_KEY
}

def get_all_locations(limit=500):
    url = "https://api.openaq.org/v3/locations"
    all_locations = []
    page = 1

    while True:
        print(f"🌍 Fetching page {page}...")
        response = requests.get(url, headers=HEADERS, params={
            "limit": limit,
            "page": page,
            "sort": "desc"
        })

        results = response.json().get("results", [])
        if not results:
            break

        all_locations.extend(results)
        page += 1

        if page > 4:  # Avoid downloading thousands of entries
            break

    print(f"✅ Retrieved {len(all_locations)} total locations.")
    return all_locations

def filter_bucharest_stations(locations):
    return [
        loc for loc in locations
        if (
            loc.get("coordinates") and
            abs(loc["coordinates"]["latitude"] - 44.43) < 0.3 and
            abs(loc["coordinates"]["longitude"] - 26.10) < 0.3
        )
    ]

def fetch_measurements(location_name, parameters=["pm25", "pm10", "co"], days_back=7):
    measurements = []
    base_url = "https://api.openaq.org/v3/measurements"
    date_to = datetime.utcnow()
    date_from = date_to - timedelta(days=days_back)

    for param in parameters:
        page = 1
        while True:
            response = requests.get(base_url, headers=HEADERS, params={
                "location": location_name,
                "parameter": param,
                "date_from": date_from.isoformat() + "Z",
                "date_to": date_to.isoformat() + "Z",
                "limit": 100,
                "page": page,
                "sort": "desc"
            })

            data = response.json().get("results", [])
            if not data:
                break

            for entry in data:
                measurements.append({
                    "timestamp": entry["date"]["utc"],
                    "location": entry["location"],
                    "parameter": entry["parameter"],
                    "value": entry["value"],
                    "unit": entry["unit"],
                    "latitude": entry["coordinates"]["latitude"],
                    "longitude": entry["coordinates"]["longitude"]
                })

            page += 1

    return measurements

# 🚀 Run the flow
print("🔍 Fetching global locations...")
locations = get_all_locations()

print("📍 Filtering by Bucharest coordinates...")
bucharest_stations = filter_bucharest_stations(locations)

if not bucharest_stations:
    print("❌ No stations found near Bucharest.")
else:
    print(f"✅ Found {len(bucharest_stations)} Bucharest station(s):")
    for s in bucharest_stations:
        print(" -", s["name"])

    all_data = []
    for station in bucharest_stations:
        print(f"📡 Downloading from {station['name']}...")
        data = fetch_measurements(station["name"])
        all_data.extend(data)

    if all_data:
        df = pd.DataFrame(all_data)
        os.makedirs("data", exist_ok=True)
        df.to_csv("data/bucharest_openaq_data.csv", index=False)
        print("✅ Saved to data/bucharest_openaq_data.csv")
        display(df.head())
    else:
        print("⚠️ No measurement data found.")


🔍 Fetching global locations...
🌍 Fetching page 1...
🌍 Fetching page 2...
🌍 Fetching page 3...
🌍 Fetching page 4...
✅ Retrieved 2000 total locations.
📍 Filtering by Bucharest coordinates...
❌ No stations found near Bucharest.


In [None]:
### OpenAQ does not have a valid station for Bucharest 

In [None]:
### Creating dataset: https://www.eea.europa.eu/en/analysis/maps-and-charts/index?activeTab=265e2bee-7de3-46e8-b6ee-76005f3f434f - for pm10

### PM10

In [4]:
import pandas as pd
from pathlib import Path
# Folder where your files are
folder = Path("../data/Checking data/pm10_2013-2023")

# Match all files named file1.parquet to file6.parquet
files = sorted(folder.glob("SPO-*.parquet"))

# Read and concatenate them
dfs = [pd.read_parquet(f) for f in files]
df_all = pd.concat(dfs, ignore_index=True)

# Extract station code from Samplingpoint
df_all["station_code"] = df_all["Samplingpoint"].str.extract(r"(RO\d{4}A)")

# Filter only for RO0070A (B-6)
df_b4 = df_all[df_all["station_code"] == "RO0070A"].copy()

print(f"✅ Found {len(df_b4)} rows")
df_b4.head(-4)




✅ Found 8760 rows


Unnamed: 0,Samplingpoint,Pollutant,Start,End,Value,Unit,AggType,Validity,Verification,ResultTime,DataCapture,FkObservationLog,station_code
87600,RO/SPO-RO0070A_00005_101,5,2021-12-31 23:00:00,2022-01-01 00:00:00,58.790000000000000000,ug.m-3,hour,1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
87601,RO/SPO-RO0070A_00005_101,5,2022-01-01 00:00:00,2022-01-01 01:00:00,53.680000000000000000,ug.m-3,hour,1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
87602,RO/SPO-RO0070A_00005_101,5,2022-01-01 01:00:00,2022-01-01 02:00:00,50.130000000000000000,ug.m-3,hour,1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
87603,RO/SPO-RO0070A_00005_101,5,2022-01-01 02:00:00,2022-01-01 03:00:00,38.550000000000000000,ug.m-3,hour,1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
87604,RO/SPO-RO0070A_00005_101,5,2022-01-01 03:00:00,2022-01-01 04:00:00,32.870000000000000000,ug.m-3,hour,1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96351,RO/SPO-RO0070A_00005_101,5,2022-12-31 14:00:00,2022-12-31 15:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
96352,RO/SPO-RO0070A_00005_101,5,2022-12-31 15:00:00,2022-12-31 16:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
96353,RO/SPO-RO0070A_00005_101,5,2022-12-31 16:00:00,2022-12-31 17:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A
96354,RO/SPO-RO0070A_00005_101,5,2022-12-31 17:00:00,2022-12-31 18:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,ab93038e-61fd-4244-a046-d391945af0a4,RO0070A


In [5]:
# 1. Selectăm doar coloanele necesare
df_pm10 = df_b4[["Start", "End", "Value", "Unit"]].copy()

# 2. Renumim coloanele
df_pm10.rename(columns={
    "Start": "start",
    "End": "end",
    "Value": "pm10",
    "Unit": "pm10_unit"
}, inplace=True)

# 3. Filtrăm valorile valide
df_pm10_filtered = df_pm10[df_pm10["pm10"] > 1e-6].copy()
df_pm10_filtered.reset_index(drop=True, inplace=True)

# 4. Adăugăm coordonatele stației
df_pm10_filtered["latitude"] = 44.444925
df_pm10_filtered["longitude"] = 26.127289

# 5. Salvăm dataset-ul
df_pm10_filtered.to_csv("../data/Checking data/pm10_bucharest_clean-ro0070a.csv", index=False)

print(f"✅ Cleaned dataset saved. Remaining rows: {len(df_pm10_filtered)}")
df_pm10_filtered.head(-4)



✅ Cleaned dataset saved. Remaining rows: 8207


Unnamed: 0,start,end,pm10,pm10_unit,latitude,longitude
0,2021-12-31 23:00:00,2022-01-01 00:00:00,58.790000000000000000,ug.m-3,44.444925,26.127289
1,2022-01-01 00:00:00,2022-01-01 01:00:00,53.680000000000000000,ug.m-3,44.444925,26.127289
2,2022-01-01 01:00:00,2022-01-01 02:00:00,50.130000000000000000,ug.m-3,44.444925,26.127289
3,2022-01-01 02:00:00,2022-01-01 03:00:00,38.550000000000000000,ug.m-3,44.444925,26.127289
4,2022-01-01 03:00:00,2022-01-01 04:00:00,32.870000000000000000,ug.m-3,44.444925,26.127289
...,...,...,...,...,...,...
8198,2022-12-22 00:00:00,2022-12-22 01:00:00,46.460000000000000000,ug.m-3,44.444925,26.127289
8199,2022-12-22 01:00:00,2022-12-22 02:00:00,46.460000000000000000,ug.m-3,44.444925,26.127289
8200,2022-12-22 02:00:00,2022-12-22 03:00:00,46.460000000000000000,ug.m-3,44.444925,26.127289
8201,2022-12-22 03:00:00,2022-12-22 04:00:00,46.460000000000000000,ug.m-3,44.444925,26.127289


### CO2

In [78]:
# import pandas as pd
# from pathlib import Path

# # Folder where your files are
# folder = Path("data/co2-2022-e1a")

# # Match all files named file1.parquet to file6.parquet
# files = sorted(folder.glob("SPO-*.parquet"))

# # Read and concatenate them
# dfs = [pd.read_parquet(f) for f in files]
# df_all = pd.concat(dfs, ignore_index=True)

# # Preview
# print(f"✅ Loaded {len(df_all)} rows from {len(files)} files")
# df_all.head(-1)

### PM2.5

In [70]:
import pandas as pd
from pathlib import Path

# Folder where your files are
folder = Path("../data/Checking data/pm25_2013-2023")

# Match all files named file1.parquet to file6.parquet
files = sorted(folder.glob("SPO-*.parquet"))

# Read and concatenate them
dfs = [pd.read_parquet(f) for f in files]
df_all = pd.concat(dfs, ignore_index=True)

# Extract station code from Samplingpoint
df_all["station_code"] = df_all["Samplingpoint"].str.extract(r"(RO\d{4}A)")

# Filter only for RO0070A (B-6)
df_b4 = df_all[df_all["station_code"] == "RO0066A"].copy()

print(f"✅ Found {len(df_b4)} rows")
df_b4.head(-4)

✅ Found 0 rows


Unnamed: 0,Samplingpoint,Pollutant,Start,End,Value,Unit,AggType,Validity,Verification,ResultTime,DataCapture,FkObservationLog,station_code


In [29]:
# 1. Selectăm doar coloanele necesare
df_pm25 = df_b4[["Start", "End", "Value", "Unit"]].copy()

# 2. Renumim coloanele
df_pm25.rename(columns={
    "Start": "start",
    "End": "end",
    "Value": "pm2_5",
    "Unit": "pm2_5_unit"
}, inplace=True)

# 3. Filtrăm valorile valide
df_pm25_filtered = df_pm25[df_pm25["pm2_5"] > 1e-6].copy()
df_pm25_filtered.reset_index(drop=True, inplace=True)

# 4. Adăugăm coordonatele stației
df_pm25_filtered["latitude"] = 44.4264
df_pm25_filtered["longitude"] = 26.1406

# 5. Salvăm dataset-ul
df_pm25_filtered.to_csv("../data/Checking data/pm25_bucharest_clean-2021-2022.csv", index=False)

print(f"✅ Cleaned dataset saved. Remaining rows: {len(df_pm25_filtered)}")
df_pm25_filtered.head()



✅ Cleaned dataset saved. Remaining rows: 16889


Unnamed: 0,start,end,pm2_5,pm2_5_unit,latitude,longitude
0,2021-12-31 23:00:00,2022-01-01 00:00:00,80.42,ug.m-3,44.4264,26.1406
1,2022-01-01 00:00:00,2022-01-01 01:00:00,97.23,ug.m-3,44.4264,26.1406
2,2022-01-01 01:00:00,2022-01-01 02:00:00,58.21,ug.m-3,44.4264,26.1406
3,2022-01-01 02:00:00,2022-01-01 03:00:00,41.24,ug.m-3,44.4264,26.1406
4,2022-01-01 03:00:00,2022-01-01 04:00:00,31.92,ug.m-3,44.4264,26.1406


### NO2

In [6]:
import pandas as pd
from pathlib import Path

# Folder where your files are
folder = Path("../data/Checking data/no2_2013-2023")

# Match all files named file1.parquet to file6.parquet
files = sorted(folder.glob("SPO-*.parquet"))

# Read and concatenate them
dfs = [pd.read_parquet(f) for f in files]
df_all = pd.concat(dfs, ignore_index=True)

# Extract station code from Samplingpoint
df_all["station_code"] = df_all["Samplingpoint"].str.extract(r"(RO\d{4}A)")

# Filter only for RO0070A (B-6)
df_b4 = df_all[df_all["station_code"] == "RO0070A"].copy()

print(f"✅ Found {len(df_b4)} rows for station")
df_b4.head(-4)

✅ Found 61367 rows for station


Unnamed: 0,Samplingpoint,Pollutant,Start,End,Value,Unit,AggType,Validity,Verification,ResultTime,DataCapture,FkObservationLog,station_code
315545,RO/SPO-RO0070A_00008_100,8,2015-12-31 23:00:00,2016-01-01 00:00:00,0E-18,ug.m-3,hour,-1,1,2017-10-13 10:48:36,,abe57c3b-04fa-4a38-8952-91a6c8a419c6,RO0070A
315546,RO/SPO-RO0070A_00008_100,8,2016-01-01 00:00:00,2016-01-01 01:00:00,0E-18,ug.m-3,hour,-1,1,2017-10-13 10:48:36,,abe57c3b-04fa-4a38-8952-91a6c8a419c6,RO0070A
315547,RO/SPO-RO0070A_00008_100,8,2016-01-01 01:00:00,2016-01-01 02:00:00,0E-18,ug.m-3,hour,-1,1,2017-10-13 10:48:36,,abe57c3b-04fa-4a38-8952-91a6c8a419c6,RO0070A
315548,RO/SPO-RO0070A_00008_100,8,2016-01-01 02:00:00,2016-01-01 03:00:00,0E-18,ug.m-3,hour,-1,1,2017-10-13 10:48:36,,abe57c3b-04fa-4a38-8952-91a6c8a419c6,RO0070A
315549,RO/SPO-RO0070A_00008_100,8,2016-01-01 03:00:00,2016-01-01 04:00:00,0E-18,ug.m-3,hour,-1,1,2017-10-13 10:48:36,,abe57c3b-04fa-4a38-8952-91a6c8a419c6,RO0070A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
376903,RO/SPO-RO0070A_00008_100,8,2022-12-31 14:00:00,2022-12-31 15:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,70aa51b7-9559-4d33-82ae-009d2e946ce2,RO0070A
376904,RO/SPO-RO0070A_00008_100,8,2022-12-31 15:00:00,2022-12-31 16:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,70aa51b7-9559-4d33-82ae-009d2e946ce2,RO0070A
376905,RO/SPO-RO0070A_00008_100,8,2022-12-31 16:00:00,2022-12-31 17:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,70aa51b7-9559-4d33-82ae-009d2e946ce2,RO0070A
376906,RO/SPO-RO0070A_00008_100,8,2022-12-31 17:00:00,2022-12-31 18:00:00,0E-18,ug.m-3,hour,-1,1,2023-09-26 14:37:36,,70aa51b7-9559-4d33-82ae-009d2e946ce2,RO0070A


In [9]:
# 1. Selectăm doar coloanele necesare
df_no2 = df_b4[["Start", "End", "Value", "Unit"]].copy()

# 2. Renumim coloanele
df_no2.rename(columns={
    "Start": "start",
    "End": "end",
    "Value": "no2",
    "Unit": "no2_unit"
}, inplace=True)

# 3. Filtrăm valorile valide
df_no2_filtered = df_no2[df_no2["no2"] > 1e-6].copy()
df_no2_filtered.reset_index(drop=True, inplace=True)

# 4. Adăugăm coordonatele stației
df_no2_filtered["latitude"] = 44.444925
df_no2_filtered["longitude"] = 26.127289

# 5. Salvăm dataset-ul
df_no2_filtered.to_csv("../data/Checking data/no2_bucharest_clean-ro0070a.csv", index=False)

print(f"✅ Cleaned dataset saved. Remaining rows: {len(df_no2_filtered)}")
df_no2_filtered.head()


✅ Cleaned dataset saved. Remaining rows: 52485


Unnamed: 0,start,end,no2,no2_unit,latitude,longitude
0,2016-06-29 02:00:00,2016-06-29 03:00:00,20.22,ug.m-3,44.444925,26.127289
1,2016-06-29 03:00:00,2016-06-29 04:00:00,18.93,ug.m-3,44.444925,26.127289
2,2016-06-29 07:00:00,2016-06-29 08:00:00,28.63,ug.m-3,44.444925,26.127289
3,2016-06-29 08:00:00,2016-06-29 09:00:00,28.4,ug.m-3,44.444925,26.127289
4,2016-06-29 09:00:00,2016-06-29 10:00:00,30.64,ug.m-3,44.444925,26.127289


In [15]:
### no PM25

import pandas as pd

# 1. Încarcă fișierele
df_pm10 = pd.read_csv("../data/Checking data/pm10_bucharest_clean-ro0070a.csv")
df_no2  = pd.read_csv("../data/Checking data/no2_bucharest_clean-ro0070a.csv")

# 3. Merge între PM10 și NO2
df_merged = pd.merge(df_pm10, df_no2, on=["start", "end"], how="inner")

# 4. Păstrăm doar o singură coloană pentru lat și lon
df_merged["latitude"] = df_merged["latitude_x"]
df_merged["longitude"] = df_merged["longitude_x"]

# 5. Eliminăm coloanele duplicate
df_merged.drop(columns=[
    "pm10_unit", "no2_unit",
    "latitude_x", "longitude_x",
    "latitude_y", "longitude_y"
], inplace=True)

# 6. Reordonăm coloanele (opțional)
df_merged = df_merged[["start", "end", "pm10", "no2", "latitude", "longitude"]]

# 7. Salvăm rezultatul
df_merged.to_csv("../data/bucharest_merged_pm10_no2_70a.csv", index=False)

# print(f"✅ Merged Dataset {len(df_merged)} rows.")
df_merged.head(-3)


Unnamed: 0,start,end,pm10,no2,latitude,longitude
0,2021-12-31 23:00:00,2022-01-01 00:00:00,58.79,25.78,44.444925,26.127289
1,2022-01-01 00:00:00,2022-01-01 01:00:00,53.68,29.63,44.444925,26.127289
2,2022-01-01 01:00:00,2022-01-01 02:00:00,50.13,22.50,44.444925,26.127289
3,2022-01-01 02:00:00,2022-01-01 03:00:00,38.55,23.24,44.444925,26.127289
4,2022-01-01 03:00:00,2022-01-01 04:00:00,32.87,25.12,44.444925,26.127289
...,...,...,...,...,...,...
7768,2022-12-20 22:00:00,2022-12-20 23:00:00,41.39,42.09,44.444925,26.127289
7769,2022-12-20 23:00:00,2022-12-21 00:00:00,39.22,29.53,44.444925,26.127289
7770,2022-12-21 00:00:00,2022-12-21 01:00:00,41.27,19.78,44.444925,26.127289
7771,2022-12-21 01:00:00,2022-12-21 02:00:00,38.43,16.22,44.444925,26.127289


In [None]:
# 

In [16]:
# import pandas as pd

# # 1. Încarcă fișierele
# df_pm10 = pd.read_csv("../data/Checking data/pm10_bucharest_clean-2021-2023.csv")
# df_pm25 = pd.read_csv("../data/Checking data/pm25_bucharest_clean-2021-2023.csv")
# df_no2  = pd.read_csv("../data/Checking data/no2_bucharest_clean-2021-2023.csv")

# # 2. Merge între PM10 și PM2.5 (doar dacă ambele au același interval)
# df_merge_1 = pd.merge(df_pm10, df_pm25, on=["start", "end"], how="inner")
# # 3. Merge între (PM10+PM2.5) și NO2
# df_merged = pd.merge(df_merge_1, df_no2, on=["start", "end"], how="inner")

# # 4. Păstrăm doar o singură coloană pentru lat și lon
# df_merged["latitude"] = df_merged["latitude_x"]
# df_merged["longitude"] = df_merged["longitude_x"]

# # 5. Eliminăm coloanele duplicate
# df_merged.drop(columns=[
#     "pm10_unit", "pm2_5_unit", "no2_unit",
#     "latitude_x", "longitude_x",
#     "latitude_y", "longitude_y"
# ], inplace=True)

# # 6. Reordonăm coloanele (opțional)
# df_merged = df_merged[["start", "end", "pm10", "pm2_5", "no2", "latitude", "longitude"]]

# # 7. Salvăm rezultatul
# df_merged.to_csv("..data/bucharest_merged_pm10_pm25_no2.csv", index=False)

# # print(f"✅ Merged Dataset {len(df_merged)} rows.")
# df_merged.head(-3)



In [19]:
# 8. Încarcă datele meteo
df_weather = pd.read_csv("../data/meteo_data_2022/weather_bucharest_2022_ro070a.csv")

# 9. Asigură-te că 'start' este datetime în ambele
df_merged["start"] = pd.to_datetime(df_merged["start"])
df_weather["start"] = pd.to_datetime(df_weather["start"])

# 10. Facem merge pe coloana 'start'
df_final = pd.merge(df_merged, df_weather, on="start", how="inner")

# Reorder columns
df_final = df_final[[
    "start", "end",
    "pm10", "no2",
    "temperature", "humidity", "wind_speed", "pressure",
    "longitude", "latitude"
]]
# (Optional) Save again if needed
df_final.to_csv("../data/Station-RO0070A-2022/Meteo+PFactors.csv", index=False)

print(f"✅ Final dataset combinat: {len(df_final)} rânduri.")
df_final.head(20)


✅ Final dataset combinat: 7774 rânduri.


Unnamed: 0,start,end,pm10,no2,temperature,humidity,wind_speed,pressure,longitude,latitude
0,2022-01-01 00:00:00,2022-01-01 01:00:00,53.68,29.63,3.8,99.0,3.6,1017.7,26.127289,44.444925
1,2022-01-01 01:00:00,2022-01-01 02:00:00,50.13,22.5,3.9,99.0,7.2,1017.2,26.127289,44.444925
2,2022-01-01 02:00:00,2022-01-01 03:00:00,38.55,23.24,3.8,100.0,7.2,1017.0,26.127289,44.444925
3,2022-01-01 03:00:00,2022-01-01 04:00:00,32.87,25.12,3.9,99.0,7.2,1016.7,26.127289,44.444925
4,2022-01-01 04:00:00,2022-01-01 05:00:00,33.31,24.12,3.7,98.0,7.2,1016.2,26.127289,44.444925
5,2022-01-01 05:00:00,2022-01-01 06:00:00,36.34,23.99,2.9,100.0,7.2,1016.2,26.127289,44.444925
6,2022-01-01 06:00:00,2022-01-01 07:00:00,39.3,26.76,3.0,100.0,3.6,1016.8,26.127289,44.444925
7,2022-01-01 07:00:00,2022-01-01 08:00:00,42.64,25.75,3.7,100.0,3.6,1016.8,26.127289,44.444925
8,2022-01-01 08:00:00,2022-01-01 09:00:00,44.12,18.94,4.3,95.0,7.2,1017.5,26.127289,44.444925
9,2022-01-01 09:00:00,2022-01-01 10:00:00,44.7,23.2,6.6,87.0,3.6,1017.2,26.127289,44.444925


In [12]:
print(df_final.loc[5000:5020])

                   start                  end   pm10  pm2_5    no2  \
5000 2022-08-27 17:00:00  2022-08-27 18:00:00  93.76  17.68   3.46   
5001 2022-08-27 18:00:00  2022-08-27 19:00:00  88.38  17.38   7.02   
5002 2022-08-27 19:00:00  2022-08-27 20:00:00  66.14  16.05  28.77   
5003 2022-08-27 21:00:00  2022-08-27 22:00:00  71.61  24.50  60.19   
5004 2022-08-27 22:00:00  2022-08-27 23:00:00  74.43  26.79  68.00   
5005 2022-08-27 23:00:00  2022-08-28 00:00:00  83.19  25.02  66.27   
5006 2022-08-28 00:00:00  2022-08-28 01:00:00  77.76  25.29  40.17   
5007 2022-08-28 01:00:00  2022-08-28 02:00:00  67.28  24.63  34.34   
5008 2022-08-28 02:00:00  2022-08-28 03:00:00  70.14  22.18  30.59   
5009 2022-08-28 03:00:00  2022-08-28 04:00:00  66.03  19.35  23.01   
5010 2022-08-28 04:00:00  2022-08-28 05:00:00  63.10  19.57  20.03   
5011 2022-08-28 05:00:00  2022-08-28 06:00:00  57.25  18.94  17.53   
5012 2022-08-28 06:00:00  2022-08-28 07:00:00  56.52  19.75  17.84   
5013 2022-08-28 07:0

In [53]:
computing = df_final[['pm10', 'pm2_5', 'start', 'end']]

In [64]:
computing[100:120]

Unnamed: 0,pm10,pm2_5,start,end,month
100,39.27,34.31,2022-01-06 08:00:00,2022-01-06 09:00:00,2022-01
101,36.9,30.83,2022-01-06 09:00:00,2022-01-06 10:00:00,2022-01
102,32.36,25.0,2022-01-06 10:00:00,2022-01-06 11:00:00,2022-01
103,24.82,17.19,2022-01-06 11:00:00,2022-01-06 12:00:00,2022-01
104,22.0,14.57,2022-01-06 12:00:00,2022-01-06 13:00:00,2022-01
105,21.89,14.5,2022-01-06 13:00:00,2022-01-06 14:00:00,2022-01
106,26.29,16.15,2022-01-06 14:00:00,2022-01-06 15:00:00,2022-01
107,26.94,17.39,2022-01-06 15:00:00,2022-01-06 16:00:00,2022-01
108,26.9,18.15,2022-01-06 16:00:00,2022-01-06 17:00:00,2022-01
109,29.68,22.07,2022-01-06 17:00:00,2022-01-06 18:00:00,2022-01


In [63]:
#start in datetime format
computing = computing.copy()
computing.loc[:, 'start'] = pd.to_datetime(computing['start'])

computing.loc[:, 'month'] = computing['start'].dt.to_period('M')

monthly_avg = computing.groupby('month')[['pm10', 'pm2_5']].mean()

monthly_avg['pm25_from_pm10'] = monthly_avg['pm2_5'] / monthly_avg['pm10']

monthly_avg


  computing.loc[:, 'start'] = pd.to_datetime(computing['start'])
  computing.loc[:, 'month'] = computing['start'].dt.to_period('M')


Unnamed: 0_level_0,pm10,pm2_5,pm25_from_pm10
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01,22.846237,17.097931,0.748392
2022-02,32.951121,24.356293,0.739164
2022-03,32.893948,21.110435,0.641773
2022-04,22.089339,12.023385,0.544307
2022-05,18.830781,8.328882,0.442301
2022-06,18.92042,10.368275,0.547994
2022-07,21.335966,10.649732,0.499145
2022-08,27.440266,11.536513,0.420423
2022-09,19.279074,9.963588,0.516808
2022-10,33.401921,17.494209,0.523749


In [50]:
pm25_from_pm10

0.5941797591431658

In [20]:
import pandas as pd

# # 1. Copie de lucru
# df_merged_pm25 = df_merged.copy()
df_merged_pm25 = pd.read_csv("../data/Station-RO0070A-2022/Meteo+PFactors.csv")
# 2. Conversie în datetime
df_merged_pm25['start'] = pd.to_datetime(df_merged_pm25['start'], errors='coerce')

# 3. Extragem luna în format YYYY-MM (ca Period)
df_merged_pm25['month'] = df_merged_pm25['start'].dt.to_period('M')

# 4. Dicționar coeficienți lunar pm2.5 / pm10
pm25_ratios = {
    "2022-01": 0.748392,
    "2022-02": 0.739164,
    "2022-03": 0.641773,
    "2022-04": 0.544307,
    "2022-05": 0.442301,
    "2022-06": 0.547994,
    "2022-07": 0.499145,
    "2022-08": 0.420423,
    "2022-09": 0.516808,
    "2022-10": 0.523749,
    "2022-11": 0.648200,
    "2022-12": 0.753550,
}

# 5. Filtrăm doar lunile care există în dicționar
df_merged_pm25 = df_merged_pm25[df_merged_pm25['month'].astype(str).isin(pm25_ratios.keys())]

# 6. Calculăm pm2_5 DOAR pentru rândurile păstrate
df_merged_pm25['pm2_5'] = df_merged_pm25.apply(
    lambda row: row['pm10'] * pm25_ratios[str(row['month'])],
    axis=1
)

# 7. Eliminăm coloana auxiliară 'month'
df_merged_pm25.drop(columns=['month'], inplace=True)


In [21]:
df_merged_pm25

Unnamed: 0,start,end,pm10,no2,temperature,humidity,wind_speed,pressure,longitude,latitude,pm2_5
0,2022-01-01 00:00:00,2022-01-01 01:00:00,53.68,29.63,3.8,99.0,3.6,1017.7,26.127289,44.444925,40.173683
1,2022-01-01 01:00:00,2022-01-01 02:00:00,50.13,22.50,3.9,99.0,7.2,1017.2,26.127289,44.444925,37.516891
2,2022-01-01 02:00:00,2022-01-01 03:00:00,38.55,23.24,3.8,100.0,7.2,1017.0,26.127289,44.444925,28.850512
3,2022-01-01 03:00:00,2022-01-01 04:00:00,32.87,25.12,3.9,99.0,7.2,1016.7,26.127289,44.444925,24.599645
4,2022-01-01 04:00:00,2022-01-01 05:00:00,33.31,24.12,3.7,98.0,7.2,1016.2,26.127289,44.444925,24.928938
...,...,...,...,...,...,...,...,...,...,...,...
7769,2022-12-21 01:00:00,2022-12-21 02:00:00,38.43,16.22,-3.4,99.0,7.2,1029.7,26.127289,44.444925,28.958927
7770,2022-12-21 02:00:00,2022-12-21 03:00:00,34.46,15.73,-3.4,99.0,7.2,1028.9,26.127289,44.444925,25.967333
7771,2022-12-21 03:00:00,2022-12-21 04:00:00,33.99,15.48,-3.3,99.0,7.2,1028.2,26.127289,44.444925,25.613165
7772,2022-12-21 04:00:00,2022-12-21 05:00:00,35.19,17.65,-3.3,99.0,10.8,1027.5,26.127289,44.444925,26.517425


In [7]:
# 8. Încarcă datele meteo
df_weather = pd.read_csv("../data/meteo_data_2022/weather_bucharest_2022_ro067.csv")

# 9. Asigură-te că 'start' este datetime în ambele
df_merged_pm25["start"] = pd.to_datetime(df_merged_pm25["start"])
df_weather["start"] = pd.to_datetime(df_weather["start"])

# 10. Facem merge pe coloana 'start'
df_final = pd.merge(df_merged_pm25, df_weather, on="start", how="inner")

# Reorder columns
df_final = df_final[[
    "start", "end",
    "pm10", "pm2_5", "no2",
    "temperature", "humidity", "wind_speed", "pressure",
    "longitude", "latitude"
]]
# (Optional) Save again if needed
df_final.to_csv("../data/Station-RO0067A-2022/Meteo+PFactors", index=False)

print(f"✅ Final dataset combinat: {len(df_final)} rânduri.")
df_final.head(20)


KeyError: "['temperature', 'humidity', 'wind_speed', 'pressure'] not in index"

In [6]:
df_weather = pd.read_csv("../data/meteo_data_2022/weather_bucharest_2022_ro067.csv")
df_weather

Unnamed: 0,start,temperature,humidity,wind_speed,pressure
0,2022-01-01 00:00:00,3.8,99.0,3.6,1017.7
1,2022-01-01 01:00:00,3.9,99.0,7.2,1017.2
2,2022-01-01 02:00:00,3.8,100.0,7.2,1017.0
3,2022-01-01 03:00:00,3.9,99.0,7.2,1016.7
4,2022-01-01 04:00:00,3.7,98.0,7.2,1016.2
...,...,...,...,...,...
8754,2022-12-31 19:00:00,5.7,97.0,3.6,1029.6
8755,2022-12-31 20:00:00,5.0,99.0,3.6,1029.7
8756,2022-12-31 21:00:00,5.1,98.0,7.2,1030.1
8757,2022-12-31 22:00:00,5.9,93.0,7.2,1030.2


In [8]:
df_merged_pm25

Unnamed: 0,start,end,pm10,no2,temperature,humidity,wind_speed,pressure,longitude,latitude,pm2_5
0,2022-01-01 00:00:00,2022-01-01 01:00:00,57.34,30.48,3.8,99.0,3.6,1017.7,26.127289,44.444925,42.912797
1,2022-01-01 01:00:00,2022-01-01 02:00:00,54.32,30.11,3.9,99.0,7.2,1017.2,26.127289,44.444925,40.652653
2,2022-01-01 02:00:00,2022-01-01 03:00:00,29.33,26.94,3.8,100.0,7.2,1017.0,26.127289,44.444925,21.950337
3,2022-01-01 03:00:00,2022-01-01 04:00:00,27.50,31.09,3.9,99.0,7.2,1016.7,26.127289,44.444925,20.580780
4,2022-01-01 04:00:00,2022-01-01 05:00:00,27.82,30.76,3.7,98.0,7.2,1016.2,26.127289,44.444925,20.820265
...,...,...,...,...,...,...,...,...,...,...,...
8353,2022-12-31 19:00:00,2022-12-31 20:00:00,52.22,55.42,5.7,97.0,3.6,1029.6,26.127289,44.444925,39.350381
8354,2022-12-31 20:00:00,2022-12-31 21:00:00,50.60,61.44,5.0,99.0,3.6,1029.7,26.127289,44.444925,38.129630
8355,2022-12-31 21:00:00,2022-12-31 22:00:00,41.54,52.71,5.1,98.0,7.2,1030.1,26.127289,44.444925,31.302467
8356,2022-12-31 22:00:00,2022-12-31 23:00:00,46.29,43.59,5.9,93.0,7.2,1030.2,26.127289,44.444925,34.881830


In [22]:
# Reorder columns
df_final = df_merged_pm25[[
    "start", "end",
    "pm10", "pm2_5", "no2",
    "temperature", "humidity", "wind_speed", "pressure",
    "longitude", "latitude"
]]
df_final

Unnamed: 0,start,end,pm10,pm2_5,no2,temperature,humidity,wind_speed,pressure,longitude,latitude
0,2022-01-01 00:00:00,2022-01-01 01:00:00,53.68,40.173683,29.63,3.8,99.0,3.6,1017.7,26.127289,44.444925
1,2022-01-01 01:00:00,2022-01-01 02:00:00,50.13,37.516891,22.50,3.9,99.0,7.2,1017.2,26.127289,44.444925
2,2022-01-01 02:00:00,2022-01-01 03:00:00,38.55,28.850512,23.24,3.8,100.0,7.2,1017.0,26.127289,44.444925
3,2022-01-01 03:00:00,2022-01-01 04:00:00,32.87,24.599645,25.12,3.9,99.0,7.2,1016.7,26.127289,44.444925
4,2022-01-01 04:00:00,2022-01-01 05:00:00,33.31,24.928938,24.12,3.7,98.0,7.2,1016.2,26.127289,44.444925
...,...,...,...,...,...,...,...,...,...,...,...
7769,2022-12-21 01:00:00,2022-12-21 02:00:00,38.43,28.958927,16.22,-3.4,99.0,7.2,1029.7,26.127289,44.444925
7770,2022-12-21 02:00:00,2022-12-21 03:00:00,34.46,25.967333,15.73,-3.4,99.0,7.2,1028.9,26.127289,44.444925
7771,2022-12-21 03:00:00,2022-12-21 04:00:00,33.99,25.613165,15.48,-3.3,99.0,7.2,1028.2,26.127289,44.444925
7772,2022-12-21 04:00:00,2022-12-21 05:00:00,35.19,26.517425,17.65,-3.3,99.0,10.8,1027.5,26.127289,44.444925


In [23]:
df_final.to_csv("../data/Station-RO0070A-2022/Meteo+PFactors.csv", index=False)
