In [20]:
import pandas as pd
import re
from meteostat import Hourly, Point
from datetime import datetime

In [5]:
file_path = "belgium_provinces_centroids.csv" 
locations_df = pd.read_csv(file_path)

def extract_coordinates(point_str):
    match = re.match(r"POINT \(([-\d.]+) ([-\d.]+)\)", point_str)
    if match:
        lon, lat = float(match.group(1)), float(match.group(2))
        return lat, lon
    return None, None 

# Apply function to extract coordinates
locations_df[['Latitude', 'Longitude']] = locations_df['centroid'].apply(lambda x: pd.Series(extract_coordinates(x)))

start_date = datetime(2014, 1, 1)
end_date = datetime(2025, 1, 1)

In [None]:
all_data = pd.DataFrame()

# Loop through each province and fetch hourly weather data
for _, row in locations_df.iterrows():
    lat, lon = row["Latitude"], row["Longitude"]
    province = row["NAME_2"] 
    
    if pd.notna(lat) and pd.notna(lon): 
        print(f"Fetching data for {province} ({lat}, {lon})...")
        location = Point(lat, lon)
        data = Hourly(location, start=start_date, end=end_date).fetch()
        
        data["Province"] = province
        
        all_data = pd.concat([all_data, data])

output_file = "hourly_weather_data.csv"
all_data.to_csv(output_file, index=True)

print(f"✅ Hourly weather data successfully saved to {output_file}!")

Fetching data for Bruxelles (50.83642580516115, 4.367414882050033)...
Fetching data for Antwerpen (51.231900712244745, 4.721043779498864)...
Fetching data for Limburg (50.98831159244354, 5.4357209224443475)...
Fetching data for Oost-Vlaanderen (51.03629347952689, 3.8188605810442007)...
Fetching data for Vlaams Brabant (50.87303943005528, 4.59072072950776)...
Fetching data for West-Vlaanderen (51.01019740435232, 3.0620446668608117)...
Fetching data for Brabant Wallon (50.66605965845173, 4.589736973914637)...
Fetching data for Hainaut (50.46387637972707, 3.9681364929631004)...
Fetching data for Liège (50.518876085615986, 5.7372263688290746)...
Fetching data for Luxembourg (49.95849015160492, 5.516735682549194)...
Fetching data for Namur (50.252686235186815, 4.862346452029854)...
✅ Hourly weather data successfully saved to hourly_weather_data.csv!


In [19]:
df = pd.read_csv('hourly_weather_data.csv')

print(df.isnull().sum())

display(df)

temp        0
dwpt        0
rhum        0
prcp        0
snow        0
wdir        0
wspd        0
pres        0
Province    0
date        0
hour        0
dtype: int64


Unnamed: 0,temp,dwpt,rhum,prcp,snow,wdir,wspd,pres,Province,date,hour
0,6.3,4.6,89.0,0.0,0.0,210.0,18.0,1008.8,0,2014-01-01,00:00:00
1,6.6,4.8,88.0,0.0,0.0,220.0,18.0,1009.2,0,2014-01-01,01:00:00
2,6.6,4.1,84.0,0.0,0.0,230.0,18.0,1009.6,0,2014-01-01,02:00:00
3,6.0,3.0,81.0,0.0,0.0,210.0,20.5,1010.0,0,2014-01-01,03:00:00
4,5.8,3.3,84.0,0.0,0.0,200.0,18.0,1009.5,0,2014-01-01,04:00:00
...,...,...,...,...,...,...,...,...,...,...,...
1052307,1.5,1.4,99.0,0.0,0.0,190.0,32.4,1025.7,10,2024-12-31,20:00:00
1052308,1.7,2.0,100.0,0.0,0.0,190.0,28.8,1025.2,10,2024-12-31,21:00:00
1052309,2.2,2.2,100.0,0.0,0.0,200.0,28.8,1024.7,10,2024-12-31,22:00:00
1052310,2.0,2.0,100.0,0.0,0.0,210.0,32.4,1024.7,10,2024-12-31,23:00:00


In [11]:
df.drop(['coco', 'wpgt'], axis=1, inplace=True)

df['snow'].fillna(0, inplace=True)
df['prcp'].fillna(0, inplace=True)

df.to_csv('hourly_weather_data.csv', index=False)

In [None]:
import requests
import pandas as pd
import time

url = "https://archive-api.open-meteo.com/v1/archive"

df = pd.read_csv("hourly_weather_data.csv")

hourly_params = [
    "windgusts_10m",
    "weather_code",
    "visibility"
]

full = pd.DataFrame()
for _, row in locations_df.iterrows():
    lat, lon = row["Latitude"], row["Longitude"]


    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": "2014-01-01",
        "end_date": "2025-01-01",
        "hourly": hourly_params,
        "timezone": "Europe/Brussels"
    }
    response = requests.get(url, params=params)
    
    data = response.json()
    temp = pd.DataFrame(data["hourly"])

    temp = pd.DataFrame(data["hourly"])
    full = pd.concat([full, temp])
    #Delay needed due otherwise the requests after the first will fail due to limit
    time.sleep(100)




ValueError: cannot reindex on an axis with duplicate labels

In [14]:
print(full.isnull().sum())
display(full)

time                   0
windgusts_10m          0
weather_code           0
visibility       1061016
dtype: int64


Unnamed: 0,time,windgusts_10m,weather_code,visibility
0,2014-01-01T00:00,40.7,51,
1,2014-01-01T01:00,37.1,3,
2,2014-01-01T02:00,34.9,1,
3,2014-01-01T03:00,34.9,0,
4,2014-01-01T04:00,35.3,0,
...,...,...,...,...
96451,2025-01-01T19:00,57.6,3,
96452,2025-01-01T20:00,61.2,3,
96453,2025-01-01T21:00,61.2,51,
96454,2025-01-01T22:00,62.3,53,


In [None]:
df = pd.read_csv('hourly_weather_data.csv')

df["time"] = pd.to_datetime(df["time"])
df["date"] = df["time"].dt.strftime("%Y-%m-%d")
df["hour"] = df["time"].dt.strftime("%H:%M:%S")
df.drop(columns=["time"], inplace=True)


KeyError: 'time'

In [18]:
df = df.dropna()
df.to_csv('hourly_weather_data.csv', index=False)

In [None]:
file_path = "belgium_provinces_centroids.csv" 
locations_df = pd.read_csv(file_path)

province_list = locations_df['NAME_2'].unique()
province_mapping = {name: idx for idx, name in enumerate(province_list)}

# Read the CSV file

province_mapping_df = pd.DataFrame(list(province_mapping.items()), columns=['province', 'ID'])
province_mapping_df = province_mapping_df[['ID', 'province']]
province_mapping_df['Region'] = 0
province_mapping_df.at[0, 'Region'] = 3
province_mapping_df.at[1, 'Region'] = 2
province_mapping_df.at[2, 'Region'] = 2
province_mapping_df.at[3, 'Region'] = 2
province_mapping_df.at[4, 'Region'] = 2
province_mapping_df.at[5, 'Region'] = 2
province_mapping_df.at[6, 'Region'] = 1
province_mapping_df.at[7, 'Region'] = 1
province_mapping_df.at[8, 'Region'] = 1
province_mapping_df.at[9, 'Region'] = 1
province_mapping_df.at[10, 'Region'] = 1
province_mapping_df.to_csv('province_mapping.csv', index=False)

df.to_csv('hourly_weather_data.csv', index=False)

