In [None]:
import copy
import datetime
from datetime import datetime
import gc
import glob
import heapq
import io
import itertools
from itertools import combinations, count, product
import math
import os
import pickle
import random
import re
import subprocess
import time
import zipfile

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [None]:
# Remote locations of the hourly files and of the station catalogue.
BASE_HOURLY = "http://cli.fusio.net/cli/climate_data/webdata/hourly/"
STATION_LIST_URL = "http://cli.fusio.net/cli/climate_data/webdata/StationDetails.csv"
# Reference year used by the station filter further down.
SINCE = 2014

# 1) Download & read the StationDetails CSV
# Parser settings kept together: bad lines are skipped with a warning and the
# literal "(null)" is read as a missing value.
station_csv_options = dict(
    sep=',',
    engine='python',
    quotechar='"',
    on_bad_lines='warn',
    na_values=["(null)"],
)
meta = pd.read_csv(STATION_LIST_URL, **station_csv_options)

# 2) Normalize column names robustly
# Lookup from the lower-cased, stripped header text to the header exactly as it
# appears in `meta` (a later duplicate overrides an earlier one).
lower = {}
for original_header in meta.columns:
    lower[original_header.lower().strip()] = original_header


def pick(*cands):
    """Return the real `meta` column name for the first candidate found in `lower`.

    Candidates are compared as given, so they should already be lower-case.

    Raises:
        KeyError: if none of the candidates is a key of `lower`.
    """
    hits = [k for k in cands if k in lower]
    if not hits:
        raise KeyError(f"None of {cands} found in station details columns.")
    return lower[hits[0]]

# Resolve the catalogue's actual header names, then map them onto fixed names.
id_col    = pick('station number', 'station no', 'station', 'stationid', 'id')
name_col  = pick('station name', 'name')
open_col  = pick('open year', 'opened', 'openyear')
close_col = pick('close year', 'closed', 'closeyear')

column_renames = {
    id_col:    'station_number',
    name_col:  'station_name',
    open_col:  'open_year',
    close_col: 'close_year',
}
meta = meta.rename(columns=column_renames)

# 3) Keep stations that were already open by SINCE and had not closed before it.
#    Values that cannot be parsed as numbers become missing.
meta['station_number'] = pd.to_numeric(meta['station_number'], errors='coerce').astype('Int64')
for year_col in ('open_year', 'close_year'):
    meta[year_col] = pd.to_numeric(meta[year_col], errors='coerce')

opened_by_since = meta['open_year'] <= SINCE
# A missing close year is treated as "not closed".
not_closed_before = meta['close_year'].isna() | (meta['close_year'] >= SINCE)
mask = opened_by_since & not_closed_before
meta_since2014 = meta.loc[mask].dropna(subset=['station_number']).copy()

# 4) Helpers to guess remote filenames and fetch CSV/ZIP
def name_variants(name: str):
    """Return two filename-style spellings of a station name.

    Returns:
        (upper_underscore, camel_nospace), e.g. ("ROCHES_POINT", "RochesPoint").
        Runs of characters other than ASCII letters and digits are collapsed to
        a single "_" in the first form and removed in the second.
    """
    non_alnum = re.compile(r'[^A-Za-z0-9]+')
    shouting = non_alnum.sub('_', name.upper()).strip('_')
    squashed = non_alnum.sub('', name.title())
    return shouting, squashed

def try_read_csv_bytes(raw: bytes) -> pd.DataFrame | None:
    try:
        return pd.read_csv(io.BytesIO(raw), low_memory=False)
    except Exception:
        return None

def _read_csv_from_zip(payload: bytes) -> pd.DataFrame | None:
    """Parse the preferred CSV member of a ZIP payload; None if no CSV member parses."""
    with zipfile.ZipFile(io.BytesIO(payload)) as zf:
        names = zf.namelist()
        # Prefer a member that looks like *_hourly.csv; otherwise take the first CSV.
        prefer = [n for n in names if n.lower().endswith("_hourly.csv")]
        target = prefer[0] if prefer else next((n for n in names if n.lower().endswith(".csv")), None)
        if not target:
            return None
        with zf.open(target) as fh:
            data = fh.read()
    return try_read_csv_bytes(data)


def fetch_station_hourly(code: int, name: str) -> pd.DataFrame | None:
    """Download one station's hourly table by trying a list of guessed filenames.

    Candidates are tried in order under BASE_HOURLY: the three CSV names
    ({code}_{UPPER_NAME}, {code}_{CamelName}, bare {code}) and then the same
    three as ZIP archives. The first candidate that downloads and parses wins.

    Args:
        code: station number used as the filename prefix.
        name: station name; spelled two ways via name_variants().

    Returns:
        The parsed DataFrame, or None when no candidate could be fetched and parsed.
    """
    uu, cm = name_variants(name)
    stems = (f"{code}_{uu}_hourly", f"{code}_{cm}_hourly", f"{code}_hourly")
    # dict.fromkeys drops repeated names (both spellings can coincide) while
    # keeping the CSV-before-ZIP order, so no URL is requested twice.
    candidates = list(dict.fromkeys(
        f"{stem}{ext}" for ext in (".csv", ".zip") for stem in stems
    ))

    # The context manager closes the session (and its pooled connections) on
    # every exit path, including the early return.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "python-requests"})
        for fn in candidates:
            try:
                r = session.get(BASE_HOURLY + fn, timeout=30)
                if r.status_code != 200 or not r.content:
                    continue
                if fn.endswith(".csv"):
                    df = try_read_csv_bytes(r.content)
                else:
                    df = _read_csv_from_zip(r.content)
            except Exception:
                # Deliberately best-effort: a network or archive error on one
                # candidate just means we try the next one.
                continue
            if df is not None:
                return df
    return None

# 5) Download hourly datasets and assemble `stations` dict
stations = {}
success, tried = 0, 0

# One attempt per distinct (number, name) pair in the filtered catalogue.
station_pairs = meta_since2014[['station_number', 'station_name']].drop_duplicates()

for _, row in station_pairs.iterrows():
    code = int(row['station_number'])
    tried += 1
    df = fetch_station_hourly(code, str(row['station_name']))
    if df is None:
        continue

    # Standardize datetime column: first recognised name wins; skip the station if none.
    dt_col = next(
        (cand for cand in ('date', 'datetime', 'time', 'timestamp') if cand in df.columns),
        None,
    )
    if dt_col is None:
        continue

    # Unparseable timestamps become NaT and their rows are dropped before indexing.
    df[dt_col] = pd.to_datetime(df[dt_col], format="%d/%m/%Y %H:%M", errors='coerce')
    df = df.dropna(subset=[dt_col]).set_index(dt_col).sort_index()

    # Keep only the fields used downstream; a station with none of them is skipped.
    keep = [c for c in ['temp', 'wdsp', 'wddir'] if c in df.columns]
    if keep:
        stations[code] = df[keep].copy()
        success += 1

print(f"Tried {tried} stations; loaded {success} station DataFrames with operation since {SINCE}.")

In [None]:
# In case you already have the station CSV files locally

'''df_532_DUBLIN_AIRPORT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\532_DUBLIN_AIRPORT_hourly.csv",
    low_memory=False
)
df_1875_ATHENRY = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1875_ATHENRY_hourly.csv",
    low_memory=False
)
df_1275_MARKREE = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1275_MARKREE_hourly.csv",
    low_memory=False
)
df_1475_GURTEEN = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1475_GURTEEN_hourly.csv",
    low_memory=False
)
df_175_PHOENIX_PARK = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\175_PHOENIX_PARK_hourly.csv",
    low_memory=False
)
df_1375_DUNSANY = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1375_DUNSANY_hourly.csv",
    low_memory=False
)
df_1175_NEWPORT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1175_NEWPORT_hourly.csv",
    low_memory=False
)
df_1975_MT_DILLON = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1975_MT_DILLON_hourly.csv",
    low_memory=False
)
df_3402_SHERKIN_ISLAND = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\3402_SHERKIN_ISLAND_hourly.csv",
    low_memory=False
)
df_775_SherkinIsland = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\775_SherkinIsland_hourly.csv",
    low_memory=False
)
df_675_BALLYHAISE = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\675_BALLYHAISE_hourly.csv",
    low_memory=False
)
df_575_MOORE_PARK = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\575_MOORE_PARK_hourly.csv",
    low_memory=False
)
df_375_OAK_PARK = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\375_OAK_PARK_hourly.csv",
    low_memory=False
)
df_2125_MACE_HEAD = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\2125_MACE_HEAD_hourly.csv",
    low_memory=False
)
df_275_MACE_HEAD = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\275_MACE_HEAD_hourly.csv",
    low_memory=False
)
df_5237_FINNER = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\5237_FINNER_hourly.csv",
    low_memory=False
)
df_2075_FINNER = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\2075_FINNER_hourly.csv",
    low_memory=False
)
df_4935_KNOCK_AIRPORT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\4935_KNOCK_AIRPORT_hourly.csv",
    low_memory=False
)
df_875_MULLINGAR = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\875_MULLINGAR_hourly.csv",
    low_memory=False
)
df_1075_ROCHES_POINT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1075_ROCHES_POINT_hourly.csv",
    low_memory=False
)
df_1004_ROCHES_POINT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1004_ROCHES_POINT_hourly.csv",
    low_memory=False
)
df_2727_CLAREMORRIS = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\2727_CLAREMORRIS_hourly.csv",
    low_memory=False
)
df_2175_CLAREMORRIS = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\2175_CLAREMORRIS_hourly.csv",
    low_memory=False
)
df_3723_CASEMENT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\3723_CASEMENT_hourly.csv",
    low_memory=False
)
df_3904_CORK_AIRPORT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\3904_CORK_AIRPORT_hourly.csv",
    low_memory=False
)
df_2375_BELMULLET = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\2375_BELMULLET_hourly.csv",
    low_memory=False
)
df_1575_MALIN_HEAD = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\1575_MALIN_HEAD_hourly.csv",
    low_memory=False
)
df_2275_VALENTIA_OBSERVATORY = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\2275_VALENTIA_OBSERVATORY_hourly.csv",
    low_memory=False
)
df_518_SHANNON_AIRPORT = pd.read_csv(
    r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New\518_SHANNON_AIRPORT_hourly.csv",
    low_memory=False
)'''

In [None]:
df_1875_ATHENRY

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl,ind.3,wdsp,ind.4,wddir
0,25/02/2010 01:00,-1,,4,,4,,,,,,7,,7,
1,25/02/2010 02:00,-1,,4,,4,,,,,,7,,7,
2,25/02/2010 03:00,-1,,4,,4,,,,,,7,,7,
3,25/02/2010 04:00,-1,,4,,4,,,,,,7,,7,
4,25/02/2010 05:00,-1,,4,,4,,,,,,7,,7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133051,30/04/2025 20:00,0,0,0,19.3,0,15.5,12.6,14.6,65,1018.5,2,4,2,240
133052,30/04/2025 21:00,0,0,0,16.9,0,14.6,12.7,14.7,76,1018.6,2,2,2,210
133053,30/04/2025 22:00,0,0,0,13.7,0,12.5,11.5,13.6,86,1018.8,2,2,2,360
133054,30/04/2025 23:00,0,0,0,13.2,0,11.4,9.6,11.9,78,1018.9,2,5,2,340


In [5]:
df_175_PHOENIX_PARK

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl
0,16/08/2003 01:00,0,0,0,9.2,0,8.9,8.5,11.1,95,1021.9
1,16/08/2003 02:00,0,0,0,9,0,8.7,8.5,11.1,96,1021.7
2,16/08/2003 03:00,0,0,0,8.2,0,8,7.7,10.5,96,1021.2
3,16/08/2003 04:00,0,0,0,8.4,0,8.1,7.9,10.7,97,1021.2
4,16/08/2003 05:00,0,0,0,7.7,0,7.5,7.3,10.2,97,1021.1
...,...,...,...,...,...,...,...,...,...,...,...
190291,30/04/2025 20:00,0,0,0,13.6,0,11.3,9,11.5,73,1019.8
190292,30/04/2025 21:00,0,0,0,12.1,0,10.5,8.8,11.3,80,1019.7
190293,30/04/2025 22:00,0,0,0,11.3,0,10.2,9.1,11.6,86,1019.7
190294,30/04/2025 23:00,0,0,0,10.1,0,9.5,8.8,11.3,91,1019.5


In [6]:
df_875_MULLINGAR

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl,ind.3,wdsp,ind.4,wddir
0,07/11/1973 01:00,0,0,0,5,0,4.2,3,7.6,87,1030.3,0,7,0,220
1,07/11/1973 02:00,0,0,0,5.3,0,4.6,4,7.9,89,1029.8,0,7,0,220
2,07/11/1973 03:00,0,0,0,6,0,5,4,7.9,85,1029.3,0,9,0,230
3,07/11/1973 04:00,0,0,0,6.1,0,5,3,7.9,83,1028.8,0,9,0,230
4,07/11/1973 05:00,0,0,0,6.1,0,5,3,7.9,83,1028.1,0,9,0,220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451267,30/04/2025 20:00,0,0,0,17.8,0,13.6,9.9,12.2,60,1018.7,2,4,2,190
451268,30/04/2025 21:00,0,0,0,15.6,0,12.7,10.1,12.3,69,1018.9,2,3,2,180
451269,30/04/2025 22:00,0,0,0,15.1,0,12.5,10.1,12.4,72,1019.1,2,4,2,190
451270,30/04/2025 23:00,0,0,0,13.5,0,11.6,9.7,12,77,1018.9,2,4,2,200


In [7]:
df_2125_MACE_HEAD

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl,ind.3,wdsp,ind.4,wddir
0,13/08/2003 01:00,-1,,4,,4,,,,,,7,,7,
1,13/08/2003 02:00,-1,,4,,4,,,,,,7,,7,
2,13/08/2003 03:00,-1,,4,,4,,,,,,7,,7,
3,13/08/2003 04:00,-1,,4,,4,,,,,,7,,7,
4,13/08/2003 05:00,-1,,4,,4,,,,,,7,,7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190363,30/04/2025 20:00,0,0,0,14.2,0,11.7,9.4,11.8,73,1018.5,2,12,2,300
190364,30/04/2025 21:00,0,0,0,14.3,0,11.8,9.5,11.9,73,1019.1,2,12,2,320
190365,30/04/2025 22:00,0,0,0,13.4,0,11.4,9.5,11.9,77,1019,2,7,2,320
190366,30/04/2025 23:00,0,0,0,12.8,0,11.1,9.5,11.9,80,1019,2,4,2,340


In [8]:
df_532_DUBLIN_AIRPORT

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.3,wdsp,ind.4,wddir,ww,w,sun,vis,clht,clamt
0,01/01/1945 00:00,2,0.0,0,4.9,0,4.6,4.4,8.2,95,...,1,0,1,0,50,4,0.0,200,2,8
1,01/01/1945 01:00,3,0.0,0,5.1,0,4.9,4.4,8.5,97,...,1,0,1,0,45,4,0.0,200,2,8
2,01/01/1945 02:00,2,0.0,0,5.1,0,4.8,4.4,8.5,97,...,1,0,1,0,50,4,0.0,4800,4,8
3,01/01/1945 03:00,0,0.2,0,5.2,0,5.0,4.4,8.5,97,...,1,0,1,0,50,4,0.0,6000,4,8
4,01/01/1945 04:00,2,0.0,0,5.6,0,5.4,5.0,8.8,97,...,1,7,1,250,50,5,0.0,6000,4,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
704156,30/04/2025 20:00,0,0.0,0,13.7,0,10.5,7.1,10.1,65,...,2,6,2,140,2,11,0.1,35000,999,3
704157,30/04/2025 21:00,0,0.0,0,13.2,0,10.1,6.8,9.9,65,...,2,6,2,140,2,11,0.0,30000,999,3
704158,30/04/2025 22:00,0,0.0,0,8.7,0,7.6,6.3,9.6,85,...,2,4,2,140,2,11,0.0,30000,999,3
704159,30/04/2025 23:00,0,0.0,0,8.1,0,7.2,6.2,9.4,87,...,2,1,2,130,2,11,0.0,30000,999,1


In [None]:
import os, glob
import numpy as np
import pandas as pd

# Circular mean in degrees (row-wise on a DataFrame of degree columns)
def circular_mean_deg(df_rowwise_deg: pd.DataFrame) -> pd.Series:
    """Row-wise circular mean of angles given in degrees.

    Each row's non-missing angles are averaged as unit vectors (mean sine and
    mean cosine) and converted back with arctan2, so 350 and 10 average to
    north rather than 180.

    Args:
        df_rowwise_deg: frame whose columns hold angles in degrees; missing
            values (NaN / pd.NA) are ignored within each row.

    Returns:
        Series on the input's index with values in [0, 360); NaN for rows that
        have no valid angle.
    """
    # na_value lets nullable dtypes holding pd.NA convert cleanly to float.
    rad = np.deg2rad(df_rowwise_deg.to_numpy(dtype=float, na_value=np.nan))
    valid = ~np.isnan(rad)
    n_valid = valid.sum(axis=1)
    sin_sum = np.where(valid, np.sin(rad), 0.0).sum(axis=1)
    cos_sum = np.where(valid, np.cos(rad), 0.0).sum(axis=1)
    # Rows without any valid angle divide 0 by 0; that NaN is the intended
    # result, so the floating-point warning is silenced instead of letting
    # nanmean emit "Mean of empty slice" for each such row.
    with np.errstate(invalid='ignore', divide='ignore'):
        sin_mean = sin_sum / n_valid
        cos_mean = cos_sum / n_valid
    out = (np.rad2deg(np.arctan2(sin_mean, cos_mean)) + 360) % 360
    return pd.Series(out, index=df_rowwise_deg.index)

def to_numeric_inplace(df, cols):
    """Coerce the listed columns of `df` to numeric, modifying `df` itself.

    Names in `cols` that are not columns of `df` are ignored; values that
    cannot be parsed become NaN. Returns None.
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        df[name] = pd.to_numeric(df[name], errors='coerce')


In [None]:
# Root where the CSVs live
root = r"D:\Colab\Summer Project\Met Éireann\Try 2\raw_hourly\New"

# Load all *_hourly.csv (sorted so the load order is the same on every run/machine)
paths = sorted(glob.glob(os.path.join(root, "*_hourly.csv")))

stations = {}  # code -> df (raw)
for p in paths:
    # filename like "1875_ATHENRY_hourly.csv" or "775_SherkinIsland_hourly.csv";
    # the numeric prefix is the station code.
    fname = os.path.basename(p)
    code = int(fname.split('_')[0])
    df = pd.read_csv(p, low_memory=False)
    if 'date' not in df.columns:
        # if a different time col exists, adapt here
        raise ValueError(f"{fname} missing 'date' column")
    # standardize date: unparseable timestamps become NaT ...
    df['date'] = pd.to_datetime(df['date'], format="%d/%m/%Y %H:%M", errors='coerce')
    # ... and those rows are dropped before indexing, matching the download
    # path above. NaT index labels would otherwise break the time-based
    # interpolation and reindexing done in later cells.
    df = df.dropna(subset=['date']).set_index('date').sort_index()
    # keep only needed cols if present
    keep = [c for c in ['temp','wdsp','wddir'] if c in df.columns]
    stations[code] = df[keep].copy()

In [None]:
# Station coordinates, restricted to the stations that were actually loaded.
stations_df = pd.read_csv(r"C:\Users\m1029\Downloads\open_stations_loc2.csv")
stations_df['station_number'] = stations_df['station_number'].astype(int)
available = (
    stations_df[stations_df['station_number'].isin(list(stations))]
    .set_index('station_number')
)


def _nearest_along(others, column, val, keep='>'):
    """Index label of the row in `others` closest to `val` on one side along `column`.

    keep='>' considers only rows whose value is strictly greater than `val`;
    any other `keep` considers only rows strictly smaller. Returns None when
    no row lies on that side. Only the single coordinate is compared, not
    geographic distance.
    """
    # The comparison must be made on the column's values. Comparing the
    # column *name* with a number raises TypeError.
    offset = others[column] - val
    gap = offset[offset > 0] if keep == '>' else -offset[offset < 0]
    return gap.idxmin() if not gap.empty else None


# For each station: its nearest neighbour to the north, south, east and west
# (by latitude / longitude alone), skipping directions with no candidate.
neighbours_map = {}
for code, row in available.iterrows():
    lat0, lon0 = row['latitude'], row['longitude']
    others = available.drop(index=code)

    north = _nearest_along(others, 'latitude', lat0, '>')
    south = _nearest_along(others, 'latitude', lat0, '<')
    east  = _nearest_along(others, 'longitude', lon0, '>')
    west  = _nearest_along(others, 'longitude', lon0, '<')
    neighbours_map[code] = [c for c in [north, south, east, west] if c is not None]

In [None]:
def _numeric_copy(frame, cols=('temp', 'wdsp', 'wddir')):
    """Return a copy of `frame` with the listed columns (where present) coerced to numeric."""
    out = frame.copy()
    for c in cols:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors='coerce')
    return out


def _neighbour_values(ndfs, col, index):
    """One column per neighbour for `col`; an all-NaN column where a neighbour lacks it."""
    return pd.concat(
        [(n[col] if col in n.columns else pd.Series(index=index, dtype=float)) for n in ndfs],
        axis=1,
    )


def fill_from_neighbors(code, df, stations, neighbours_map):
    """Fill gaps in one station's temp / wdsp / wddir using time and neighbouring stations.

    Args:
        code: key of this station in `neighbours_map`.
        df: this station's frame, indexed by timestamp. It is not modified.
        stations: mapping of station code -> frame, used to look up neighbours.
        neighbours_map: mapping of station code -> list of neighbour codes;
            a missing entry means "no neighbours".

    Returns:
        A new, index-sorted frame in which, for each column that is present:
          * temp  - time interpolation, then the neighbours' mean, then ffill/bfill;
          * wdsp  - the neighbours' mean, then time interpolation, then ffill/bfill;
          * wddir - the neighbours' circular mean (when there are neighbours),
                    then ffill/bfill.
    """
    # Work on a numeric copy so the caller's frame is left untouched.
    df = _numeric_copy(df).sort_index()

    # Neighbour frames aligned to this station's timestamps. They are coerced
    # to numeric as well (they may not have been processed yet and can still
    # hold text), and repeated timestamps are collapsed to the first
    # occurrence because reindex refuses a source index with duplicates.
    ndfs = []
    for nc in neighbours_map.get(code, []):
        if nc not in stations:
            continue
        nb = _numeric_copy(stations[nc])
        nb = nb[~nb.index.duplicated(keep='first')]
        ndfs.append(nb.reindex(df.index))

    # temp: interpolate time, then neighbor mean
    if 'temp' in df.columns:
        df['temp'] = df['temp'].interpolate(method='time')
        if ndfs:
            temp_df = _neighbour_values(ndfs, 'temp', df.index)
            df['temp'] = df['temp'].fillna(temp_df.mean(axis=1, skipna=True))
        df['temp'] = df['temp'].ffill().bfill()

    # wdsp: neighbor mean, then time interpolate
    if 'wdsp' in df.columns:
        if ndfs:
            ws_df = _neighbour_values(ndfs, 'wdsp', df.index)
            df['wdsp'] = df['wdsp'].fillna(ws_df.mean(axis=1, skipna=True))
        df['wdsp'] = df['wdsp'].interpolate(method='time').ffill().bfill()

    # wddir: circular mean of neighbors, then ffill/bfill. The ffill/bfill now
    # also runs for a station without neighbours, as it does for temp and wdsp.
    if 'wddir' in df.columns:
        if ndfs:
            wd_df = _neighbour_values(ndfs, 'wddir', df.index)
            df['wddir'] = df['wddir'].fillna(circular_mean_deg(wd_df))
        df['wddir'] = df['wddir'].ffill().bfill()

    return df

# Apply to all. Every station is filled from a snapshot of the frames as they
# were before this cell ran, and the filled frames go into a new dict. Writing
# results back into `stations` while looping would let later stations borrow
# values from neighbours that had already been filled, making the outcome
# depend on dict order.
stations_unfilled = dict(stations)
stations = {
    code: fill_from_neighbors(code, df, stations_unfilled, neighbours_map)
    for code, df in stations_unfilled.items()
}

In [None]:
def resample_15min(df, interpolate=False):
    """Put a station frame onto a regular 15-minute grid.

    Args:
        df: frame with a DatetimeIndex and any of the columns temp, wdsp, wddir.
        interpolate: when False (default) a 15-minute slot without any sample
            stays NaN, so purely hourly input leaves the slots between
            observations empty. When True those slots are filled by time
            interpolation (wddir via its sine/cosine components).

    Returns:
        Frame indexed from the first to the last 15-minute mark covered by
        `df`, with temp / wdsp as bin means and wddir as the circular mean of
        the samples in each bin. Columns keep the order scalars-then-wddir.
    """
    # Split direction from scalars
    scalars = [c for c in df.columns if c in ['temp', 'wdsp']]
    has_dir = 'wddir' in df.columns

    if df.empty:
        # No timestamps to build a grid from.
        return pd.DataFrame(columns=scalars + (['wddir'] if has_dir else []),
                            index=pd.DatetimeIndex([]))

    # '15min' is the supported spelling of the deprecated 'T' alias.
    grid = pd.date_range(df.index.min().ceil('15min'),
                         df.index.max().floor('15min'),
                         freq='15min')

    def _binned(series):
        # Mean per 15-minute bin, aligned to the output grid.
        binned = series.resample('15min').mean().reindex(grid)
        return binned.interpolate(method='time') if interpolate else binned

    pieces = {col: _binned(df[col]) for col in scalars}

    if has_dir:
        # Circular mean per bin: average the unit-vector components of all
        # samples that fall in the bin, then convert back to an angle.
        rad = np.deg2rad(pd.to_numeric(df['wddir'], errors='coerce'))
        sin_b = _binned(np.sin(rad))
        cos_b = _binned(np.cos(rad))
        pieces['wddir'] = (np.rad2deg(np.arctan2(sin_b, cos_b)) + 360) % 360

    return pd.DataFrame(pieces, index=grid)

# Resample every station onto the 15-minute grid, keyed by station code.
stations_15T = {}
for code, df in stations.items():
    stations_15T[code] = resample_15min(df)

In [None]:
# Region clusters: maps a region label to the station tags averaged into it.
# Each tag is "<station number>_<NAME>"; only the numeric prefix is used to look
# the station up (see code_from_tag below).
# NOTE(review): 5237_FINNER appears in the local file list earlier in this
# notebook but is not assigned to any region here — confirm this is intentional.
clusters = {
    "NW": ["1175_NEWPORT", "1275_MARKREE", "2175_CLAREMORRIS",
           "2375_BELMULLET", "2727_CLAREMORRIS", "4935_KNOCK_AIRPORT"],
    "NE": ["532_DUBLIN_AIRPORT", "675_BALLYHAISE", "875_MULLINGAR",
           "1375_DUNSANY", "1575_MALIN_HEAD", "175_PHOENIX_PARK",
           "1975_MT_DILLON", "2075_FINNER"],
    "SW": ["275_MACE_HEAD", "518_SHANNON_AIRPORT", "775_SherkinIsland",
           "1875_ATHENRY", "2125_MACE_HEAD", "3402_SHERKIN_ISLAND",
           "3904_CORK_AIRPORT", "575_MOORE_PARK", "2275_VALENTIA_OBSERVATORY"],
    "SE": ["375_OAK_PARK", "1004_ROCHES_POINT", "1075_ROCHES_POINT",
           "1475_GURTEEN", "3723_CASEMENT"]
}

# Helper to resolve "2175_CLAREMORRIS" -> 2175 code key
def code_from_tag(tag):
    """Return the integer station code that precedes the first underscore in `tag`."""
    prefix, _, _ = tag.partition('_')
    return int(prefix)

# Build common time index across all 15T frames (inner join)
if not stations_15T:
    # Without this guard the failure would surface as an AttributeError on None.
    raise ValueError("stations_15T is empty; nothing to aggregate.")

common_index = None
for station_frame in stations_15T.values():
    common_index = (station_frame.index if common_index is None
                    else common_index.intersection(station_frame.index))
common_index = common_index.sort_values()

# Cluster aggregation: one <region>_<field> column per region and field.
cols = ['temp','wdsp','wddir']
region_15T = pd.DataFrame(index=common_index)

for region, station_tags in clusters.items():
    # Only stations that were actually loaded take part; skip empty regions.
    codes = [c for c in [code_from_tag(t) for t in station_tags] if c in stations_15T]
    if not codes:
        continue
    # Collect aligned frames
    aligned = [stations_15T[c].reindex(common_index) for c in codes]

    # temp & wdsp: plain mean over the stations that have the column
    for field in ('temp', 'wdsp'):
        members = [a[field] for a in aligned if field in a.columns]
        if members:
            region_15T[f'{region}_{field}'] = pd.concat(members, axis=1).mean(axis=1)

    # wddir: circular mean
    wd_members = [a['wddir'] for a in aligned if 'wddir' in a.columns]
    if wd_members:
        region_15T[f'{region}_wddir'] = circular_mean_deg(pd.concat(wd_members, axis=1))

In [None]:
# Analysis window (inclusive at both ends).
start = pd.Timestamp("2014-01-01 00:00:00")
end   = pd.Timestamp("2025-12-31 23:59:59")

in_window = (region_15T.index >= start) & (region_15T.index <= end)
# Round every column to 5 decimal places.
region_15T = region_15T.loc[in_window].round(5)

# Quick integrity checks
# pd.infer_freq raises ValueError when given fewer than three timestamps, so a
# very short (or empty) window is reported as "irregular" instead of crashing.
freq_sample = region_15T.index[:100]
inferred_freq = pd.infer_freq(freq_sample) if len(freq_sample) >= 3 else None
print("Index freq approx:", inferred_freq or "irregular")
print("Any NaNs?", region_15T.isna().any().any())

In [None]:
# Optional: per-column NaN counts, largest first, listing only columns that have gaps
nan_counts = region_15T.isna().sum()
columns_with_gaps = nan_counts[nan_counts>0]
print(columns_with_gaps.sort_values(ascending=False))

In [None]:
# Save the regional table; the timestamp index is written under the header 'date'.
out_csv = r"D:\Colab\Summer Project\Met Éireann\met_eireann_regions_15min.csv"
region_15T.rename_axis('date').to_csv(out_csv)