# UK Marine Buoy Data Preparation (Met Office DataPoint)
This notebook mirrors the Irish buoy workflow, using `UKBuoyData` to verify that the schema and fetch logic match.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

import xgboost as xgb
from sklearn.metrics import mean_squared_error
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')
sys.path.append('../src')
from uk_buoy_data import UKBuoyData


In [None]:
# Create the UK buoy client and preview the station catalogue it reports
# (id, name, lat, lon) as a DataFrame.
uk = UKBuoyData()
stations = uk.list_stations()
stations_frame = pd.DataFrame(stations)
stations_frame.head()


In [None]:
# Fetch observations for one station, requesting 365 days back.
# Use the client's configured station when it has one; otherwise fall back
# to the first entry of the station listing.
target_station = uk.station_id
if not target_station:
    target_station = stations[0]['id']
uk.station_id = target_station

df_raw = uk.fetch_data(days_back=365)

# Keep only the first row for each repeated index label, then drop the
# station identifier column so only measurement columns remain.
repeated_label = df_raw.index.duplicated(keep='first')
df_raw = df_raw.loc[~repeated_label].drop(columns=['station_id'])
df_raw.head()


In [None]:
# Schema check against the Irish output: the fetched columns should equal
# the client's met features followed by its wave features, in that order.
expected_cols = uk.met_features + uk.wave_features
actual_cols = list(df_raw.columns)
display(df_raw.columns)
print('Matches expected feature names:', actual_cols == expected_cols)


In [None]:
# Reduce the raw frame to the single modelling target (wind speed) and
# make sure it is indexed by a datetime index named 'Datetime'.
target_col = 'WindSpeed (knots)'
df = df_raw.loc[:, [target_col]].copy()
df.index = pd.to_datetime(df.index).rename('Datetime')
df.tail()


### Train / Test Split


In [None]:
# Chronological hold-out: everything before SPLIT_DATE is training data and
# the window [SPLIT_DATE, TEST_END) is the test set.
# The boundaries are defined once so the filter and the plot marker cannot
# drift apart. They stay as ISO strings for index filtering so pandas parses
# them against the index itself.
SPLIT_DATE = '2025-01-01'
TEST_END = '2025-01-08'

train = df.loc[df.index < SPLIT_DATE]
test = df.loc[(df.index >= SPLIT_DATE) & (df.index < TEST_END)]

fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split (WindSpeed)')
test.plot(ax=ax, label='Test Set')
# Pass an explicit Timestamp rather than a 'MM-DD-YYYY' string, so the marker
# position does not depend on implicit string-to-date conversion by the axis.
ax.axvline(pd.Timestamp(SPLIT_DATE), color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()


### Feature Creation


In [None]:
def create_features(df):
    """Return a copy of ``df`` with calendar features derived from its index.

    Args:
        df: DataFrame indexed by a ``DatetimeIndex``. It is not modified.

    Returns:
        A new DataFrame holding the original columns plus:
        ``hour``, ``quarter``, ``month``, ``dayofyear``, ``weekofyear``
        (ISO week number), sine/cosine encodings of hour, month and day of
        year, and ``season`` (1 for Dec-Feb, 2 for Mar-May, 3 for Jun-Aug,
        4 for Sep-Nov).
    """
    df = df.copy()
    df['hour'] = df.index.hour
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['dayofyear'] = df.index.dayofyear

    # isocalendar() yields the pandas nullable UInt32 extension dtype, which
    # would be the only non-NumPy column here and is not accepted by every
    # XGBoost release. Store a plain NumPy dtype instead: int64 normally,
    # float64 (NaN) when the index contains NaT so the cast cannot fail.
    iso_week = df.index.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'
    df['weekofyear'] = iso_week.astype(week_dtype)

    # Cyclical encoding so the model sees 23:00 next to 00:00, December next
    # to January, and so on.
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365.25)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365.25)

    # month % 12 maps December to 0 so Dec/Jan/Feb share season 1.
    df['season'] = df['month'] % 12 // 3 + 1
    return df

# Add the calendar features to both partitions.
train, test = (create_features(part) for part in (train, test))

# Model inputs, in the column order handed to the regressor.
FEATURES = [
    'dayofyear',
    'hour',
    'quarter',
    'month',
    'weekofyear',
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
    'dayofyear_sin',
    'dayofyear_cos',
    'season',
]
TARGET = target_col

# Split each partition into its feature matrix and target vector.
X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]


### Model


In [None]:
# Gradient-boosted regressor for the wind-speed target. Training stops early
# once the monitored evaluation metric has not improved for 25 rounds.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    gamma=0.1,
    early_stopping_rounds=25,
    random_state=42,
    n_jobs=-1
)

# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so
# the test week is steering the number of trees and the reported test RMSE
# is optimistic. Consider carving a separate validation window out of the
# training period and keeping the test week fully held out.
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True
)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# mean_squared_error is deprecated and removed in newer scikit-learn
# releases; this form works on every version.
train_rmse = float(np.sqrt(mean_squared_error(y_train, y_pred_train)))
test_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)


### Feature Importance


In [None]:
# Tabulate the fitted model's per-feature importance scores, labelled by the
# feature names it was trained on, and plot them in ascending order.
importance_scores = pd.Series(
    model.feature_importances_,
    index=model.feature_names_in_,
    name='importance',
)
fi = importance_scores.to_frame()
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()


### Forecast Test


In [None]:
# Attach the model's test-set predictions to the full series; rows outside
# the test window get NaN in the 'prediction' column.
test = test.copy()
test['prediction'] = model.predict(X_test)
df_pred = df.join(test[['prediction']], how='left')

# Plot the sample week: observed values as a line, predictions as dots.
in_sample_week = (df_pred.index >= '2025-01-01') & (df_pred.index < '2025-01-08')
df_week = df_pred.loc[in_sample_week]
ax = df_week[[target_col]].plot(
    figsize=(15, 5),
    title='Wind Speed Truth vs Prediction (sample week)',
)
df_week['prediction'].plot(ax=ax, style='.')
plt.legend(['Truth Data', 'Predictions'])
plt.show()
