In [1]:
import requests
import datetime
import time
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [4]:
#!git clone https://github.com/MBWestcott/gas-forecast.git

# 2. Change into the repo directory
%cd /content/gas-forecast/notebooks



/content/gas-forecast/notebooks


In [5]:
def download_csv(url, output_file):
    """
    Downloads a CSV file from the given URL and saves it to the specified file.

    :param url: URL to download the CSV data from.
    :param output_file: Path to the local file where the CSV will be saved.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Ensure we notice bad responses

        # Write the content (CSV data) to a file in binary mode
        with open(output_file, 'wb') as f:
            f.write(response.content)

        print(f"CSV file has been successfully downloaded and saved as '{output_file}'.")

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")

In [11]:
raw_data_folder = "../data/raw/"
# download the raw data from national gas data portal
def download_raw_data():
    with open("../PUB ids.txt") as f:
        pubIds = f.read()
        pubIds = pubIds.replace("\n", ",").strip()

    earliest = datetime.date(2020,4,1) # Download data going back 5 years
    # Loop from week 0 (today) to week 13 (13 weeks ago)
    download_from = datetime.date.today().replace(day=1) # start first download on first day of current month
    download_to = datetime.date.today() # end first download on today's date
    while(download_from > earliest):

        # Format the date in yyyy-mm-dd format
        formatted_from = download_from.strftime("%Y-%m-%d")
        formatted_to = download_to.strftime("%Y-%m-%d")

        csv_url = f"https://data.nationalgas.com/api/find-gas-data-download?applicableFor=Y&dateFrom={formatted_from}&dateTo={formatted_to}&dateType=GASDAY&latestFlag=Y&ids={pubIds}&type=CSV"
        month_format = download_from.strftime("%Y-%m")
        output_filename = f"{raw_data_folder}{month_format}.csv"

        download_csv(csv_url, output_filename)
        time.sleep(3) # brief courtesy sleep
        download_to = download_from - datetime.timedelta(days=1) # next download should go up to the day before the previous download start date
        download_from = download_to.replace(day=1) # next download should start on the first day of the month

In [13]:
label_cols = ["SAP", "SMPBuy", "SMPSell"]

def pivot(df, cols):

    #only keep the values we are interested in
    mask = df["Data Item"].isin(cols)

    df_filtered = df[mask]

    # if there are duplicates for the field and gas day, take the latest
    df_latest = (
        df_filtered
        .sort_values("Applicable At")
        .groupby(["Gas Day", "Data Item"])
        .last()  # this takes the row with the highest (i.e. latest) "Applicable At" per group
        .reset_index()
    )

    # pivot to get 1 row per gas day
    df_latest = df_latest.pivot(index="Gas Day", columns="Data Item", values="Value").reset_index()

    # Drop 1 column that accounts for most of the NaNs
    df_latest.drop(columns=["Composite Weather Variable - Actual"], inplace=True)

    return df_latest

def load_data():
    #Read raw CSVs
    pathlist = list(Path(raw_data_folder).rglob('*.csv'))
    file_count = len(pathlist)
    dfs = []
    files_done = 0
    for path_obj in pathlist:
        path = str(path_obj)

        df = pd.read_csv(path,
            parse_dates=["Applicable At", "Applicable For", "Generated Time"],
            dayfirst=True)

        df.rename(columns={'Applicable For': 'Gas Day'}, inplace=True)
        df['Gas Day'] = pd.to_datetime(df['Gas Day'], dayfirst=True)
        # daily summary columns:

        daily_cols = df["Data Item"].unique()
        # print(daily_cols)
        # Get price and demand columns, to use as tommorow's ground truth, and with 1-3 days lag

        #label_cols = ["SAP, Actual Day", "SMP Buy, Actual Day", "SMP Sell, Actual Day", "Demand Actual, NTS, D+1"]

        #df_labels = pivot(df, label_cols)


        df_daily = pivot(df, daily_cols)
        dfs.append(df_daily)

        files_done += 1
        if files_done % 10 == 0:
            print(f"Processed {files_done} of {file_count} raw files")

    df = pd.concat(dfs)

    #Rename the columns that are going to be reused for ground truth and time series
    df.rename(columns={"SAP, Actual Day": 'SAP', "SMP Buy, Actual Day": 'SMPBuy', "SMP Sell, Actual Day": 'SMPSell'}, inplace=True)

    # add lagged features
    lag_days = 5
    for i in range(1, lag_days+1):
        for col in label_cols:
            df[f"{col} D-{i}"] = df[col].shift(i)


    # add rolling averages and stds
    for col in label_cols:
        for window in [7, 30]:
            df[f'{col} D{window} roll mean'] = (
                df[col]
                .shift(1)               # so today's feature doesn't include today's price
                .rolling(window=window, min_periods=1)  # you can require fewer points if you like
                .mean()
                )
            df[f'{col} D{window} roll std'] = (
                df[col]
                .shift(1)               # so today's feature doesn't include today's price
                .rolling(window=window, min_periods=1)  # you can require fewer points if you like
                .std()
            )

    # add day of week
    df['Day of Week'] = df['Gas Day'].dt.weekday

    # cyclic encoding for seasonality
    df['Day of Year'] = df['Gas Day'].dt.dayofyear
    df['sin_DoY'] = np.sin(2 * np.pi * df['Day of Year'] / 365)
    df['cos_DoY'] = np.cos(2 * np.pi * df['Day of Year'] / 365)

    # Add labels for next day's actuals
    for col in label_cols:
        df[f"Next Day {col}"] = df[col].shift(-1)

    # There should be very few rows that have any NaNs so we can drop any that do
    df.dropna(inplace=True)
    return df

def split_train_test(df, split_date, discard_before_date):
    """
    Splits the DataFrame into training set (gas days before split date) and test set (ga days fron the split date on)

    :param df: The DataFrame to split.
    :param split_date: The date to split the DataFrame on.
    :param discard_before_date: Discard anything before this date. Added to exclude time of Covid lockdowns.
    :return: Tuple of (training set, testing set).
    """

    # Split the DataFrame into training and testing sets
    train_df = df[df['Gas Day'].between(discard_before_date, split_date, inclusive = "neither")]
    test_df = df[df['Gas Day'] >= split_date]

    return train_df, test_df

def split_with_test_half_of_last_half(df, discard_before_date):
    df2 = df[df['Gas Day'] >= discard_before_date]
    mid_date = df2['Gas Day'].mean()
    first_half = df2[df2['Gas Day'] < mid_date]
    second_half = df2[df2['Gas Day'] >= mid_date]
    #Use all the earlier half, and half the later half, to train
    # Use the other half of the later half to test
    train_df, test_df = train_test_split(second_half, test_size=0.5, shuffle=True)
    train_df = pd.concat([first_half, train_df])
    return train_df, test_df

def n_train_n_test(df, n_train, n_test, discard_before_date):
    #df.to_csv("..\\data\\processed\\all.csv", index=False)
    df = df[df['Gas Day'] >= discard_before_date]
    # Split the DataFrame into training and testing sets
    train_df, test_df = train_test_split(df, test_size=n_test, train_size=n_train, shuffle=True)
    #train_df.to_csv("..\\data\\processed\\train.csv", index=False)
    #test_df.to_csv("..\\data\\processed\\test.csv", index=False)
    return train_df, test_df

def get_X(df):
    ys = ["Next Day " + col for col in label_cols]
    df2 = df.drop(columns=ys)
    df2.drop(columns=["Gas Day"], inplace=True)
    # experimentally - just include the price time series columns
    #for col in df2.columns.tolist():      # iterate over a copy of the column list
    #    if not is_price_column(col):             # if the substring isn’t found
    #        df2.drop(columns=col, inplace=True)
    #df2 = df2[label_cols]
    #df2 = df2[["SAP"]]
    return df2

def is_price_column(column_name):
    if "SMP" in column_name or "SAP" in column_name:
        return True
    return False

def get_y(df, col):
    return df["Next Day " + col]

In [14]:
# Root mean squared error - penalises larger errors more than smaller ones
def get_rmse(actuals, predictions):
    rmse =  np.sqrt(np.mean((predictions - actuals)**2))
    return round(rmse, 2)

#Mean absolute percentage error
def get_mape(actuals, predictions):
    mape = np.mean(np.abs((predictions - actuals) / actuals)) * 100
    return round(mape, 2)


def print_model_stats(model, X):

    # 1. Coefficients and intercept
    if hasattr(model, "coef_"):
        #print("Coefficients:", model.coef_)      # array of shape (n_features,)
        cdf = pd.DataFrame(model.coef_, X.columns, columns=['Coefficients'])
        cdf = cdf.sort_values(by='Coefficients', ascending=False)
        print(cdf)
    if hasattr(model, "intercept_"):
        print("Intercept:", model.intercept_)    # scalar (or array if multi-output)

    # 2. Model parameters
    print("Parameters:", model.get_params())

    # 3. Data‐related attributes
    #print("Number of features seen during fit:", model.n_features_in_)
    #if hasattr(model, "feature_names_in_"):
    #    print("Feature names:", model.feature_names_in_)

    # 4. Linear algebra internals (rarely needed)
    if hasattr(model, "rank_"):
        print("Rank of design matrix:", model.rank_)
    if hasattr(model, "singular_"):
        print("Singular values of X:", model.singular_)

In [17]:
#load data
#download_raw_data() # uncomment this to download the data again
df = load_data()

# split on date, or random proportions
#train, test = split_train_test(df, '2024-10-01', '2021-03-01')
#train, test = n_train_n_test(df, n_train=250, n_test=50, discard_before_date='2021-03-01')
train, test = n_train_n_test(df, n_train=0.7, n_test=0.3, discard_before_date='2019-03-01')
#train = train[train['Gas Day'] > '2021-03-01']
#test = test[test['Gas Day'] > '2021-03-01']
#train, test = split_with_test_half_of_last_half(df, '2021-03-01')
X_train = get_X(train)
X_test = get_X(test)



Processed 10 of 60 raw files
Processed 20 of 60 raw files
Processed 30 of 60 raw files
Processed 40 of 60 raw files
Processed 50 of 60 raw files
Processed 60 of 60 raw files


In [18]:
# Use the previous day's actual as a naive predictor

def test_model(model, X, y):
    y_pred = model.predict(X)
    rmse = get_rmse(y, y_pred)
    return rmse

def train_and_test_model(model, df_train, df_test, col):
    X_train = get_X(df_train)
    X_test = get_X(df_test)
    y_train = get_y(df_train, col)
    y_test = get_y(df_test, col)
    #scaler = StandardScaler()
    #X_train_scaled = scaler.fit_transform(X_train)
    #X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = X_train
    X_test_scaled = X_test

    model.fit(X_train_scaled, y_train)

    rmse_train = test_model(model, X_train_scaled, y_train)
    rmse_test = test_model(model, X_test_scaled, y_test)

    return model, rmse_train, rmse_test

def naive_predictions(df_train, df_test, col):
    naive_predictions_train = df_train[col]
    actuals_train = df_train[f"Next Day {col}"]
    #mape_naive_train = get_mape(actuals_train, naive_predictions_train)
    #print(f"MAPE train (naive predictor) for {col}: {mape_naive_train}")
    rmse_naive_train = get_rmse(actuals_train, naive_predictions_train)
    #print(f"RMSE train (naive predictor) for {col}: {rmse_naive_train}")

    naive_predictions_test = df_test[col]
    actuals_test = df_test[f"Next Day {col}"]
    #mape_naive_test = get_mape(actuals_test, naive_predictions_test)
    #print(f"MAPE test (naive predictor) for {col}: {mape_naive_test}")
    rmse_naive_test = get_rmse(actuals_test, naive_predictions_test)
    #print(f"RMSE test (naive predictor) for {col}: {rmse_naive_test}")
    return rmse_naive_train, rmse_naive_test

def print_results(case, rmse_naive, rmse_model):
    headline = "Worse" if rmse_naive <= rmse_model else "Better"
    print(f"{case} - {headline} - model {rmse_model} v naive {rmse_naive}")


In [19]:
print ("Linear regression model:")
for col in label_cols:
    # Instantiate linear regression model.
    model = LinearRegression()

    # Train and test it
    model, rmse_train, rmse_test = train_and_test_model(model, train, test, col)

    # Print model details
    X_train = get_X(train)
    #print_model_stats(model, X_train)

    # Get naive prediction stats for comparison
    rmse_naive_train, rmse_naive_test = naive_predictions(train, test, col)

    print_results(col + " train", rmse_naive_train, rmse_train)
    print_results(col + " test", rmse_naive_test, rmse_test)



Linear regression model:
SAP train - Better - model 0.7 v naive 0.78
SAP test - Worse - model 0.78 v naive 0.78
SMPBuy train - Better - model 0.76 v naive 0.85
SMPBuy test - Worse - model 0.86 v naive 0.83
SMPSell train - Better - model 0.8 v naive 0.86
SMPSell test - Worse - model 0.86 v naive 0.83


In [20]:
from sklearn.ensemble import RandomForestRegressor
print ("Random forest model:")
print("Train rows:", train.shape[0])
print("Test rows:", test.shape[0])
for col in label_cols:
    # Instantiate linear regression model.
    model = RandomForestRegressor()

    # Train and test it
    model, rmse_train, rmse_test = train_and_test_model(model, train, test, col)

    # Print model details
    X_train = get_X(train)
    print_model_stats(model, X_train)

    # Get naive prediction stats for comparison
    rmse_naive_train, rmse_naive_test = naive_predictions(train, test, col)

    print_results(col + " train", rmse_naive_train, rmse_train)
    print_results(col + " test", rmse_naive_test, rmse_test)

Random forest model:
Train rows: 1251
Test rows: 537
Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
SAP train - Better - model 0.3 v naive 0.78
SAP test - Worse - model 0.79 v naive 0.78
Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
SMPBuy train - Better - model 0.32 v

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

param_grid = {
    'n_estimators':     [100, 200, 500],
    'max_depth':        [None, 10, 20],
    'min_samples_split':[2, 5, 10],
    'min_samples_leaf': [2],
    'max_features':     ['sqrt', 'log2', 0.3],
    'ccp_alpha':        [0.0, 0.001, 0.01]
}
#param_grid = {
#    'n_estimators':     [100, 200],
#    'max_depth':        [None, 10, 20],
#    'min_samples_split':[2, 5],
#    'min_samples_leaf': [1, 2],
#    'max_features':     ['sqrt'],
#    'ccp_alpha':        [0.001, 0.01]
#}
for col in ["SAP"]: #label_cols:

    rf = RandomForestRegressor(
        random_state=42,
        n_jobs=-1,
        oob_score=True   # optional: get out‑of‑bag score on your train set
    )

    grid = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,                             # 5‑fold CV on X_train/y_train
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=3
    )

    X_train = get_X(train)
    y_train = get_y(train, col)

    grid.fit(X_train, y_train)

    print("Best hyperparameters:", grid.best_params_)
    print("Best CV RMSE on train set: {:.4f}".format(-grid.best_score_))


    X_test = get_X(test)
    y_test = get_y(test, col)
    # -----------------------------------------------------------------------------
    # 5. Evaluate the best model on the TEST set
    # -----------------------------------------------------------------------------
    best_model = grid.best_estimator_

    rmse_test = test_model(best_model, X_test, y_test)
    rmse_naive_train, rmse_naive_test = naive_predictions(train, test, col)

    print_results(col + " test", rmse_naive_test, rmse_test)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


KeyboardInterrupt: 

In [None]:
# 1. Install the TCN layer
#!pip install keras-tcn

import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tcn import TCN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 2. Suppose you already have a DataFrame `df` with:
#    - 'Gas Day' datetime index or column
#    - features (lags, rolling stats, calendar encodings, …)
#    - target column 'price'

# If 'Gas Day' is a column:
# df['Gas Day'] = pd.to_datetime(df['Gas Day'])
# df.set_index('Gas Day', inplace=True)

# 3. Prepare sequences for TCN
def make_sequences(X, y, seq_len):
    Xs, ys = [], []
    for i in range(len(X) - seq_len):
        Xs.append(X[i : i + seq_len].values)
        ys.append(y[i + seq_len])
    return np.stack(Xs), np.array(ys)

# parameters
SEQ_LEN    = 30          # e.g. use past 30 days to predict next-day price
TEST_SIZE  = 0.2

# split features/target
y = df['price']
X = df.drop(columns=['price'])

# scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# build sequences
X_seq, y_seq = make_sequences(X_scaled, y, SEQ_LEN)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_seq, test_size=TEST_SIZE, shuffle=False
)

# 4. Build a simple TCN model
n_features = X_train.shape[2]

inputs = Input(shape=(SEQ_LEN, n_features))
# TCN defaults: 64 filters, kernel_size=3, 8 stacks with exponentially increasing dilation
tcn_layer = TCN(return_sequences=False)(inputs)
output    = Dense(1)(tcn_layer)

model = Model(inputs, output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

# 5. Train
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=16,
    callbacks=[
        # optional: early stopping
        # tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
    ]
)

# 6. Evaluate & predict
loss, mae = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
print(f"Test MAE: {mae:.4f}")

KeyError: 'price'