In [47]:
import requests
import datetime
import time
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [6]:
def download_csv(url, output_file):
    """
    Downloads a CSV file from the given URL and saves it to the specified file.
    
    :param url: URL to download the CSV data from.
    :param output_file: Path to the local file where the CSV will be saved.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Ensure we notice bad responses

        # Write the content (CSV data) to a file in binary mode
        with open(output_file, 'wb') as f:
            f.write(response.content)
        
        print(f"CSV file has been successfully downloaded and saved as '{output_file}'.")
        
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")

In [7]:
raw_data_folder = "..\\data\\raw\\"
# download the raw data from national gas data portal
def download_raw_data():
    with open("..\\PUB ids.txt") as f:
        pubIds = f.read()
        pubIds = pubIds.replace("\n", ",").strip() 
    
    earliest = datetime.date(2020,4,1) # Download data going back 5 years
    # Loop from week 0 (today) to week 13 (13 weeks ago)
    download_from = datetime.date.today().replace(day=1) # start first download on first day of current month
    download_to = datetime.date.today() # end first download on today's date
    while(download_from > earliest):
        
        # Format the date in yyyy-mm-dd format
        formatted_from = download_from.strftime("%Y-%m-%d")
        formatted_to = download_to.strftime("%Y-%m-%d")
            
        csv_url = f"https://data.nationalgas.com/api/find-gas-data-download?applicableFor=Y&dateFrom={formatted_from}&dateTo={formatted_to}&dateType=GASDAY&latestFlag=Y&ids={pubIds}&type=CSV"
        month_format = download_from.strftime("%Y-%m")
        output_filename = f"{raw_data_folder}{month_format}.csv"

        download_csv(csv_url, output_filename)
        time.sleep(3) # brief courtesy sleep
        download_to = download_from - datetime.timedelta(days=1) # next download should go up to the day before the previous download start date
        download_from = download_to.replace(day=1) # next download should start on the first day of the month

In [80]:
label_cols = ["SAP", "SMPBuy", "SMPSell", "Demand"]

def pivot(df, cols):

    #only keep the values we are interested in
    mask = df["Data Item"].isin(cols)

    df_filtered = df[mask]  

    # if there are duplicates for the field and gas day, take the latest
    df_latest = (
        df_filtered
        .sort_values("Applicable At")
        .groupby(["Gas Day", "Data Item"])
        .last()  # this takes the row with the highest (i.e. latest) "Applicable At" per group
        .reset_index()
    )
    
    # pivot to get 1 row per gas day
    df_latest = df_latest.pivot(index="Gas Day", columns="Data Item", values="Value").reset_index()
    
    # Drop 1 column that accounts for most of the NaNs
    df_latest.drop(columns=["Composite Weather Variable - Actual"], inplace=True)

    return df_latest

def load_data():
    #Read raw CSVs
    pathlist = list(Path(raw_data_folder).rglob('*.csv'))
    file_count = len(pathlist)
    dfs = []
    files_done = 0
    for path_obj in pathlist:
        path = str(path_obj)   
        
        df = pd.read_csv(path,
            parse_dates=["Applicable At", "Applicable For", "Generated Time"],
            dayfirst=True)

        df.rename(columns={'Applicable For': 'Gas Day'}, inplace=True)
        df['Gas Day'] = pd.to_datetime(df['Gas Day'], dayfirst=True)
        # daily summary columns: 

        daily_cols = df["Data Item"].unique()
        # print(daily_cols)
        # Get price and demand columns, to use as tommorow's ground truth, and with 1-3 days lag

        #label_cols = ["SAP, Actual Day", "SMP Buy, Actual Day", "SMP Sell, Actual Day", "Demand Actual, NTS, D+1"]

        #df_labels = pivot(df, label_cols)
        

        df_daily = pivot(df, daily_cols)
        dfs.append(df_daily)
        
        files_done += 1
        if files_done % 10 == 0:
            print(f"Processed {files_done} of {file_count} raw files")

    df = pd.concat(dfs)

    #Rename the columns that are going to be reused for ground truth and time series
    df.rename(columns={"SAP, Actual Day": 'SAP', "SMP Buy, Actual Day": 'SMPBuy', "SMP Sell, Actual Day": 'SMPSell', "Demand Actual, NTS, D+1":"Demand"}, inplace=True)
    
    # add lagged features
    lag_days = 5
    for i in range(1, lag_days+1):
        for col in label_cols:
            df[f"{col} D-{i}"] = df[col].shift(i)


    # add rolling averages and stds
    for col in label_cols:
        for window in [7, 30]:
            df[f'{col} D{window} roll mean'] = (
                df[col]
                .shift(1)               # so today's feature doesn't include today's price
                .rolling(window=window, min_periods=1)  # you can require fewer points if you like
                .mean()
                )
            df[f'{col} D{window} roll std'] = (
                df[col]
                .shift(1)               # so today's feature doesn't include today's price
                .rolling(window=window, min_periods=1)  # you can require fewer points if you like
                .std()
            )

    # add day of week
    df['Day of Week'] = df['Gas Day'].dt.weekday
    
    # Add labels for next day's actuals
    for col in label_cols:
        df[f"Next Day {col}"] = df[col].shift(-1)

    # There should be very few rows that have any NaNs so we can drop any that do
    df.dropna(inplace=True)
    return df

def split_train_test(df, split_date):
    """
    Splits the DataFrame into training set (gas days before split date) and test set (ga days fron the split date on)
    
    :param df: The DataFrame to split.
    :param split_date: The date to split the DataFrame on.
    :return: Tuple of (training set, testing set).
    """

    # Split the DataFrame into training and testing sets
    train_df = df[df['Gas Day'] < split_date]
    test_df = df[df['Gas Day'] >= split_date]

    return train_df, test_df

def n_train_n_test(df, n_train, n_test):
    df.to_csv("..\\data\\processed\\all.csv", index=False)
    # Split the DataFrame into training and testing sets
    train_df, test_df = train_test_split(df, test_size=n_test, train_size=n_train, shuffle=False)
    train_df.to_csv("..\\data\\processed\\train.csv", index=False)
    test_df.to_csv("..\\data\\processed\\test.csv", index=False)
    return train_df, test_df

def get_X(df):
    ys = ["Next Day " + col for col in label_cols]
    df2 = df.drop(columns=ys)
    df2.drop(columns=["Gas Day"], inplace=True)
    # experimentally - just include the price time series columns
    #for col in df2.columns.tolist():      # iterate over a copy of the column list
    #    if not is_price_column(col):             # if the substring isn’t found
    #        df2.drop(columns=col, inplace=True)
    #df2 = df2[label_cols]
    #df2 = df2[["SAP"]]
    return df2    

def is_price_column(column_name):
    if "SMP" in column_name or "SAP" in column_name:
        return True
    return False

def get_y(df, col):
    return df["Next Day " + col]

In [81]:
# Root mean squared error - penalises larger errors more than smaller ones
def get_rmse(actuals, predictions):    
    rmse =  np.sqrt(np.mean((predictions - actuals)**2))
    return round(rmse, 2)

#Mean absolute percentage error
def get_mape(actuals, predictions):
    mape = np.mean(np.abs((predictions - actuals) / actuals)) * 100
    return round(mape, 2)


def print_model_stats(model, X):

    cdf = pd.DataFrame(model.coef_, X.columns, columns=['Coefficients'])
    print(cdf)
    # 1. Coefficients and intercept
    print("Coefficients:", model.coef_)      # array of shape (n_features,)
    print("Intercept:", model.intercept_)    # scalar (or array if multi-output)

    # 2. Model parameters
    print("Parameters:", model.get_params())

    # 3. Data‐related attributes
    print("Number of features seen during fit:", model.n_features_in_)
    if hasattr(model, "feature_names_in_"):
        print("Feature names:", model.feature_names_in_)

    # 4. Linear algebra internals (rarely needed)
    print("Rank of design matrix:", model.rank_)
    print("Singular values of X:", model.singular_)

In [82]:
#load data
#download_raw_data() # uncomment this to download the data again
df = load_data()

# split on date, or random proportions
#train, test = split_train_test(df, '2024-04-01')
train, test = train_test_split(df, test_size=0.3, train_size=0.7, shuffle=True)
X_train = get_X(train)
X_test = get_X(test)



Processed 10 of 60 raw files
Processed 20 of 60 raw files
Processed 30 of 60 raw files
Processed 40 of 60 raw files
Processed 50 of 60 raw files
Processed 60 of 60 raw files


In [83]:
# Use the previous day's actual as a naive predictor

def train_and_test_model(model, df_train, df_test, col):
    X_train = get_X(df_train)
    X_test = get_X(df_test)
    y_train = get_y(df_train, col)
    y_test = get_y(df_test, col)
    #scaler = StandardScaler()
    #X_train_scaled = scaler.fit_transform(X_train)
    #X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = X_train
    X_test_scaled = X_test

    model.fit(X_train_scaled, y_train)

    y_pred_train = model.predict(X_train_scaled)
    rmse_train = get_rmse(y_train, y_pred_train)

    y_pred_test = model.predict(X_test_scaled)
    rmse_test = get_rmse(y_test, y_pred_test)

    return model, rmse_train, rmse_test

def naive_predictions(df_train, df_test, col):
    naive_predictions_train = df_train[col]
    actuals_train = df_train[f"Next Day {col}"]
    #mape_naive_train = get_mape(actuals_train, naive_predictions_train)
    #print(f"MAPE train (naive predictor) for {col}: {mape_naive_train}")
    rmse_naive_train = get_rmse(actuals_train, naive_predictions_train)
    #print(f"RMSE train (naive predictor) for {col}: {rmse_naive_train}")

    naive_predictions_test = df_test[col]
    actuals_test = df_test[f"Next Day {col}"]
    #mape_naive_test = get_mape(actuals_test, naive_predictions_test)
    #print(f"MAPE test (naive predictor) for {col}: {mape_naive_test}")
    rmse_naive_test = get_rmse(actuals_test, naive_predictions_test)
    #print(f"RMSE test (naive predictor) for {col}: {rmse_naive_test}")
    return rmse_naive_train, rmse_naive_test

def print_results(case, rmse_naive, rmse_model):
    headline = "Worse" if rmse_naive <= rmse_model else "Better"
    print(f"{case} - {headline} - model {rmse_model} v naive {rmse_naive}")


In [84]:
print ("Linear regression model:")
for col in label_cols:
    # Instantiate linear regression model.
    model = LinearRegression()

    # Train and test it    
    model, rmse_train, rmse_test = train_and_test_model(model, train, test, col)

    # Print model details
    X_train = get_X(train)
    #print_model_stats(model, X_train)

    # Get naive prediction stats for comparison
    rmse_naive_train, rmse_naive_test = naive_predictions(train, test, col)   
    
    print_results(col + " train", rmse_naive_train, rmse_train)
    print_results(col + " test", rmse_naive_test, rmse_test)
    
    

Linear regression model:
SAP train - Better - model 0.43 v naive 0.48
SAP test - Better - model 0.44 v naive 0.46
SMPBuy train - Better - model 0.51 v naive 0.56
SMPBuy test - Worse - model 0.58 v naive 0.58
SMPSell train - Better - model 0.61 v naive 0.68
SMPSell test - Worse - model 0.63 v naive 0.58
Demand train - Better - model 16.09 v naive 18.11
Demand test - Better - model 17.83 v naive 20.22


In [85]:
from sklearn.ensemble import RandomForestRegressor
print ("Random forest model:")
for col in label_cols:
    # Instantiate linear regression model.
    model = RandomForestRegressor()

    # Train and test it    
    model, rmse_train, rmse_test = train_and_test_model(model, train, test, col)

    # Print model details
    X_train = get_X(train)
    #print_model_stats(model, X_train)

    # Get naive prediction stats for comparison
    rmse_naive_train, rmse_naive_test = naive_predictions(train, test, col)   
    
    print_results(col + " train", rmse_naive_train, rmse_train)
    print_results(col + " test", rmse_naive_test, rmse_test)

Random forest model:
SAP train - Better - model 0.18 v naive 0.48
SAP test - Better - model 0.43 v naive 0.46
SMPBuy train - Better - model 0.21 v naive 0.56
SMPBuy test - Worse - model 0.58 v naive 0.58
SMPSell train - Better - model 0.26 v naive 0.68
SMPSell test - Worse - model 0.63 v naive 0.58
Demand train - Better - model 6.44 v naive 18.11
Demand test - Better - model 17.56 v naive 20.22
