In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.nn as nn
from neuralforecast.losses.pytorch import BasePointLoss, _weighted_mean
import torch
from typing import Union
import warnings
# Silence known-noisy warnings emitted by the training stack. Each `message`
# is a regular expression matched against the start of the warning text.
warnings.filterwarnings("ignore", message="val_check_steps is greater than max_steps")
warnings.filterwarnings("ignore", message="The 'val_dataloader' does not have many workers")
warnings.filterwarnings("ignore", message="The 'train_dataloader' does not have many workers")
warnings.filterwarnings("ignore", message="The number of training batches")
# NOTE(review): this pattern hard-codes max_steps=16, while the TCN model in this
# notebook is configured with max_steps=21*100 — confirm it still matches anything.
warnings.filterwarnings("ignore", message="`Trainer.fit` stopped: `max_steps=16` reached.")
warnings.filterwarnings("ignore", message="Trying to infer the `batch_size` from an ambiguous collection")
import logging
# Raise the "pytorch_lightning" logger threshold so only CRITICAL records pass.
logging.getLogger("pytorch_lightning").setLevel(logging.CRITICAL)
import pytorch_lightning as pl
# Trainer configuration: a standalone Lightning trainer with its logger disabled.
# NOTE(review): `trainer` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
trainer = pl.Trainer(logger=False)

In [2]:
# Define the file name
file_path = 'BasicInputTimeSeries.npy'

# Fail fast when the input is missing: every later cell needs `time_series_data`,
# so merely printing a message would only defer the failure to a less helpful
# NameError further down the notebook.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Input file not found: {os.path.abspath(file_path)}")

# SECURITY: allow_pickle=True unpickles object arrays, which can execute
# arbitrary code — only load this file from a trusted source.
time_series_data = np.load(file_path, allow_pickle=True)
print("Data loaded from file.")


Data loaded from file.


In [3]:
#-------------------------------------------------------------
# Reproducibility: seed NumPy's and Python's global random generators
seed = 100
np.random.seed(seed)
random.seed(seed)
print('seed :', seed)
#-------------------------------------------------------------

# Column layout of the raw array, in order
columns = [
    "Year_Mnth_Day",
    "basin_id",
    "prcp(mm/day)",
    "srad(W/m2)",
    "tmax(C)",
    "tmin(C)",
    "vp(Pa)",
    "QObs(mm/d)",
]
df = pd.DataFrame(time_series_data, columns=columns)

# The raw array is no longer needed once the frame exists; release it
del time_series_data

# Derive the long-format columns used by the forecasting cells below
# (timestamp 'ds', series key 'unique_id', target 'y' = precipitation) and
# remove the source columns they replace. The remaining columns
# ("srad(W/m2)", "tmax(C)", "tmin(C)", "vp(Pa)", "QObs(mm/d)") are kept.
df = df.assign(
    ds=pd.to_datetime(df['Year_Mnth_Day']),
    unique_id=df['basin_id'],
    y=df["prcp(mm/day)"].astype(float),
).drop(columns=['Year_Mnth_Day', 'basin_id', "prcp(mm/day)"])

# Disabled: global min-max normalization of every feature column
# for col in df.columns:
#     print(col)
#     if col != 'ds' and col !='unique_id':
#         # Global Normalization
#         scaler = MinMaxScaler(feature_range=(0, 1))
#         df[col] = scaler.fit_transform(df[[col]])


# Order rows by series and then by time so per-series slicing is chronological
df = df.sort_values(by=['unique_id', 'ds'])

# Function to split each group
def split_train_test(data, n):
    """Split every ``unique_id`` group into leading (train) and trailing (test) rows.

    Rows are split in the order they appear inside each group, so sort ``data``
    chronologically beforehand if the split is meant to be time-based.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``unique_id`` column.
    n : int
        Number of trailing rows of each group that go to the test frame.
        ``0`` puts every row in train; a value larger than a group puts the
        whole group in test.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, each with a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    train_frames = []
    test_frames = []
    for _, group in data.groupby('unique_id'):
        # Use an explicit cut position: ``group[:-n]`` / ``group[-n:]`` invert
        # their meaning when n == 0 because ``-0 == 0``.
        cut = max(len(group) - n, 0)
        train_frames.append(group.iloc[:cut])
        test_frames.append(group.iloc[cut:])

    if not train_frames:
        # No groups at all: pd.concat([]) would raise, so return empty frames
        # that keep the input's columns.
        empty = data.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    train_df = pd.concat(train_frames)
    test_df = pd.concat(test_frames)
    train_df.reset_index(inplace=True, drop=True)
    test_df.reset_index(inplace=True, drop=True)
    return train_df, test_df

# Function to drop the last n values from each group
def drop_last_n(data, n):
    """Return ``data`` without the last ``n`` rows of every ``unique_id`` group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``unique_id`` column; rows are dropped from the end
        of each group in their current order.
    n : int
        Number of trailing rows to remove per group. ``0`` keeps every row; a
        value of at least the group length removes the whole group.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, concatenated group by group, with a fresh
        0..N-1 index.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    train_frames = []
    for _, group in data.groupby('unique_id'):
        # ``group[:-n]`` would return nothing for n == 0 (``-0 == 0``), so
        # compute the number of rows to keep explicitly.
        keep = max(len(group) - n, 0)
        train_frames.append(group.iloc[:keep])

    if not train_frames:
        # No groups at all: pd.concat([]) would raise, so return an empty
        # frame that keeps the input's columns.
        return data.iloc[0:0].reset_index(drop=True)

    train_df = pd.concat(train_frames)
    train_df.reset_index(inplace=True, drop=True)
    return train_df


    
# Index the frame by timestamp while keeping 'ds' available as a regular column
df = df.set_index('ds', drop=False)

# Function to calculate EMA for each group
def calculate_ema(group, span):
    """Exponential moving average of ``group`` for the given ``span``.

    Uses pandas' exponentially weighted window with ``adjust=False``.

    Parameters
    ----------
    group : pandas.Series or pandas.DataFrame
        Values to smooth, in order.
    span : float
        Span passed to ``ewm``.

    Returns
    -------
    Same type as ``group``
        The smoothed values, aligned with the input.
    """
    smoother = group.ewm(span=span, adjust=False)
    return smoother.mean()

# Add one EMA feature column per span, smoothing the target 'y' separately
# within each 'unique_id' group
span_settings = [2, 4, 6, 8, 10, 20, 50]  # Example spans
for span in span_settings:
    ema_col = f'EMA_{span}'
    print(ema_col)
    y_by_series = df.groupby('unique_id')['y']
    df[ema_col] = y_by_series.transform(lambda series: calculate_ema(series, span))

# Return to a plain positional index now that the features are in place
df = df.reset_index(drop=True)
   
    
# Number of records to be taken as test data for each unique_id
n = 3000
# Number of most recent records kept per series before the train/test split
len_time_series = 7000
_, df = split_train_test(df, len_time_series)

# The metric cell reshapes the results to (n_time_series, n), which is only
# valid when every series contributes the same number of rows. Check that here
# rather than failing (or silently misaligning) much later.
series_lengths = df.groupby('unique_id').size()
if not (series_lengths == len_time_series).all():
    raise ValueError(
        f"Every series must have at least {len_time_series} records; "
        f"shortest series has {series_lengths.min()}"
    )

# Derive the series count from the data instead of hard-coding it, so the
# notebook keeps working if the set of series changes.
n_time_series = len(series_lengths)

train_df, test_df = split_train_test(df, n)


print("DataFrame:")
print(df)



seed : 100
EMA_2
EMA_4
EMA_6
EMA_8
EMA_10
EMA_20
EMA_50
DataFrame:
        srad(W/m2) tmax(C) tmin(C)  vp(Pa)  QObs(mm/d)         ds unique_id  \
0           237.53   12.75   -1.37  599.13    0.376714 1989-11-02   1013500   
1            99.59     8.1   -0.31  561.45    0.379961 1989-11-03   1013500   
2           129.07    3.35   -4.01  425.97    0.397282 1989-11-04   1013500   
3           193.49   -0.75   -9.83   303.0    0.405942 1989-11-05   1013500   
4           159.91    3.82   -5.66  425.19    0.422179 1989-11-06   1013500   
...            ...     ...     ...     ...         ...        ...       ...   
4696995     125.72    4.86    0.39  604.28   22.806309 2008-12-27  14400000   
4696996     101.25    6.06    2.83  702.39  115.775145 2008-12-28  14400000   
4696997     140.54    7.87    2.46  703.56  134.606043 2008-12-29  14400000   
4696998     171.69    6.96   -3.14  532.92    54.40037 2008-12-30  14400000   
4696999     174.73    8.15    0.13  599.32   29.920203 2008-12-3

In [4]:
from neuralforecast.core import NeuralForecast
from neuralforecast.models import VanillaTransformer, TCN
from neuralforecast.losses.pytorch import MAE, MSE, QuantileLoss
from neuralforecast.auto import AutoNHITS, AutoPatchTST
from ray import tune
import torch.nn as nn


horizon = 1

# Historical exogenous inputs: the EMA features of the target followed by the
# remaining columns of the frame
hist_exog_features = [
    "EMA_2", "EMA_4", "EMA_6", "EMA_8", "EMA_10", "EMA_20", "EMA_50",
    "srad(W/m2)", "tmax(C)", "tmin(C)", "vp(Pa)", "QObs(mm/d)",
]

# Single TCN forecaster trained with an MSE loss; no scaler is requested on
# the model or on the NeuralForecast wrapper
tcn_model = TCN(
    h=horizon,
    context_size=1024,
    input_size=1024,
    max_steps=21*100,
    learning_rate=1e-4,
    loss=MSE(),
    random_seed=10,
    batch_size=32,
    scaler_type=None,
    hist_exog_list=hist_exog_features,
)
models = [tcn_model]
nf = NeuralForecast(models=models, freq='1D', local_scaler_type=None)

# Fit on the training frame with no validation hold-out
nf.fit(train_df, val_size=0)

    

[rank: 0] Seed set to 10
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=2100` reached.


In [None]:
# Rolling evaluation over the last n records of every series.
# For each offset i the model sees a 1024-row window per series (1024 matches
# the TCN's input_size) that ends i records before the end of the series, and
# forecasts the next `horizon` step(s). Despite the progress message, the model
# is not re-fitted here; only nf.predict is called.
#
# Predictions are collected in a list and concatenated once at the end:
# concatenating onto a growing DataFrame inside the loop copies all previous
# rows on every iteration (quadratic cost).
pred_frames = []
for i in range(n, 0, -horizon):
    print("re-fit and test :", i)
    # Keep the last i+1024 rows per series, then drop the final i of them
    _, re_train_df = split_train_test(df, i+1024)
    data_part = drop_last_n(re_train_df, i)
    preds = nf.predict(data_part)
    preds = preds.reset_index(drop=False)
    pred_frames.append(preds)

# All predictions in one frame; an empty frame when the loop never ran
all_preds = pd.concat(pred_frames, ignore_index=True) if pred_frames else pd.DataFrame()
    

In [6]:

# Order predictions by series and date, then attach them positionally to a
# copy of the test frame (the i-th sorted prediction goes to the i-th test row)
all_preds = all_preds.sort_values(by=['unique_id', 'ds'])
results_df = test_df.assign(TCN=all_preds['TCN'].to_numpy())


In [7]:
# Actual and predicted target values as standalone series
train_actual_y = train_df['y']
test_actual_y = results_df['y']
test_predicted_y = results_df['TCN']


# Date axes: the test dates of series 1013500 and the first
# (len_time_series - n) dates of the training frame
basin_mask = results_df['unique_id'] == 1013500
index_date_test = results_df.loc[basin_mask, 'ds']
index_date_train = train_df['ds'].iloc[:len_time_series - n]

In [8]:

def mean_absolute_error(y_true, y_pred):
    """Mean of the absolute element-wise differences between ``y_true`` and ``y_pred``.

    The mean is taken over all elements, whatever the array shape.
    """
    abs_error = np.abs(y_true - y_pred)
    return np.mean(abs_error)

def normalized_nash_sutcliffe_efficiencySTavg(y_true, y_pred):  # axis 0 space 1 time
    """Normalized Nash-Sutcliffe efficiency, ``1 / (2 - NSE)``.

    ``NSE = 1 - sum((y_true - y_pred)^2) / sum((y_true - mean(y_true))^2)``,
    with every sum and the mean taken over all elements at once (no per-axis
    reduction).
    """
    residual_ss = np.sum(np.square(y_true - y_pred))
    deviation_ss = np.sum(np.square(y_true - np.mean(y_true)))
    nse = 1 - residual_ss / deviation_ss
    return 1 / (2 - nse)

def mean_squared_error(y_true, y_pred):
    """Mean of the squared element-wise differences between ``y_true`` and ``y_pred``.

    The mean is taken over all elements, whatever the array shape.
    """
    residual = y_true - y_pred
    return np.mean(np.square(residual))

def r_squared(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    ``SS_res`` is the sum of squared residuals and ``SS_tot`` the sum of
    squared deviations of ``y_true`` from its overall mean; both are taken
    over all elements.
    """
    residual = y_true - y_pred
    centered = y_true - np.mean(y_true)
    return 1 - (np.sum(np.square(residual)) / np.sum(np.square(centered)))

# Lay out actuals and predictions as (series, test step) matrices; this relies
# on results_df holding n consecutive rows per series
Y_ACTUAL = results_df['y'].to_numpy().reshape(n_time_series, n)
Y_HAT = results_df['TCN'].to_numpy().reshape(n_time_series, n)

# NOTE: MAE and MSE below rebind the names of the loss classes imported from
# neuralforecast.losses.pytorch in the model cell; they are plain numbers here.
MSE = mean_squared_error(Y_ACTUAL, Y_HAT)
MAE = mean_absolute_error(Y_ACTUAL, Y_HAT)
NNSE = normalized_nash_sutcliffe_efficiencySTavg(Y_ACTUAL, Y_HAT)
R2 = r_squared(Y_ACTUAL, Y_HAT)

for metric_label, metric_value in (
    ("MSE: ", MSE),
    ("MAE: ", MAE),
    ("NNSE: ", NNSE),
    ("R²: ", R2),
):
    print(metric_label, metric_value)


MSE:  30.177716735702713
MAE:  2.6137388779363464
NNSE:  0.6068980608016832
R²:  0.3522768244158696


In [10]:
# Persist the test frame, including its 'TCN' prediction column, to a pickle
# file in the current working directory
results_df.to_pickle('TCN_prcp.pkl')