In [None]:
import sklearn
import pandas as pd
from tsai.basics import *
from tsai.inference import load_learner
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Data Import

In [None]:
# Load the historical CSV (its first column becomes the index) and tidy it in one chain:
#   1. sort rows chronologically by observation date, then time,
#   2. rebuild a 0..n-1 index so positional split indices line up with the rows,
#   3. drop the columns the notebook treats as categorical.
df_initial = (
    pd.read_csv('d:/prog/mlops/projet/data/historical_20080801_full.csv', index_col=0)
    .sort_values(by=['observation_time', 'time'])
    .reset_index(drop=True)
    .drop(['wind_dir', 'time', 'city'], axis=1)
)

## Preprocess Dataframe

In [None]:
# Settings shared by the preprocessing steps; several are reused by later cells.
datetime_col = "observation_time"
freq = 'D'                          # only referenced by the disabled 'add_mts' step below
columns = df_initial.columns[1:]    # every column except the first one
method = 'ffill'
value = 0

# Make sure the output folder exists before the pipeline is pickled into it.
mkdir('data', exist_ok=True, parents=True)

# Preprocessing pipeline.
# Disabled step, kept for reference:
#   ('add_mts', TSAddMissingTimestamps(datetime_col=datetime_col, freq=freq))  # add missing timestamps (if any)
preproc_pipe = sklearn.pipeline.Pipeline(
    steps=[
        # shrink dataframe memory usage
        ('shrinker', TSShrinkDataFrame()),
        # drop duplicate rows (if any)
        ('drop_duplicates', TSDropDuplicates(datetime_col=datetime_col)),
        # fill missing data (1st ffill, 2nd value=0)
        ('fill_missing', TSFillMissing(columns=columns, method=method, value=value)),
    ],
    verbose=True,
)

# Round-trip the (still unfitted) pipeline through disk, then fit it on the raw frame.
save_object(preproc_pipe, 'data/preproc_pipe.pkl')
preproc_pipe = load_object('data/preproc_pipe.pkl')

# NOTE(review): the later cells visible in this notebook pass `df_initial`, not `df`,
# to the split / scale / window helpers -- confirm whether `df` is meant to be used.
df = preproc_pipe.fit_transform(df_initial)

In [None]:

# Forecasting window and split sizes.
fcst_history = 500   # number of steps in the past fed to the model
fcst_horizon = 7     # number of steps in the future to predict
valid_size = 0.1     # int or float indicating the size of the validation set
test_size = 0.2      # int or float indicating the size of the test set

# Build the train / valid / test index splits from the sorted frame.
splits = get_forecasting_splits(
    df_initial,
    fcst_history=fcst_history,
    fcst_horizon=fcst_horizon,
    datetime_col=datetime_col,
    valid_size=valid_size,
    test_size=test_size,
)


## Scale Dataframe

In [None]:
# Experiment pipeline: a single standardization step over the feature columns.
exp_pipe = sklearn.pipeline.Pipeline(
    steps=[('scaler', TSStandardScaler(columns=columns))],
    verbose=True,
)

# Round-trip the unfitted pipeline through disk before fitting it.
save_object(exp_pipe, 'data/exp_pipe.pkl')
exp_pipe = load_object('data/exp_pipe.pkl')

# Standardize the data using the training indices only: `scaler__idxs` is forwarded
# to the 'scaler' step (sklearn's `<step>__<param>` fit-parameter convention).
train_split = splits[0]
# NOTE(review): whether `df_initial` itself is modified by this call depends on
# TSStandardScaler's implementation -- confirm before relying on either frame.
df_scaled = exp_pipe.fit_transform(df_initial, scaler__idxs=train_split)
df_scaled.describe()

## Sliding Window

In [None]:
# Build the sliding-window arrays from the frame returned by the scaling pipeline.
# Using `df_scaled` (instead of `df_initial`) makes the data lineage explicit: the
# windows are guaranteed to hold standardized values regardless of whether the scaler
# mutates its input in place or returns a new frame.
# The first column is skipped; every other column is both an input and a target.
x_vars = df_scaled.columns[1:]
y_vars = df_scaled.columns[1:]
X, y = prepare_forecasting_data(
    df_scaled,
    fcst_history=fcst_history,
    fcst_horizon=fcst_horizon,
    x_vars=x_vars,
    y_vars=y_vars,
)
X.shape, y.shape

In [227]:
# Only needed by Mac users: rebuild X and y as float32 torch tensors.
# NOTE(review): presumably a workaround for a backend without float64 support -- confirm.
X, y = (torch.tensor(windowed, dtype=torch.float32) for windowed in (X, y))

In [71]:
# PatchTST architecture hyper-parameters.
arch_config = {
    "n_layers": 3,           # number of encoder layers
    "n_heads": 4,            # number of heads
    "d_model": 16,           # dimension of model
    "d_ff": 128,             # dimension of fully connected network
    "attn_dropout": 0.0,     # dropout applied to the attention weights
    "dropout": 0.3,          # dropout applied to all linear layers in the encoder except q,k&v projections
    "patch_len": 24,         # length of the patch applied to the time series to create patches
    "stride": 2,             # stride used when creating patches
    "padding_patch": True,   # padding_patch
}

# Build the forecaster on the windowed data; both sklearn pipelines are attached to the learner.
learn = TSForecaster(
    X,
    y,
    splits=splits,
    batch_size=16,
    path='d:/prog/mlops/projet/models/',
    pipelines=[preproc_pipe, exp_pipe],
    arch="PatchTST",
    arch_config=arch_config,
    metrics=[mse, mae],
    cbs=ShowGraph(),
)

# Run the learning-rate finder and show its `valley` suggestion as the cell output.
learn.lr_find().valley

In [None]:
# Training run settings.
name = 'Margaux_v0'   # base name of the exported learner file
n_epochs = 20
lr_max = 2e-3

# One-cycle training (author's note: one epoch takes about 20 minutes).
learn.fit_one_cycle(n_epochs, lr_max=lr_max)

# Export the trained learner to the models folder as <name>.pt.
learn.export(f'd:/Prog/mlops/projet/models/{name}.pt')

## Evaluate model

In [None]:
def _score_split(label, y_true, y_pred):
    """Return a one-row DataFrame (row `label`) holding MSE and MAE over the flattened arrays."""
    scores = pd.DataFrame(columns=["mse", "mae"])
    scores.loc[label, "mse"] = mean_squared_error(y_true.flatten(), y_pred.flatten())
    scores.loc[label, "mae"] = mean_absolute_error(y_true.flatten(), y_pred.flatten())
    return scores


# Reload the exported learner.
name = 'Margaux_v0'
learn = load_learner(f'd:/Prog/mlops/projet/models/{name}.pt')

# Reload the CSV and apply the same sort / re-index / column drop as the data-import cell.
df_initial = (
    pd.read_csv('d:/prog/mlops/projet/data/historical_20080801_full.csv', index_col=0)
    .sort_values(by=['observation_time', 'time'])   # chronological order
    .reset_index(drop=True)                         # positional index for the split helper
    .drop(['wind_dir', 'time', 'city'], axis=1)     # remove categorical columns
)
# NOTE(review): the reloaded frame is not passed through any preprocessing / scaling
# pipeline in this cell, although the names below say "scaled" -- confirm the windows
# built here are on the same scale as the data the learner was trained on.

# Same window and split settings as used for training.
fcst_history = 500   # number of steps in the past
fcst_horizon = 7     # number of steps in the future
valid_size = 0.1     # int or float indicating the size of the validation set
test_size = 0.2      # int or float indicating the size of the test set
datetime_col = "observation_time"

# Sliding windows: every column except the first is both input and target.
x_vars = df_initial.columns[1:]
y_vars = df_initial.columns[1:]
X, y = prepare_forecasting_data(
    df_initial,
    fcst_history=fcst_history,
    fcst_horizon=fcst_horizon,
    x_vars=x_vars,
    y_vars=y_vars,
)

splits = get_forecasting_splits(
    df_initial,
    fcst_history=fcst_history,
    fcst_horizon=fcst_horizon,
    datetime_col=datetime_col,
    valid_size=valid_size,
    test_size=test_size,
)

# --- Validation split (splits[1]) ---
scaled_preds, *_ = learn.get_X_preds(X[splits[1]])
scaled_preds = to_np(scaled_preds)
print(f"scaled_preds.shape: {scaled_preds.shape}")

scaled_y_true = y[splits[1]]
results_df = _score_split("valid", scaled_y_true, scaled_preds)
print('result on Valid samples')
display(results_df)

# --- Test split (splits[2]) ---
X_test = X[splits[2]]
y_test = y[splits[2]]
y_test_preds, *_ = learn.get_X_preds(X_test)
y_test_preds = to_np(y_test_preds)
print(f"y_test_preds.shape: {y_test_preds.shape}")

# `results_df` is rebuilt here, so it ends the cell holding only the "test" row.
results_df = _score_split("test", y_test, y_test_preds)
print('result on Test samples')
display(results_df)

# Plot test inputs, targets and predictions.
plot_forecast(X_test, y_test, y_test_preds, sel_vars=True)