# Online short-term load forecasting (STLF) with data streams and drift detection

### 1.1 Local STLF

In [6]:
import pandas as pd
import os
import tensorflow as tf

import sys  
sys.path.append("../")  
from utils.modelgenerator import *
from utils.modelhandler import *
from utils.datahandler import *

In [7]:


# Load the engineered feature table. `cwd` is the directory two levels above
# the notebook's working directory; the data folder is resolved relative to it.
cwd = os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir))
df = pd.read_csv(
    cwd+'/data/2feature_engineering_data/df_with_final_features.csv',
    index_col='Date',
).fillna(0)  # missing values are replaced by 0
df.index = pd.to_datetime(df.index)

# One frame per user: the user's own column, the shared feature columns, and
# that user's 24h-lag column. Only the first user is selected here.
df_array = [
    df[[f'User{user_no}', 'temp', 'rhum', 'wspd', 'PC1', 'hour sin', 'hour cos', f'User{user_no}_lag_24hrs']]
    for user_no in range(1, 2)
]

#Train, Validation and Test datasets
sequence_length = 25
batch_size = 16
num_features = df_array[0].shape[1]

dh = Datahandler()

# Per-user dictionaries keyed by 'user<N>' (1-based), filled in the loop below.
X_train, y_train, X_val, y_val, X_test, y_test = {}, {}, {}, {}, {}, {}

#Create Train, Validation and Test datasets
# The loop variable is `user_df` (not `df`) so the full feature frame bound to
# `df` is not overwritten by the last per-user frame.
for idx, user_df in enumerate(df_array):
    user_key = f'user{idx+1}'

    # Positional 70% / 20% / 10% split (rows are taken in their existing order).
    n = len(user_df)
    train_end = int(n*0.7)
    val_end = int(n*0.9)
    train_df = user_df[0:train_end]
    val_df = user_df[train_end:val_end]
    test_df = user_df[val_end:]

    # Min max scaling
    # NOTE(review): each split goes through its own min_max_scaling call. If the
    # helper derives min/max from the frame it receives, validation and test are
    # scaled with their own statistics instead of the training ones -- confirm
    # against Datahandler.min_max_scaling.
    train_df = dh.min_max_scaling(train_df)
    val_df = dh.min_max_scaling(val_df)
    test_df = dh.min_max_scaling(test_df)

    # Sequencing
    train_sequences = dh.create_sequences(train_df, sequence_length)
    val_sequences = dh.create_sequences(val_df, sequence_length)
    test_sequences = dh.create_sequences(test_df, sequence_length)

    #Split into feature and label
    X_train[user_key], y_train[user_key] = dh.prepare_data(train_sequences, batch_size)
    X_val[user_key], y_val[user_key] = dh.prepare_data(val_sequences, batch_size)
    X_test[user_key], y_test[user_key] = dh.prepare_data(test_sequences, batch_size)

# ---- Shared training configuration (used by all models) ----
horizon = 1
max_epochs = 100

# Project helpers that build models and run compile/fit/evaluate.
m1 = ModelGenerator()
mh = Modelhandler()

# The loss is MSE; RMSE, MAPE and MAE are tracked as additional metrics.
loss = tf.keras.losses.MeanSquaredError()
metrics = [
    metric_cls()
    for metric_cls in (
        tf.keras.metrics.RootMeanSquaredError,
        tf.keras.metrics.MeanAbsolutePercentageError,
        tf.keras.metrics.MeanAbsoluteError,
    )
]

# Callbacks: early stopping on the validation loss plus two project callbacks.
# The same callback instances are passed to every training run.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='min',
)
timing_callback = TimingCallback()
custom_callback = CustomCallback()
callbacks = [early_stopping, timing_callback, custom_callback]





In [8]:
# Empty summary table: one row per aggregated experiment, holding the timing
# columns followed by a (mean, std) column pair for each error metric.
dense_results = pd.DataFrame(
    columns=['architecture', 'train_time', 'avg_time_epoch']
    + [
        column
        for metric_name in ('mse', 'rmse', 'mape', 'mae')
        for column in (metric_name, f'{metric_name}_std')
    ]
)

In [9]:
#Dense 1 -------------------------------------------------------------

#Dense Hyperparameter
dense_architecture = "L3_U16"
dense_layers = 3
dense_units = 16
dense_rounds = 3  # training runs per user; their spread feeds the *_std columns

# One results object per (user, round), combined into a single frame after
# training. Collecting first and concatenating once keeps every run as its own
# row; an outer merge on all shared columns would collapse identical rows.
dense_run_results = []

#For each user in df_array
for idx in range(len(df_array)):
    user_key = f'user{idx+1}'
    print("User: ", idx+1)
    # Named `run_idx` rather than `round` so the builtin round() stays usable
    # in later cells.
    for run_idx in range(dense_rounds):
        # A new model is built for every round.
        dense_model = m1.build_dense_model(X_train[user_key], horizon, num_layers=dense_layers, units=dense_units, batch_size=batch_size)
        dense_histroy, dense_user_results = mh.compile_fit_evaluate_model(
            model=dense_model,
            loss=loss,
            metrics=metrics,
            X_train=X_train[user_key],
            y_train=y_train[user_key],
            max_epochs=max_epochs,
            batch_size=batch_size,
            X_val=X_val[user_key],
            y_val=y_val[user_key],
            X_test=X_test[user_key],
            y_test=y_test[user_key],
            callbacks=callbacks,
            user=user_key,
            hyper=dense_architecture,
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
        )
        # assumes dense_user_results is a DataFrame carrying the columns listed
        # in the fallback frame below -- TODO confirm against Modelhandler
        dense_run_results.append(dense_user_results)

    # The model is currently not persisted. To save the last round's model, re-enable:
    #dense_model.save(cwd + f"/models/Local_learning/Dense/{dense_architecture}/User{idx+1}")
    print(f"Finished Dense {dense_architecture} for {user_key} (model not saved)")

# All individual runs in one frame (empty, with the expected columns, if nothing ran).
if dense_run_results:
    dense_all_results = pd.concat(dense_run_results, ignore_index=True)
else:
    dense_all_results = pd.DataFrame(columns=["user", "architecture", "train_time", "avg_time_epoch", "mse", "rmse", "mape", "mae"])

# Aggregate the runs of each user into one summary row: mean timings, and
# mean / standard deviation of every error metric.
# Note: each execution of this cell appends new rows to dense_results.
for idx in range(len(df_array)):
    user_runs = dense_all_results[dense_all_results["user"] == f"user{idx+1}"]
    new_row = {
        'architecture': dense_architecture,
        'train_time': user_runs["train_time"].mean(),
        'avg_time_epoch': user_runs["avg_time_epoch"].mean(),
    }
    for metric_name in ("mse", "rmse", "mape", "mae"):
        new_row[metric_name] = user_runs[metric_name].mean()
        new_row[f"{metric_name}_std"] = user_runs[metric_name].std()
    dense_results.loc[len(dense_results)] = new_row

User:  1

Saved Soft_Dense_MoE


In [10]:
# Write the aggregated results to the evaluations folder, then display them.
dense_results.to_csv(path_or_buf='../../evaluations/Test_Datastream_Dense.csv')
dense_results

Unnamed: 0,architecture,train_time,avg_time_epoch,mse,mse_std,rmse,rmse_std,mape,mape_std,mae,mae_std
0,L3_U16,13.438615,0.420715,0.03389,0.001599,0.184058,0.004374,142224.630208,20025.937011,0.12693,0.001977


### 1.2 Data Streams

In [12]:
from river import stream
import pandas as pd

import os
# Resolve the directory two levels above the working directory, then load the
# final dataset with the 'Date' column as index and preview the first rows.
cwd = os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir))
df = pd.read_csv(
    cwd+'/data/3final_data/final_dataset.csv',
    index_col='Date',
)
df.head()

Unnamed: 0_level_0,User1,User10,User11,User12,User13,User14,User15,User16,User17,User18,...,User6,User7,User8,User9,temp,dwpt,rhum,wdir,wspd,pres
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-07-01 00:00:00,0.068,0.703,0.353,0.21,0.138,0.208,0.065,0.34,0.129,0.576,...,0.636,0.106,0.156,0.012,5.1,4.5,96.0,,0.0,1015.2
2012-07-01 01:00:00,0.786,0.036,0.547,0.197,0.343,0.176,0.067,0.508,0.121,0.128,...,0.253,0.098,0.151,0.022,5.1,4.5,96.0,,0.0,1015.2
2012-07-01 02:00:00,0.544,0.045,0.519,0.163,0.339,0.164,0.057,0.542,0.141,0.098,...,0.22,0.089,0.152,0.023,5.1,4.5,96.0,,0.0,1015.2
2012-07-01 03:00:00,0.612,0.031,0.324,0.173,0.337,0.178,0.063,0.59,0.165,0.097,...,0.241,0.103,0.148,0.012,2.7,2.3,97.0,200.0,7.6,1014.8
2012-07-01 04:00:00,0.665,0.018,0.343,0.156,0.363,0.193,0.065,0.551,0.122,0.101,...,0.199,0.09,0.146,0.038,2.7,2.3,97.0,200.0,7.6,1014.8


In [13]:
# Stream the CSV record by record and show only the first (features, target)
# pair. 'User1' is the target; it and the listed weather columns are converted
# to float. 'wdir' is left unconverted (the first record holds an empty string
# there), and the remaining user columns stay as strings.
for x, y in stream.iter_csv(
    cwd+'/data/3final_data/final_dataset.csv',
    target='User1',
    parse_dates={'Date': '%Y-%m-%d %H:%M:%S'},
    converters=dict.fromkeys(('User1', 'temp', 'dwpt', 'rhum', 'wspd', 'pres'), float),
):
    print(x,y)
    break  # one record is enough for the preview

# x and y remain bound to that first record.

{'Date': datetime.datetime(2012, 7, 1, 0, 0), 'User10': '0.703', 'User11': '0.353', 'User12': '0.21', 'User13': '0.138', 'User14': '0.208', 'User15': '0.065', 'User16': '0.34', 'User17': '0.129', 'User18': '0.576', 'User19': '0.075', 'User2': '0.254', 'User20': '2.35', 'User21': '0.174', 'User22': '0.698', 'User23': '0.048', 'User24': '0.163', 'User25': '0.175', 'User26': '0.888', 'User27': '0.818', 'User28': '0.138', 'User29': '0.154', 'User3': '1.048', 'User30': '0.022', 'User31': '0.288', 'User32': '0.057', 'User33': '0.284', 'User34': '0.244', 'User35': '0.397', 'User36': '0.102', 'User4': '0.09', 'User5': '0.098', 'User6': '0.636', 'User7': '0.106', 'User8': '0.156', 'User9': '0.012', 'temp': 5.1, 'dwpt': 4.5, 'rhum': 96.0, 'wdir': '', 'wspd': 0.0, 'pres': 1015.2} 0.068


### 1.3 Drift detection

In [14]:
from river import drift

# ADWIN drift detector with default settings (created again in the next cell before it is used).
adwin = drift.ADWIN()

In [15]:
import random
from river import drift

# Synthetic drift demo: feed ADWIN a stream whose value range changes halfway
# through and report where a change is flagged.
rng = random.Random(12345)  # fixed seed so the generated stream is reproducible
adwin = drift.ADWIN()

# First 1000 values are drawn from {0, 1}, the next 1000 from {4, 5, 6, 7}.
data_stream = rng.choices([0, 1], k=1000) + rng.choices(range(4, 8), k=1000)

for i, val in enumerate(data_stream):
    # The return value is deliberately not assigned to `_`: IPython uses `_`
    # for the last cell output and stops updating it once user code sets it.
    adwin.update(val)
    if adwin.drift_detected:
        print(f"Change detected at index {i}, input value: {val}")

Change detected at index 1023, input value: 4
