## Imports and Setup - Data prep

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.callbacks  import EarlyStopping

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
%%writefile "ts_utils.py"

# DO NOT EDIT THIS FILE - GENERATED FROM 02_ts_utils.ipynb

import tensorflow as tf
from tensorflow import keras
from keras.callbacks  import EarlyStopping

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle

# Default every figure to a wide, short canvas (14 x 4 inches)
mpl.rcParams['figure.figsize'] = (14, 4)
# Draw grid lines on all axes by default
mpl.rcParams['axes.grid'] = True

#--------------------------------------------------------------------------------
def load_file(file='../data/jena_climate_2009_2016.csv.zip'):
    '''
    Load the data you prepared - you must have run 01_ts_dataprep first.

    Four files are read, all derived from the ``file`` prefix:
        <file>.csv          raw data; its 'Date Time' column is parsed to datetimes
        <file>.trn.csv      scaled training split
        <file>.tst.csv      scaled test split
        <file>.scaler.pkl   pickled scaler object

    Returns:
        (df, df_scaled_trn, df_scaled_tst, scaler)
    '''
    df = pd.read_csv(f'{file}.csv')
    df['Date Time'] = pd.to_datetime(df['Date Time'], format='%Y-%m-%d %H:%M:%S')
    df_scaled_trn = pd.read_csv(f'{file}.trn.csv')
    df_scaled_tst = pd.read_csv(f'{file}.tst.csv')

    # NOTE: unpickling can execute arbitrary code - only load scaler files you
    # produced yourself. The context manager guarantees the handle is closed.
    with open(f'{file}.scaler.pkl', 'rb') as fh:
        scaler = pickle.load(fh)

    return df, df_scaled_trn, df_scaled_tst, scaler

#--------------------------------------------------------------------------------
def window(dataset, window_len, output_length, label_slice=slice(0,1), batch_size=1, skip = 0):
    '''
    Turn a dataset of rows into batched (features, labels) sliding windows.

    dataset:        dataset of rows, e.g. tf.data.Dataset.from_tensor_slices(...)
    window_len:     number of leading rows used as the features
    output_length:  number of trailing rows used as the labels (steps to predict)
    label_slice:    columns kept for the labels (indices or slice(start, end, step))
    batch_size:     number of (features, labels) pairs per batch
    skip:           rows left out between the features and the labels

    Usage:
        ds = tf.data.Dataset.from_tensor_slices(df)
        wd = window(ds, 3, 2, slice(1, 3), 1, 1)
    '''
    span = window_len + skip + output_length

    def to_features_and_labels(chunk):
        features = chunk[:window_len]
        labels = chunk[window_len + skip:, label_slice]
        return features, labels

    # One window per starting row; incomplete trailing windows are dropped.
    nested = dataset.window(span, shift=1, drop_remainder=True)
    # Flatten the nested windows back to rows, then regroup `span` rows at a time.
    chunks = nested.flat_map(lambda sub: sub).batch(span)

    pairs = chunks.map(to_features_and_labels)
    return pairs.batch(batch_size)

def windowae(dataset, window_len, batch_size=1):
    '''
    Sliding windows for an auto encoder: the label of each window is the window itself.

    dataset:     dataset of rows, e.g. tf.data.Dataset.from_tensor_slices(...)
    window_len:  number of rows per window
    batch_size:  number of (window, window) pairs per batch
    '''
    def as_input_and_target(chunk):
        return chunk, chunk

    chunks = (
        dataset.window(window_len, shift=1, drop_remainder=True)
        .flat_map(lambda sub: sub)
        .batch(window_len)
    )
    return chunks.map(as_input_and_target).batch(batch_size)

#--------------------------------------------------------------------------------
# Compute the Average of the training output and we will use this as default predictions
# Also for computing R-squared value
def compute_avg(window):
    '''
    Average the label component (element 1) of every item yielded by ``window``.

    window:   iterable of (features, labels) items
    returns:  sum of all labels divided by the number of items (an unweighted
              mean over items, element-wise when the labels are arrays)
    raises:   ValueError when ``window`` yields no items
    '''
    count, total = 0, None
    for w in window:
        # Build a new object with `+` instead of `+=`: the in-place form would
        # silently overwrite the first item's label array when it is mutable.
        total = w[1] if total is None else total + w[1]
        count += 1

    if count == 0:
        raise ValueError("compute_avg: 'window' yielded no items to average")

    return total / count

#--------------------------------------------------------------------------------
def model_predict(model, window, y=None, yhat= None, howmany=1024*1024):
    '''
    Run the model over the batches of ``window`` and collect the last time step.

    model:    object with predict(x, verbose=0)
    window:   object with take(n) yielding (features, labels) batches
    y:        previously collected expected values to extend (or None)
    yhat:     previously collected predictions to extend; only used when y is given
    howmany:  maximum number of batches to take from ``window``

    returns:  (y, yhat) as NumPy arrays holding only the last time step of each
              sample; the inputs are returned untouched when no batch was read
    '''
    expected, predicted = [], []
    for w in window.take(howmany):
        yp = model.predict(w[0], verbose=0)

        # Keep only the final step along axis 1 of the labels and predictions
        expected.append(np.asarray(w[1][:, -1, :]))
        predicted.append(np.asarray(yp[:, -1, :]))

    if not expected:
        return y, yhat

    if y is not None:
        expected.insert(0, y)
        predicted.insert(0, yhat)

    # Concatenate once at the end instead of re-copying the arrays every batch
    return np.concatenate(expected), np.concatenate(predicted)


#--------------------------------------------------------------------------------
def inverse_transform(yh, scaler, label_slice, df=None):
    '''
    Undo the scaling of label columns only.

    yh:           scaled values, one row per sample; they are written into the
                  ``label_slice`` columns of a NaN-filled matrix
                  (NOTE(review): the code needs yh to fit that 2-D slot - confirm callers)
    scaler:       object exposing n_features_in_ and inverse_transform
    label_slice:  columns of the full feature matrix that ``yh`` belongs to
    df:           optional DataFrame whose column names label the result

    returns:      the full inverse-transformed matrix, or - when ``df`` is
                  given - a DataFrame with just the label columns
    '''
    # Columns outside label_slice stay NaN; only the label columns carry data.
    padded = np.full((yh.shape[0], scaler.n_features_in_), np.nan)
    padded[:, label_slice] = yh

    restored = scaler.inverse_transform(padded)
    if df is None:
        return restored

    return pd.DataFrame(restored[:, label_slice], columns=df.columns[label_slice])

#--------------------------------------------------------------------------------
def compile_fit(model, window_trn=None, window_tst= None, opt=None, patience=3, epochs=1, callbacks=None, **kwargs):
    '''
    Compile ``model`` with MSE loss / MAE metric and optionally train it.

    model:       model to compile (and fit)
    window_trn:  training data; when None the model is only compiled
    window_tst:  validation data passed to fit()
    opt:         optimizer; defaults to Adam
    patience:    EarlyStopping patience on 'val_loss'
    epochs:      number of training epochs
    callbacks:   extra callbacks; an EarlyStopping callback is always added last
    kwargs:      forwarded to model.fit()

    returns:     the value returned by model.fit(), or [] when no training data was given
    '''
    earlyStopCB = EarlyStopping(monitor='val_loss', patience=patience, mode='min', restore_best_weights = True)

    # Build a fresh list: appending to the argument would mutate the caller's
    # list, and a shared default list would collect one more EarlyStopping
    # callback on every call.
    callbacks = [*(callbacks or []), earlyStopCB]

    # You may use these callbacks to save the model if you wanted to

    #saveModelCB = ModelCheckpoint(filepath=model.name + ".model.tf", save_best_only=True, verbose=0)
    #tensorBrdCB = TensorBoard(log_dir= f'./logs/{model.name}', histogram_freq=0, write_graph=True, write_images=True)

    loss = tf.keras.losses.MeanSquaredError()
    opt  = opt or tf.keras.optimizers.Adam()
    mets = [tf.keras.metrics.MeanAbsoluteError()]

    ##=> Other options you can try
    #learning_rate = 1e-6
    #opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    #opt = tf.keras.optimizers.SGD()
    #loss=tf.keras.losses.Huber()

    model.compile(loss= loss, optimizer= opt, metrics=mets)

    history = []
    if (window_trn is not None):
        # NOTE(review): newer Keras releases may no longer accept `workers` /
        # `use_multiprocessing` in fit() - confirm against the installed version.
        history = model.fit(window_trn, epochs=epochs, validation_data=window_tst,
                                workers=4, use_multiprocessing=True, callbacks=callbacks, **kwargs)

    return history

#--------------------------------------------------------------------------------
def getCommonLayer(ouput_len, ouput_feat_len, previousLayer=None, activation="linear"):
    '''
    Build the output head shared by all models: a Dense layer followed by a Reshape.

    If you are building a forecasting model with, say,
        ouput_len      = 4
        ouput_feat_len = 2
    the Dense layer has ouput_len * ouput_feat_len units (zero-initialised
    kernel, linear activation by default - relu or a custom activation can make
    sense too) and the Reshape turns its output into [ouput_len, ouput_feat_len]
    so it can be compared with the actual output. Call this function instead of
    creating that last layer by hand.

    returns:  [Dense, Reshape] when previousLayer is None, otherwise the result
              of applying both layers to previousLayer
    '''
    # Units => out_len * #features
    dense = tf.keras.layers.Dense(
        ouput_len * ouput_feat_len,
        activation=activation,
        kernel_initializer=tf.initializers.zeros(),
    )
    # Target shape => [out_steps, features]
    reshape = tf.keras.layers.Reshape([ouput_len, ouput_feat_len])

    if previousLayer is None:
        return [dense, reshape]

    return reshape(dense(previousLayer))

# TS Plot utilities

In [None]:
%%writefile "ts_plot_utils.py"

# DO NOT EDIT THIS FILE - GENERATED FROM 02_ts_utils.ipynb

import tensorflow as tf
import tensorflow.keras as keras
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle
import ts_utils

#--------------------------------------------------------------------------------
def plot(y, yh, x=None, title="", scaler=None):
    '''
    Scatter-plot expected values against predictions on the current axes.

    y:       expected values
    yh:      predicted values
    x:       x coordinates; defaults to 0..n-1
    title:   plot title
    scaler:  optional object whose inverse_transform is applied to y and yh first
    '''
    if scaler is not None:
        y1  = scaler.inverse_transform(y)
        yh1 = scaler.inverse_transform(yh)
    else:
        y1, yh1 = y, yh

    # `x or ...` would raise for NumPy arrays (ambiguous truth value), so test explicitly
    if x is None or len(x) == 0:
        x = range(max(len(y1), len(yh1)))

    plt.scatter(x, y1,  marker='.', s=64, edgecolor='k', label="Y")
    # Raw string: "\h" is not a valid escape sequence in a normal string literal
    plt.scatter(x, yh1, marker='x', s=64, edgecolor='k', label=r"$\hat{y}$")
    plt.title(title)
    plt.grid(True)
    plt.legend()

#--------------------------------------------------------------------------------
def plotFeatureImportance(weights, labels=None):
    '''
    Draw one bar per weight; when labels are given, use them as rotated x tick labels.

    weights:  sequence of bar heights
    labels:   optional sequence of tick labels (list, array or pandas Index)
    '''
    plt.bar( x = range(len(weights)), height=weights)

    # Explicit test: `if labels:` raises for NumPy arrays and pandas Index objects
    if labels is None or len(labels) == 0:
        return

    print(labels)
    axis = plt.gca()
    axis.set_xticks(range(len(labels)))
    axis.tick_params(axis='both', which='major', labelsize=15)

    _ = axis.set_xticklabels(labels, rotation=90)

#--------------------------------------------------------------------------------
def eval_performance(model, trn_dataset, tst_dataset=None, metric_name="loss"):
    '''
    Evaluate ``model`` on the training (and optionally test) data and pick one metric.

    model:        object with evaluate(dataset) and a metrics_names list
    trn_dataset:  data for the first evaluation
    tst_dataset:  data for the second evaluation; 0 is reported when it is None
    metric_name:  entry of model.metrics_names to report

    returns:      (training value, test value)
    raises:       ValueError when metric_name is not in model.metrics_names
    '''
    # evaluate() may return a bare scalar or a list of values; normalise to 1-D
    en = np.atleast_1d(model.evaluate(trn_dataset))
    if tst_dataset is not None:
        et = np.atleast_1d(model.evaluate(tst_dataset))
    else:
        et = np.zeros(en.size, dtype=int)

    mi = model.metrics_names.index(metric_name)

    return en.flat[mi], et.flat[mi]

# Shared cache: model name -> (training metric, test metric)
performance = {}
# Alias that stays reachable inside plot_performance, where the parameter
# named `performance` shadows the module-level name.
_PERFORMANCE_CACHE = performance

def plot_performance(models, trn_dataset, tst_dataset=None, metric_name="loss", performance=None, reeval=0):
    '''
    Evaluate each model (unless already cached) and draw a grouped bar chart.

    models:       iterable of models; each needs a .name and must work with eval_performance
    trn_dataset:  training data passed to eval_performance
    tst_dataset:  optional test data passed to eval_performance
    metric_name:  metric to compare
    performance:  dict used as cache of results; defaults to the module-level
                  ``performance`` dict so results persist across calls
    reeval:       truthy to re-evaluate models that are already in the cache

    returns:      the cache dict {model name: (training value, test value)}
    '''
    results = _PERFORMANCE_CACHE if performance is None else performance

    for m in models:
        if (not reeval and m.name in results):
            print(f"Performance for {m.name} exists")
            continue  # Dont evaluate if performance is already computed

        results[m.name] = eval_performance(m, trn_dataset, tst_dataset, metric_name)

    if (not results):
        print("No models to plot?")
        return results

    x = np.arange(len(results))
    width = 0.3
    trn_vals = [v[0] for v in results.values()]
    tst_vals = [v[1] for v in results.values()]

    # Call plt.title(); assigning to it would replace the pyplot function with
    # a string and break every later plt.title(...) call in the session.
    plt.title(f"Comparisons of '{metric_name}' : ")
    plt.ylabel('Metrics')
    plt.bar(x - 0.17, trn_vals, width, label= f'Training {metric_name}')
    plt.bar(x + 0.17, tst_vals, width, label= f'Test {metric_name}')
    plt.xticks(ticks=x, labels=list(results.keys()), rotation=45)
    _ = plt.legend()

    return results

#--------------------------------------------------------------------------------
def plot_predictions(ydf, yhatdf, start=0, end=1024*1024, title=""):
    '''
    Show one scatter figure per column comparing expected and predicted values.

    ydf:     DataFrame of expected values
    yhatdf:  DataFrame of predictions; must contain the columns of ydf
    start:   first row to plot
    end:     row to stop at (exclusive)
    title:   title used for every figure
    '''
    for c in ydf.columns:
        # show() ends the current figure, so each column needs its own
        plt.figure(figsize=(14, 4))

        y1, p1 = ydf[c][start:end], yhatdf[c][start:end]
        plt.scatter( y1.index, y1, edgecolors='k', marker='o', label= f'{c}: y',    c='#2ca02c' )
        plt.scatter( p1.index, p1, edgecolors='k', marker='X', label= f'{c}: yhat', c='#ff7f0e')

        # Call plt.title(); assigning to it would overwrite the pyplot function
        plt.title(title)
        plt.legend()
        plt.show()


#--------------------------------------------------------------------------------
def predict_and_plot( model, window_trn, window_tst, howmany=1024* 1024,
                        plot_start=0, plot_end=1024*1024, df=None, scaler=None, label_slice=None):
    '''
    Predict over the training (and optional test) windows, then plot the result.

    model:        model handed to ts_utils.model_predict
    window_trn:   training windows
    window_tst:   test windows appended after the training ones, or None
    howmany:      maximum number of batches taken from each window set
    plot_start:   first row to plot
    plot_end:     row to stop plotting at
    df:           when given, values are inverse-transformed with ``scaler`` and
                  labelled with this frame's columns
    scaler:       scaler used for the inverse transform (only used with ``df``)
    label_slice:  label columns (only used with ``df``)

    returns:      (expected DataFrame, predicted DataFrame)
    '''
    expected, predicted = ts_utils.model_predict(model, window_trn, None, None, howmany)
    if window_tst is not None:
        expected, predicted = ts_utils.model_predict(model, window_tst, expected, predicted, howmany)

    if df is None:
        ydf = pd.DataFrame(expected)
        pdf = pd.DataFrame(predicted)
    else:
        ydf = ts_utils.inverse_transform(expected, scaler, label_slice, df)
        pdf = ts_utils.inverse_transform(predicted, scaler, label_slice, df)

    plot_predictions(ydf, pdf, plot_start, plot_end, title=f"{model.name}")

    return ydf, pdf

# Column Transformer 

This is a custom column transformer that helps with DataFrame transformations.

In [125]:
#%%writefile "ts_transformer.py"

# DO NOT EDIT THIS FILE - GENERATED FROM 02_ts_utils.ipynb

import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle

class myColumnTransformer(ColumnTransformer):
    '''
    ColumnTransformer that returns DataFrames and adds inverse_transform.

    USAGE:
        Easiest way to use it:

            df = pd.read_csv("../data/processminer-rare-event-mts.csv.zip", sep=";")
            scaler, df2 = myColumnTransformer.scale_df(df)

        Use the returned scaler to scale other data with the same columns:

            df3 = scaler.transform(other_df)
    '''

    def out_feature_names(self):
        '''
        Return the output column names without the "<transformer>__" prefix.
        '''
        if not self.verbose_feature_names_out:
            return self.get_feature_names_out()

        # With verbose names enabled, collect the un-prefixed names from each
        # fitted transformer instead.
        out_feats = []
        for v in self.named_transformers_.values():
            if isinstance(v, str):
                # 'drop' produces no output columns, so it must not add a name.
                # NOTE(review): other string entries (e.g. 'passthrough') keep the
                # original behaviour of adding the literal - confirm this is wanted.
                if v != "drop":
                    out_feats.append(v)
                continue

            out_feats.extend(v.get_feature_names_out())

        return out_feats

    def transform(self, df, returnDF=True):
        '''
        Transform ``df``; returns a DataFrame, or the raw result when returnDF is False.
        '''
        ret = super().transform(df)
        if returnDF:
            ret = pd.DataFrame(ret, columns=self.out_feature_names())

        return ret

    def inverse_transform(self, sdf, inplace=False):
        '''
        Invert every transformer whose output columns are all present in ``sdf``.

        sdf:      scaled DataFrame (columns may be a subset, in any order)
        inplace:  write the restored columns into ``sdf`` instead of a new frame

        returns:  DataFrame holding the restored input columns
        '''
        # Start from the rows of sdf so the result stays aligned with its index
        ret = sdf if inplace else pd.DataFrame(index=sdf.index)

        for v in self.named_transformers_.values():
            if not hasattr(v, "inverse_transform"):
                continue

            fo = v.get_feature_names_out()
            # Skip transformers whose outputs are not (all) part of sdf
            if not set(fo).issubset(sdf.columns):
                continue
            ret[v.feature_names_in_] = v.inverse_transform(sdf[fo])

        return ret

    def save(self, file="my", ext=".scaler.pkl"):
        '''
        Pickle this transformer to the file "<file><ext>".
        '''
        with open(f'{file}{ext}', 'wb') as fh:
            pickle.dump(self, fh)

    def load(self, file="my", ext=".scaler.pkl"):
        '''
        Return the transformer pickled in "<file><ext>"; ``self`` is left unchanged.
        '''
        # NOTE: unpickling can execute arbitrary code - only load files you trust
        with open(f'{file}{ext}', 'rb') as fh:
            return pickle.load(fh)

    @staticmethod
    def _one_hot_encoder():
        '''
        Build a dense OneHotEncoder with whichever keyword this scikit-learn accepts.
        '''
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            # The constructor rejected `sparse_output`; fall back to `sparse`
            return OneHotEncoder(sparse=False, handle_unknown="ignore")

    @staticmethod
    def scale_df(df, num_unique=5, numericScaler= StandardScaler, numerics = None, categorical = None):
        '''
        Scale a data frame.

        df:             DataFrame to fit and transform
        num_unique:     threshold used when the column kinds are auto-detected
        numericScaler:  scaler class instantiated once per numeric column
        numerics:       numeric column names (auto-detected when both lists are None)
        categorical:    categorical column names, one-hot encoded

        returns:        (fitted scaler, scaled DataFrame)
        '''
        # Detect numerics if both not given; if one is given - we assume other is not required
        if numerics is None and categorical is None:
            numerics, categorical = myColumnTransformer.find_cat_numerics_names(df, num_unique)

        numerics = [] if numerics is None else list(numerics)
        categorical = [] if categorical is None else list(categorical)

        transformers = [(n, numericScaler(), [n]) for n in numerics]
        if categorical:
            # Only register the encoder when it has columns to fit; an unfitted
            # encoder would make inverse_transform fail later on.
            transformers.append(
                ("categorical", myColumnTransformer._one_hot_encoder(), categorical)
            )

        scaler = myColumnTransformer(
            transformers=transformers,
            remainder='drop',
            verbose_feature_names_out=False,
        )

        x = scaler.fit_transform(df)
        return scaler, pd.DataFrame(x, columns=scaler.out_feature_names())

    @staticmethod
    def find_cat_numerics_names(df, num_unique=5):
        '''
        Finds numeric and categorical columns from DF.

        Numeric:      numeric dtype with more than ``num_unique`` distinct values.
        Categorical:  any column with at most ``num_unique`` distinct values.
        It returns empty lists if there are no columns matching the criteria.
        For ex: if there are no categorical variables, it returns an empty list.
        '''
        unique = df.nunique()
        numerics = [
            name
            for name in unique[unique > num_unique].index.to_list()
            if is_numeric_dtype(df[name])
        ]
        categori = unique[unique <= num_unique].index.to_list()

        return numerics, categori


def testMyColumnTransformer():
    '''
    Here is a test and how to use it.

    Reads the processminer data set, scales a few columns with
    myColumnTransformer and shows that transform / inverse_transform work
    regardless of column order.
    '''
    df = pd.read_csv("../data/processminer-rare-event-mts.csv.zip", sep=";")
    nums, cats = myColumnTransformer.find_cat_numerics_names(df, 5)

    # Let's take as an example a few numeric columns plus the categorical columns
    # for our test. Eliminate the first column because it is a time column.
    #
    df1 = df[nums[1:5]+cats]
    display(df1[0:4])
    # Sample of the raw data:
    #
    #         time       x1          x2         x3          x4          x5       y  x61
    #     --------------- -------     ---------   --------    --------    ---------   -  ---
    #     0  5/1/99 0:00 0.376665    -4.596435   -4.095756   13.497687   -0.118830   0   0
    #     1  5/1/99 0:02 0.475720    -4.542502   -4.018359   16.230659   -0.128733   0   0
    #     2  5/1/99 0:04 0.363848    -4.681394   -4.353147   14.127997   -0.138636   0   0
    #     3  5/1/99 0:06 0.301590    -4.758934   -4.023612   13.161566   -0.148142   0   0
    #     4  5/1/99 0:08 0.265578    -4.749928   -4.333150   15.267340   -0.155314   0   0

    scaler, df2 = myColumnTransformer.scale_df(df1)
    print("\nScaled Dataframe:")
    display(df2[0:4])


    # Now you can use scaler to scale other data -
    # it will do the job as long as there are columns in dataframe matching i/p columns
    # It correctly returns the columns in correct order as original order
    #
    # You can call transform with returnDF set to False to get raw numpy
    #
    print("\nOut of order columns still work correctly:")
    df3 = scaler.transform(df[df.columns[::-1]], True)
    display(df3[0:4])


    # It throws an error if an expected column is not in the dataframe
    #
    # The column list must be a list: indexing with the single string
    # "x1 x2 ..." would fail for the wrong reason (no column has that name).
    cols = "x1 x2 x3 x4 x7 y x61".split()
    try:
        df3 = pd.DataFrame(scaler.transform(df[cols]), columns= scaler.out_feature_names())
    except (KeyError, ValueError) as e:
        print (f"+++ EXCEPTION IS CORRECT: missing 'x5' {e} {type(e)}")

    # df3 still holds the frame from the out-of-order transform above
    print("\nInverse dataframe correctly inverts it:")
    idf = scaler.inverse_transform(df3)
    display(idf[0:4])

    print('''\nInverse dataframe correctly inverts it: 
    Does it do it if columns or out of order?
    => It DOES!!! See below
    ''')
    odf = df3[df3.columns[::-1]]
    display(odf[0:4])
    idf = scaler.inverse_transform(odf)
    display(idf[0:4])

#testMyColumnTransformer()

# Anomaly utils

In [None]:
#%%writefile "ts_transformer.py"

# DO NOT EDIT THIS FILE - GENERATED FROM 02_ts_utils.ipynb

import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle



# Test the utilities

In [None]:
# Test window function
# NOTE(review): df_scaled_trn is not defined in the cells shown here; presumably
# it comes from load_file() - confirm before running this cell.

# Use only the first four columns of the scaled training data
ds = tf.data.Dataset.from_tensor_slices(df_scaled_trn[df_scaled_trn.columns[:4]])
# window_len=3, output_length=2, label_slice=slice(1,3), batch_size=1, skip=1
wd = window(ds, 3, 2, slice(1,3), 1,1)
# Print the features and labels of the first three batches
for w in wd.take(3):
    print(f"{w[0].numpy()}\n=>:\n{w[1].numpy()} \n")

##  Advanced Windowing and Scaling

In [None]:
# Two candidate data sets; the second assignment wins, so only the
# processminer file is actually read below.
file = '../data/jena_climate_2009_2016.csv.zip'
file = '../data/processminer-rare-event-mts.csv.zip'
# This file is ';'-separated
df = pd.read_csv(file, sep=';')
df

In [None]:
# scale_df is a static method of myColumnTransformer, not a module-level function.
# The first two columns of df are left out before scaling.
scaler, dfo = myColumnTransformer.scale_df( df[df.columns[2:]] )
dfo

In [None]:
# Take the first three columns plus the last two (last one first) and
# invert the scaling on just that subset.
sdfo = dfo.iloc[:,[0,1,2,-1,-2]]
ret = scaler.inverse_transform(sdfo)
ret

In [None]:
# Pickle the fitted scaler using the "/tmp/test" file prefix
scaler.save("/tmp/test")

In [None]:
# Read the pickled scaler back into a separate variable
scaler1 = scaler.load("/tmp/test")

In [None]:
# Repeat the inverse transform with the scaler that was reloaded from disk
# (scaler1), so the save/load round trip is actually exercised.
sdfo = dfo.iloc[:,[0,1,2,-1,-2]]
ret = scaler1.inverse_transform(sdfo)
ret