In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint

os.environ["KERAS_BACKEND"] = "torch"
import keras
keras.utils.set_random_seed(812)

In [None]:
# Authenticate with Weights & Biases; the next cell opens a run with wandb.init.
wandb.login()

In [None]:
# Open a W&B run in the "motifpred" project. The handle is kept in `run`
# because the tuning cell further down closes it with run.finish().
run = wandb.init(
    project = "motifpred"
)

In [None]:
#synthetic time series: n integers drawn uniformly from 1..5
n = 1000
data = np.random.randint(1, 6, n)

#motif that will be planted into the series, and its length
motif_pattern = [1,1,1,1,1]
p = len(motif_pattern)

#the gap between consecutive motif starts is drawn from [p+4, p+max_interval)
max_interval = n//20
print(max_interval)

#walk forward by random gaps and record a motif start after each gap; stop as
#soon as the next motif would no longer fit inside the series
motif_indexes = []
last_index = 0
index_interval = np.random.randint(p+4, p + max_interval)
while last_index + index_interval + p <= n:
    last_index += index_interval
    motif_indexes.append(last_index)
    index_interval = np.random.randint(p+4, p + max_interval)

motif_indexes

In [None]:
#overwrite every selected window with the motif pattern
for idx in motif_indexes:
    data[slice(idx, idx + p)] = motif_pattern

#plant a three-value clue at positions idx-5 .. idx-3, i.e. shortly before each motif
clue = [1,2,3]
for idx in motif_indexes:
    data[slice(idx - 5, idx - 2)] = clue

#draw the whole series, then overlay each motif occurrence in red
plt.plot(data)
for idx in motif_indexes:
    motif_span = range(idx, idx + p)
    plt.plot(motif_span, motif_pattern, 'r')

In [None]:
from keras.preprocessing.sequence import pad_sequences

def create_dataset(data, past_window, step, forward_window, motif_indexes):
    """Cut a series into sliding-window samples for next-motif prediction.

    A window of ``past_window`` values is taken at every offset that is a
    multiple of ``step``. Offsets with no motif start strictly after the
    window end are skipped.

    Returns four numpy arrays ``(X1, X2, y1, y)``:
      * X1 - the raw values of each window,
      * X2 - the motif start indexes at or before the window end, padded at
        the end with -1 to a common length,
      * y1 - the series value right after the window,
      * y  - the first entry of ``motif_indexes`` that lies after the window
        end, or -1 when it is more than ``forward_window`` beyond that end.
    """
    windows, seen_motifs, next_values, targets = [], [], [], []
    for offset in range(len(data) - past_window - 1):
        if offset % step != 0:
            continue

        window_end = offset + past_window
        upcoming = next((m for m in motif_indexes if m > window_end), None)
        if upcoming is None:
            continue  # no motif left after this window
        if upcoming > window_end + forward_window:
            upcoming = -1  # next motif falls outside the forward window

        windows.append(data[offset:window_end])
        seen_motifs.append([m for m in motif_indexes if m <= window_end])
        next_values.append(data[window_end])
        targets.append(upcoming)

    seen_motifs = pad_sequences(seen_motifs, padding='post', value=-1, dtype=int)
    return np.array(windows), np.array(seen_motifs), np.array(next_values), np.array(targets)

In [None]:
past_window = 100 #number of past samples in each input window
step = 1 #step size for the sliding window
forward_window = 50 #how far beyond the window end the next motif may start and still be the target (otherwise y is -1)
X1, X2, y1, y  = create_dataset(data, past_window=past_window, step=step, forward_window=forward_window, motif_indexes=motif_indexes)
#print the first sample of each returned array as a quick sanity check
print(X1[0], X2[0], y1[0], y[0])

In [None]:
#add a trailing feature axis so both inputs are [samples, time steps, 1]
X1 = X1.reshape(X1.shape[0], X1.shape[1], 1)
X2 = X2.reshape(X2.shape[0], X2.shape[1], 1)

In [None]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the samples into consecutive blocks.

    The samples are divided into ``n_splits`` contiguous, non-overlapping
    blocks of ``len(X) // n_splits`` samples each (any remainder at the end is
    left unused). Within a block the leading ``train_fraction`` of the samples
    is the training set and the rest of the block is the test set, so training
    indices always precede the test indices of the same fold.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds).
    train_fraction : float, default 0.8
        Share of each block used for training.
    margin : int, default 0
        Number of samples skipped between the training and test part of a
        block.
    """

    def __init__(self, n_splits, train_fraction=0.8, margin=0):
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds.

        The arguments are ignored; they are optional so the method can be
        called the same way as ``split`` and as scikit-learn splitters.
        """
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` index arrays, one pair per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # boundary between the training and the test part of this block
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [None]:
#train lstm
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import LSTM, Dense, Masking
from keras import Input

def create_model(hp):
    """Build and compile the tunable LSTM regressor for the padded motif-index input.

    The input shape is taken from the notebook-level ``X2`` array. Timesteps
    equal to -1 (the padding value) are masked. ``hp`` supplies the number of
    LSTM units (10 to 50 in steps of 10) and the Adam learning rate
    (1e-2 or 1e-3). The model is compiled with an MSE loss.
    """
    units = hp.Int('units', min_value=10, max_value=50, step=10)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])

    model = Sequential([
        Input(shape=(X2.shape[1], X2.shape[2])),
        Masking(mask_value=-1),
        LSTM(units=units, activation='relu', return_sequences=False),
        Dense(1),
    ])
    model.compile(
        loss='mse',
        optimizer=Adam(learning_rate=hp_learning_rate),
        metrics=['mae', 'root_mean_squared_error', 'r2_score'],
    )
    return model

In [None]:
import collections
import keras_tuner as kt
from keras.callbacks import EarlyStopping
import pickle


class CVTuner(kt.engine.tuner.Tuner):
    """Tuner whose trial score is the mean over blocked time-series CV folds."""

    def run_trial(self, trial, x, y, *args, **kwargs):
        """Fit a fresh model on each of 5 blocked folds and average the scores.

        ``callbacks`` and ``verbose`` are taken from ``kwargs`` and forwarded
        to ``fit``. Batch size and epoch count are registered as trial
        hyperparameters. The model fitted on the last fold is the one saved
        for the trial. Returns a dict mapping each metric name to its mean
        over the folds.
        """
        fit_callbacks = kwargs.pop("callbacks", [])
        fit_verbosity = kwargs.pop("verbose", 0)

        hp = trial.hyperparameters
        fold_scores = collections.defaultdict(list)
        batch_size = hp.Int('batch_size', 8, 32, step=8)
        epochs = hp.Int('epochs', 400, 400, step=100)
        splitter = BlockingTimeSeriesSplit(n_splits=5)
        for fit_idx, eval_idx in splitter.split(x):
            model = self.hypermodel.build(hp)
            model.fit(
                x[fit_idx],
                y[fit_idx],
                batch_size=batch_size,
                epochs=epochs,
                callbacks=fit_callbacks,
                verbose=fit_verbosity,
            )
            val_loss, val_mae, val_rmse, val_r2 = model.evaluate(x[eval_idx], y[eval_idx])
            for key, value in (
                ("val_loss", val_loss),
                ("val_mae", val_mae),
                ("val_rmse", val_rmse),
                ("val_r2", val_r2),
            ):
                fold_scores[key].append(value)
            print(f"val_loss: {val_loss}, val_mae: {val_mae}, val_rmse: {val_rmse}, val_r2: {val_r2}")

        self.save_model(trial, model)
        return {name: np.mean(values) for name, values in fold_scores.items()}

    def save_model(self, trial, model):
        """Pickle ``model`` to ``model.pickle`` inside the trial's directory."""
        trial_dir = self.get_trial_dir(trial.trial_id)
        target = os.path.join(trial_dir, "model.pickle")
        with open(target, "wb") as handle:
            pickle.dump(model, handle)

    def load_model(self, trial):
        """Unpickle and return the model stored by ``save_model`` for ``trial``.

        Note: unpickling runs code contained in the file, so this should only
        be pointed at trial directories written by this tuner.
        """
        trial_dir = self.get_trial_dir(trial.trial_id)
        source = os.path.join(trial_dir, "model.pickle")
        with open(source, "rb") as handle:
            return pickle.load(handle)

# Grid search over the hyperparameters registered by create_model and
# CVTuner.run_trial; trials are ranked by the fold-averaged 'val_loss' that
# run_trial returns.
# NOTE(review): max_trials=None presumably means "no cap, exhaust the grid" - confirm against keras-tuner.
tuner = CVTuner(
  hypermodel=create_model,
  oracle=kt.oracles.GridSearchOracle(
    objective='val_loss',
    max_trials=None))

# x, y, verbose and callbacks are handed on to CVTuner.run_trial for every trial.
tuner.search(
        x=X2,
        y=y,
        verbose=2,
        callbacks=[WandbMetricsLogger(log_freq=5)],
    )
# Close the W&B run opened by wandb.init; only reached when the search finishes without raising.
run.finish()

In [None]:
# Print the tuner's summary of the completed trials.
tuner.results_summary()

In [None]:
# Load the model stored for the best trial. CVTuner.save_model pickles the
# model fitted on the last CV fold of each trial, so that is what is returned.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

In [None]:
#re-score the best model on the test part of every blocked fold
cv = BlockingTimeSeriesSplit(5)
metric_names = ("val_loss", "val_mae", "val_rmse", "val_r2")
scores = {name: [] for name in metric_names}
observed = []
predictions = []
for train_indices, test_indices in cv.split(X2):
    fold_x, fold_y = X2[test_indices], y[test_indices]
    val_loss, val_mae, val_rmse, val_r2 = best_model.evaluate(fold_x, fold_y)
    print(val_loss)
    observed.extend(fold_y)
    predictions.extend(best_model.predict(fold_x))
    for name, value in zip(metric_names, (val_loss, val_mae, val_rmse, val_r2)):
        scores[name].append(value)

#boxplot of the per-fold MAE
import seaborn as sns
import pandas as pd
sns.boxplot(data=pd.DataFrame(scores["val_mae"]))



In [None]:
def create_model_embeddinglstm(hp):
    """Build and compile a 50-unit tanh LSTM followed by a single-output Dense layer.

    Only the Adam learning rate (0.01 or 0.001) is taken from ``hp``. The
    model is compiled with an MSE loss and reports MAE.
    """
    hp_learning_rate = hp.Choice('learning_rate', values=[0.01,0.001])
    model = Sequential([
        LSTM(units=50, activation='tanh', return_sequences=False),
        Dense(1),
    ])
    model.compile(
        loss='mse',
        optimizer=Adam(learning_rate=hp_learning_rate),
        metrics=['mae'],
    )
    return model

import keras_tuner as kt
from keras.callbacks import EarlyStopping

# Random search over create_model_embeddinglstm: 5 trials, 3 executions each.
# Note that this rebinds `tuner`, replacing the CVTuner instance created in
# an earlier cell. The objective 'mae' is the training metric: no validation
# data is passed to search() below.
tuner= kt.RandomSearch(
        create_model_embeddinglstm,
        objective='mae',
        max_trials=5,
        executions_per_trial=3,
        project_name = 'embeddinglstm'
        )

# Fit on the full raw-window dataset: X1 as input, y1 (the value following each window) as target.
tuner.search(
        x=X1,
        y=y1,
        epochs=10,
        batch_size=64
        )


In [None]:
# Rebinds best_model: from here on it is the best model of the RandomSearch
# tuner (built by create_model_embeddinglstm), no longer the CVTuner model.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

In [None]:
#evaluate the model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#X1_train / y1_train are not defined by any cell of this notebook, so the cell
#failed on a fresh kernel. The tuner was fitted on the complete X1 / y1 arrays
#(tuner.search(x=X1, y=y1)), so those arrays are the training data scored here.
X1_train, y1_train = X1, y1

#in-sample error of the selected model
y1_train_pred = best_model.predict(X1_train)
mse = mean_squared_error(y1_train, y1_train_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y1_train, y1_train_pred)
r2 = r2_score(y1_train, y1_train_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

plt.plot(y1_train, label='True')
plt.plot(y1_train_pred, label='Predicted')
#the label= arguments are only displayed once a legend is drawn
plt.legend();



In [None]:
#take the first weight array of layer 1 of the best model
#NOTE(review): create_model_embeddinglstm stacks LSTM then Dense(1) and has no
#embedding layer, so layers[1] appears to be the Dense head and
#get_weights()[0] its kernel, not a per-sample embedding - confirm this is intended
embeddings = best_model.layers[1].get_weights()[0]
#flatten the weights to a 1d array
embeddings = embeddings.flatten()
embeddings

In [None]:
#join the embeddings with the motif indexes

def _with_embeddings(x2_samples):
    """Return a 2-D array whose rows are `embeddings` followed by one flattened X2 sample."""
    return np.array([np.concatenate((embeddings, sample.flatten())) for sample in x2_samples])

#X2_train / X2_test / y_train / y_test are not defined by any cell of this
#notebook, so the cell failed on a fresh kernel. Derive them here with a
#chronological 80/20 split: the samples are ordered in time, so no shuffling.
#NOTE(review): the 80/20 ratio mirrors BlockingTimeSeriesSplit - adjust if a
#different hold-out was intended.
split_at = int(0.8 * len(X2))
X2_train, X2_test = X2[:split_at], X2[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

X_train = _with_embeddings(X2_train)
X_test = _with_embeddings(X2_test)
X_train.shape


In [None]:
from xgboost import XGBRegressor

#fit an XGBoost regressor with default hyperparameters on the joined features;
#relies on X_train and y_train left in the kernel by earlier cells
model = XGBRegressor()
model.fit(X_train, y_train)

In [None]:
#evaluate the model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

#in-sample error: score the regressor on the data it was fitted on
y_train_pred = model.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)  #reported as well, to match the other evaluation cells
mae = mean_absolute_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

plt.plot(y_train, label='True')
plt.plot(y_train_pred, label='Predicted')
#the label= arguments are only displayed once a legend is drawn
plt.legend();

In [None]:
#error of the regressor on X_test / y_test
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

plt.plot(y_test, label='True')
plt.plot(y_pred, label='Predicted')
#the label= arguments are only displayed once a legend is drawn
plt.legend();


In [None]:
#join the embeddings with the motif indexes and the motif pattern

def _with_embeddings_and_motif(x2_samples):
    """Return a 2-D array whose rows are `embeddings`, one flattened X2 sample, then `motif_pattern`."""
    return np.array([
        np.concatenate((embeddings, sample.flatten(), motif_pattern))
        for sample in x2_samples
    ])

#relies on X2_train and X2_test left in the kernel by earlier cells
X_train = _with_embeddings_and_motif(X2_train)
X_test = _with_embeddings_and_motif(X2_test)
X_train.shape
