In [1]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import logging
from msig import Motif, NullModel
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, VARIABLES, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS

print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")


if '__file__' in globals():
    # For standalone scripts
    base_dir = os.path.dirname(__file__)
else:
    # For Jupyter or interactive environments
    base_dir = os.getcwd()

# Add the parent directory of `utils` to the Python path
sys.path.append(os.path.abspath(os.path.join(base_dir, "../")))

2025-01-29 14:50:50,472 - INFO - Results will be saved in: /home/mgsilva/motifpred/results/household
2025-01-29 14:50:50,472 - INFO - Images will be saved in: /home/mgsilva/motifpred/images/household
2025-01-29 14:50:50,472 - INFO - Data will be accessed from: /home/mgsilva/motifpred/data/household


Results will be saved in: /home/mgsilva/motifpred/results/household
Images will be saved in: /home/mgsilva/motifpred/images/household
Data will be accessed from: /home/mgsilva/motifpred/data/household


In [2]:
data_df = pd.read_csv(DATASET_PATH, index_col=0).astype(float)
data_df = data_df[VARIABLES]
labels = pd.read_csv(DATA_DIR  / f"labels.csv", index_col=0).astype(float)
data = data_df.values.T
data.shape

(2, 77760)

In [3]:
# motif discovery
import stumpy
from stumpy import config

config.STUMPY_EXCL_ZONE_DENOM = STUMPY_EXCL_ZONE_DENOM  

for m in SUBSQUENCES_LENGTHS:
    mp, mp_indices = stumpy.mstump(data, m, normalize=NORMALIZE)
    np.save(
        RESULTS_MOTIF_DIR / f"normalized_{NORMALIZE}_top_{TOP_K_MP}_m_{m}_mp.npy",
        mp,
        allow_pickle=True,
    )
    np.save(
        RESULTS_MOTIF_DIR / f"normalized_{NORMALIZE}_top_{TOP_K_MP}_m_{m}_mp_indices.npy",
        mp_indices,
        allow_pickle=True,
    )

2025-01-29 14:50:50,675 - INFO - init


In [4]:
def table_summary_motifs(
    motif_indices,
    motif_distances,
    motif_subspaces,
    data,
    k_distances,
    m,
    normalize,
    max_allowed_dist,
):
    mp_stats_table = pd.DataFrame(
        columns=[
            "ID",
            "k_distances",
            "Features",
            "m",
            "#Matches",
            "Indices",
            "max(dists)",
            "min(dists)",
            "med(dists)",
        ]
    )

    motif_index = 0

    n_vars, n_time = data.shape

    if normalize:
        data = (data - np.mean(data, axis=1)[:, np.newaxis]) / np.std(data, axis=1)[
            :, np.newaxis
        ]


    for motif_indice, match_indices in enumerate(motif_indices):
        dimensions = motif_subspaces[motif_indice]

        # remove filling values of -1 and Nans from motif_indices and match_distances
        match_indices = match_indices[match_indices != -1]
        match_distances = motif_distances[motif_indice]
        match_distances = match_distances[~np.isnan(match_distances)]

        # if is empty, skip
        if len(match_indices) == 0:
            continue

        excl_zone = np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)

        # remove trivial matches
        non_trivial_matches = []
        for indice in match_indices:
            trivial = False
            for indice_new in non_trivial_matches:
                if abs(indice - indice_new) <= excl_zone:
                    trivial = True
                    break
            if not trivial:
                non_trivial_matches.append(indice)
        match_indices = non_trivial_matches

        max_possible_matches = int(np.floor((n_time - m) / excl_zone + 1))


        max_dist = np.max(match_distances)
        min_dist = np.min(match_distances[1:])

        if k_distances is None:  # consider all matches
            med_dist = np.median(match_distances[1:])
        else:  # consider only the k closest matches
            med_dist = np.median(match_distances[1 : k_distances + 1])

        
        # data features are now the ones in the dimensions
        used_features = [f"{dimension}" for dimension in dimensions]

        stats_df = {
            "ID": str(motif_index),
            "k": len(dimensions),
            "Features": ",".join(used_features),
            "m": m,
            "#Matches": len(match_indices) - 1,
            "Indices": match_indices,
            "max(dists)": np.around(max_dist, 3),
            "min(dists)": np.around(min_dist, 3),
            "med(dists)": np.around(med_dist, 3),
        }

        mp_stats_table = (
            pd.DataFrame.from_records([stats_df])
            if mp_stats_table.empty
            else pd.concat(
                [mp_stats_table, pd.DataFrame.from_records([stats_df])],
                ignore_index=True,
            )
        )

        motif_index += 1
    return mp_stats_table

In [5]:
k_distances = None
min_neighbors = 2
cutoff = np.inf
max_matches = 99999
max_distance = None
max_motifs = 99999
k = 1

# Initialize mp_stats_table outside the loop to accumulate results
mp_stats_table = pd.DataFrame()

for m in SUBSQUENCES_LENGTHS:
    mp = np.load(
        RESULTS_MOTIF_DIR / f"normalized_{NORMALIZE}_top_{TOP_K_MP}_m_{m}_mp.npy",
        allow_pickle=True,
    )
    indices = np.load(
        RESULTS_MOTIF_DIR / f"normalized_{NORMALIZE}_top_{TOP_K_MP}_m_{m}_mp_indices.npy",
        allow_pickle=True,
    )

    motif_distances, motif_indices, motif_subspaces, motif_mdls = stumpy.mmotifs(
        data,
        mp,
        indices,
        min_neighbors=min_neighbors,
        max_distance=max_distance,
        cutoffs=cutoff,
        max_matches=max_matches,
        max_motifs=max_motifs,
        k=k,
        include=INCLUDE,
        normalize=NORMALIZE,
    )
    if len(motif_indices[0]) == 0:
        continue

    # Create the table for the current iteration
    table = table_summary_motifs(
        motif_indices,
        motif_distances,
        motif_subspaces,
        data,
        k_distances,
        m,
        NORMALIZE,
        max_distance,
    )
    # Append the current table to mp_stats_table
    mp_stats_table = pd.concat([mp_stats_table, table], ignore_index=True)

# Save the accumulated mp_stats_table to CSV
mp_stats_table.to_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv",
    index=False,
)

In [6]:
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)

motif_lengths = mp_stats_table["m"].unique()
motif_stats_table = pd.DataFrame(
    columns=[
        "m",
        "#motifs",
        "avg_n_matches",
        "avg_n_features"
    ]
)
for m in motif_lengths:
    table = mp_stats_table[mp_stats_table["m"] == m]
    if table.empty:
        continue
    n_motifs = table.shape[0]

    avg_n_matches = (
        round(table["#Matches"].mean(), 2),
        round(table["#Matches"].std(), 3),
    )

    avg_n_features = (
        round(table["k"].mean(), 2),
        round(table["k"].std(), 3),
    )

    stats_df = {
        "m": m,
        "#motifs": n_motifs,
        "avg_n_matches": avg_n_matches,
        "avg_n_features": avg_n_features,
    }

    motif_stats_table = (
        pd.DataFrame.from_records([stats_df])
        if motif_stats_table.empty
        else pd.concat(
            [motif_stats_table, pd.DataFrame.from_records([stats_df])],
            ignore_index=True,
        )
    )

print(motif_stats_table.to_latex(index=False, float_format="%.3f"))

\begin{tabular}{rrll}
\toprule
m & #motifs & avg_n_matches & avg_n_features \\
\midrule
60 & 21 & (458.71, 213.2) & (2.0, 0.0) \\
180 & 14 & (163.5, 96.393) & (2.0, 0.0) \\
360 & 13 & (80.92, 36.466) & (2.0, 0.0) \\
\bottomrule
\end{tabular}



In [7]:
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)

subsequence_lengths = mp_stats_table["m"].unique()
for m in subsequence_lengths:
    print("########## m:{} #########".format(m))
    top_motifs = mp_stats_table[mp_stats_table["m"] == m]
    top_motifs = top_motifs.sort_values(by="#Matches", ascending=False).head(5)
    top_motifs = top_motifs[
        [
            "ID",
            "#Matches",
            "k",
            "Features",
            "max(dists)",
            "min(dists)",
            "med(dists)",
        ]
    ]
    print(top_motifs.to_latex(index=False, float_format="%.3f"))
    print("\n")

########## m:60 #########
\begin{tabular}{rrrlrrr}
\toprule
ID & #Matches & k & Features & max(dists) & min(dists) & med(dists) \\
\midrule
5 & 836 & 2 & 1,0 & 7.966 & 1.144 & 5.823 \\
20 & 758 & 2 & 1,0 & 8.785 & 6.080 & 8.252 \\
7 & 726 & 2 & 0,1 & 7.731 & 1.371 & 5.003 \\
6 & 720 & 2 & 0,1 & 6.928 & 1.270 & 4.671 \\
10 & 706 & 2 & 0,1 & 6.582 & 1.695 & 4.466 \\
\bottomrule
\end{tabular}



########## m:180 #########
\begin{tabular}{rrrlrrr}
\toprule
ID & #Matches & k & Features & max(dists) & min(dists) & med(dists) \\
\midrule
8 & 349 & 2 & 0,1 & 15.648 & 5.152 & 13.890 \\
10 & 333 & 2 & 0,1 & 15.999 & 7.742 & 14.747 \\
1 & 235 & 2 & 0,1 & 13.549 & 2.670 & 10.097 \\
9 & 214 & 2 & 0,1 & 14.149 & 7.112 & 13.212 \\
4 & 193 & 2 & 0,1 & 12.727 & 3.445 & 9.257 \\
\bottomrule
\end{tabular}



########## m:360 #########
\begin{tabular}{rrrlrrr}
\toprule
ID & #Matches & k & Features & max(dists) & min(dists) & med(dists) \\
\midrule
7 & 160 & 2 & 0,1 & 23.008 & 9.699 & 19.923 \\
11 & 125 & 

In [8]:
def plot_motif(ts_list, features, m, motif_indexes, motif_name):
    # Generate the time range starting at 2008-09-01 00:00:00 with 77760 minutes
    start_time = pd.Timestamp("2008-09-01 00:00:00")
    time_range = pd.date_range(start=start_time, periods=77760, freq="min")

    # Define 5 equally spaced indices for xticks on the right plots
    xtick_indices = np.linspace(0, len(time_range) - 1, 5, dtype=int)
    xtick_labels = [
        time_range[idx].strftime("%d,%b \n %H:%M:%S") for idx in xtick_indices
    ]

    fig, axes = plt.subplots(
        ncols=2, nrows=len(ts_list), figsize=(10, 2 * len(ts_list)), squeeze=False
    )

    for i in range(len(ts_list)):
        ts = ts_list[i]
        # plot light grey on the right side
        axes[i, 1].plot(ts, color="black", linewidth=0.5, alpha=0.5)

        # Set unique colors for motifs
        colors = plt.cm.tab20(np.linspace(0, 1, len(motif_indexes)))
        axes[i, 0].set_prop_cycle("color", colors)
        axes[i, 1].set_prop_cycle("color", colors)

        # Plot motifs on the left and highlight on the right
        for index in motif_indexes:
            subsequence_match = ts.iloc[index : index + m]
            axes[i, 0].plot(subsequence_match.values)  # Ensure left plot is drawn
            axes[i, 1].plot(subsequence_match, linewidth=2)

        # Set y-axis labels for left plot
        # Split long y-axis labels (e.g., those with 3 words) into multiple lines
        words = features[i].split()
        if len(words) == 3:
            wrapped_label = (
                f"{' '.join(words[:-1])}\n{words[-1]}"  # Add \n before the last word
            )
        else:
            wrapped_label = features[i]  # No line break for 2 words or less

        # Set y-axis labels for left plot
        axes[i, 0].set_ylabel(wrapped_label, rotation=90, size="large")
        xticks = [0] + list(range(max(1, m // 5), m - 1, max(1, m // 5))) + [m - 1]
        xticklabels = ["i"] + [f"i+{t}" for t in xticks[1:-1]] + [f"i+{m-1}"]

        axes[i, 0].set_xticks(xticks)
        axes[i, 0].set_xticklabels(xticklabels)
        plt.setp(axes[i, 0].xaxis.get_majorticklabels(), rotation=90)

        # Add custom xticks and labels to the right plots
        axes[i, 1].set_xticks(xtick_indices)
        axes[i, 1].set_xticklabels(xtick_labels, rotation=45)

        # Only display x-axis on the last row of plots
        if i != len(ts_list) - 1:
            axes[i, 0].axes.get_xaxis().set_visible(False)
            axes[i, 1].axes.get_xaxis().set_visible(False)

    # Set titles for the two columns
    axes[0, 0].set_title("Raw Subsequences")
    axes[0, 1].set_title("Motif in TS")

    plt.tight_layout()
    plt.savefig(
        images_dir + "/m=" + str(m) + "_motif_" + str(motif_name) + ".pdf",
        bbox_inches="tight",
    )
    plt.show()

    return None


In [9]:
# Load motif statistics table
file_path = os.path.join(
    results_dir,
    f"table_motifs_normalize={normalize}_min_neighbors={min_neighbors}_max_distance={max_distance}_cutoff={cutoff}_max_matches={max_matches}_max_motifs={max_motifs}.csv",
)
mp_stats_table = pd.read_csv(file_path)

# Extract unique subsequence lengths
subsequence_lengths = mp_stats_table["m"].unique()

ts = data

# Loop over each subsequence length
for m in subsequence_lengths:
    logging.info(f"Motif length: {m}")

    # Filter motifs by current subsequence length
    top_motifs = mp_stats_table[mp_stats_table["m"] == m]
    top_motifs = top_motifs.sort_values(by="Score Unified", ascending=False).head(5)

    # Loop over each top motif
    for top_motif in top_motifs.to_dict(orient="records"):
        print(top_motif)

        # Parse dimensions and indices
        dimensions = sorted(map(int, top_motif["Features"].split(",")))
        indices = sorted(map(int, top_motif["Indices"].strip("[]").split(",")))

        # Extract feature names
        features = [df_data.columns[dimension] for dimension in dimensions]

        # Add label feature to the list
        ts_list = [df_data[feature].reset_index(drop=True) for feature in features]

        # Generate motif name
        motif_name = top_motif["ID"]

        # Create the figure and axes once, with 3 subplots for the submeters
        fig, ax = plt.subplots(figsize=(10, 5), nrows=3, sharex=True)

        # Loop over each index and plot the subsequences
        for indice in indices:
            subseq = labels.iloc[indice: indice + m]

            # Plot subsequences for each submeter in the same 3 subplots
            for i, feature in enumerate(labels.columns):
                ax[i].plot(subseq[feature].values, alpha=0.7)  # Plot with transparency to differentiate subsequences
                ax[i].set_ylabel(feature.replace("_", " "))  # Format the y-axis label for better readability

        # Set common x-axis labels and formatting
        ax[-1].set_xlabel("Time Index")  # Only set x-axis label on the bottom plot

        # Adjust layout to prevent overlap
        plt.tight_layout()

        # Show the plot (or save it, depending on your requirements)
        plt.show()

        features = [feature.replace("_", " ") for feature in features]

        # Plot and save the motif
        plot_motif(ts_list, features, m, indices, motif_name)

NameError: name 'results_dir' is not defined