In [None]:
import numpy as np
import pandas as pd
import os
import csv
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import logging
from msig import Motif, NullModel


# Relative folders used throughout the notebook for this dataset:
# where results are written, where images go, and where input data is read from.
results_dir, images_dir, data_dir = (
    '../results/populationdensity/',
    '../images/populationdensity/',
    '../data/populationdensity/',
)

In [None]:
# Load the hourly TAZ records. `one_time` is parsed as a timestamp using the
# given format, and the first CSV column becomes the index.
hourly_taz_data = pd.read_csv(
    os.path.join(data_dir, "hourly_taz.csv"),
    parse_dates=["one_time"],
    date_format="%Y-%m-%d %H:%M:%S",
    index_col=0,
)

# taz_id -> taz_name lookup, taking the name on the first row of each taz_id.
# Built in a single pass (instead of re-filtering the whole frame once per id)
# and before the exclusions below, so excluded zones can still be resolved.
taz_id_name = (
    hourly_taz_data.drop_duplicates(subset="taz_id")
    .set_index("taz_id")["taz_name"]
    .to_dict()
)

# Drop the zones that are excluded from the rest of the analysis.
excluded_taz_names = ["Beato (Picheleira)", "Alcântara (Ribeirinha - Belém)"]
hourly_taz_data = hourly_taz_data[
    ~hourly_taz_data["taz_name"].isin(excluded_taz_names)
]
hourly_taz_data

In [None]:
# Make sure the matrix-profile output folder (and its parents) exists.
# exist_ok=True also covers the case where results_dir is already present but
# its "mp" sub-folder is not, which the previous existence check skipped.
os.makedirs(os.path.join(results_dir, "mp"), exist_ok=True)

In [None]:
# Motif discovery: for every subsequence length and every (id, feature) group,
# run stumpy.stump on the group's "residuals" values and save the returned
# array as a .npy file under results_dir + "/mp".
import stumpy
from stumpy import config

# Overrides a stumpy configuration value before stump is called below.
config.STUMPY_EXCL_ZONE_DENOM = 2  # r = np.ceil(m/2)
top_k_mp = 1  # passed to stumpy.stump as k and embedded in the output file name
include = None  # not used anywhere in this cell
normalize = True  # passed to stumpy.stump and embedded in the output file name
subsequence_lengths = [6]  # subsequence lengths m to run

data = hourly_taz_data


# Take the "id" value from the first row of stats_table.
# NOTE(review): stats_table is not defined in any cell shown above, so this line
# depends on state already present in the kernel — confirm where it comes from.
# top_taz_ids is not used again in this cell.
top_taz_ids = stats_table["id"].head(1).values
# Empty placeholder frame; nothing in this cell adds rows or columns to it.
top_resids = pd.DataFrame()


for m in subsequence_lengths:
    # NOTE(review): requires `data` to have "id", "feature" and "residuals"
    # columns. The loading cell only references "taz_id", "taz_name" and
    # "one_time" — confirm these columns are added elsewhere.
    for i, df in data.groupby(by=["id", "feature"]):
        # i is the (id, feature) group key; X holds that group's residual values.
        X = np.squeeze(df["residuals"].values)
        out = stumpy.stump(X, m, normalize=normalize, k=top_k_mp)
        # One file per (m, id, feature); the target folder must already exist.
        np.save(
            results_dir
            + "/mp/hourly_taz_normalize={}_topkmp={}_m={}_{}_{}.npy".format(
                normalize, top_k_mp, m, i[0], i[1]
            ),
            out,
            allow_pickle=True,
        )

In [None]:
# For each (id, feature) group, load the table of discovered motifs and build a
# summary with one row per motif length m:
#   - number of motifs,
#   - number and percentage of motifs with p-value < 0.001,
#   - number of motifs flagged in the "Sig_Hochber" column,
#   - (mean, std) of #Matches, k (features), P (probability) and p-value.
# The summary is printed as a LaTeX table.
#
# NOTE(review): results_path, min_neighbors, max_distance, cutoff, max_matches
# and max_motifs are not defined in the cells shown above; they must already
# exist in the kernel — confirm where they are set.
# NOTE(review): top_resids must provide "id" and "feature" columns for the
# groupby below — confirm it is populated before this cell runs.

# Single source of truth for the summary columns, so an empty summary has the
# same header as a populated one.
summary_columns = [
    "m",
    "#motifs",
    "#sig_motifs(<0.001)",
    "significant",
    "avg_n_matches",
    "avg_n_features",
    "avg_probability",
    "avg_pvalue",
    "#sig_hochberg",
]

for i, df in top_resids.groupby(by=["id", "feature"]):
    print(i[0], taz_id_name[i[0]])
    mp_stats_table = pd.read_csv(
        results_path
        + "/table_motifs_normalize={}_min_neighbors={}_max_distance={}_cutoff={}_max_matches={}_max_motifs={}_{}.csv".format(
            normalize,
            min_neighbors,
            max_distance,
            cutoff,
            max_matches,
            max_motifs,
            i[0],
        )
    )
    motif_lengths = mp_stats_table["m"].unique()

    # Collect one record per motif length and build the frame once at the end,
    # instead of concatenating a one-row frame on every iteration.
    summary_rows = []
    for m in motif_lengths:
        table = mp_stats_table[mp_stats_table["m"] == m]
        if table.empty:
            continue
        n_motifs = table.shape[0]
        n_sig_motifs_0001 = table[table["p-value"] < 0.001].shape[0]
        n_sig_motifs_hochberg = table[table["Sig_Hochber"]].shape[0]
        avg_n_matches = (
            round(table["#Matches"].mean(), 2),
            round(table["#Matches"].std(), 3),
        )
        avg_n_features = round(table["k"].mean(), 2), round(table["k"].std(), 3)
        avg_probability = table["P"].mean(), table["P"].std()
        avg_pvalue = table["p-value"].mean(), table["p-value"].std()

        summary_rows.append(
            {
                "m": m,
                "#motifs": n_motifs,
                "#sig_motifs(<0.001)": n_sig_motifs_0001,
                # percentage of motifs with p-value < 0.001
                "significant": (n_sig_motifs_0001 * 100) / n_motifs,
                "avg_n_matches": avg_n_matches,
                "avg_n_features": avg_n_features,
                # These three were computed but never written to the table.
                "avg_probability": avg_probability,
                "avg_pvalue": avg_pvalue,
                "#sig_hochberg": n_sig_motifs_hochberg,
            }
        )

    motif_stats_table = pd.DataFrame(summary_rows, columns=summary_columns)

    print(motif_stats_table.to_latex(index=False, float_format="%.3f"))