In [1]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import logging
from msig import Motif, NullModel
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, VARIABLES, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS

print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")


if '__file__' in globals():
    # For standalone scripts
    base_dir = os.path.dirname(__file__)
else:
    # For Jupyter or interactive environments
    base_dir = os.getcwd()

# Add the parent directory of `utils` to the Python path
sys.path.append(os.path.abspath(os.path.join(base_dir, "../")))

2025-01-01 17:52:29,076 - INFO - Results will be saved in: /home/mgsilva/motifpred/results/populationdensity
2025-01-01 17:52:29,077 - INFO - Images will be saved in: /home/mgsilva/motifpred/images/populationdensity
2025-01-01 17:52:29,077 - INFO - Data will be accessed from: /home/mgsilva/motifpred/data/populationdensity


Results will be saved in: /home/mgsilva/motifpred/results/populationdensity
Images will be saved in: /home/mgsilva/motifpred/images/populationdensity
Data will be accessed from: /home/mgsilva/motifpred/data/populationdensity


In [2]:
# read csv
data = pd.read_csv(
    DATASET_PATH,
    parse_dates=["one_time"],
    date_format="%Y-%m-%d %H:%M:%S",
    index_col=0,
)

data = data[data["township_name"] == "Avenidas Novas"]
#set index to one_time and township_name
data = data.set_index(["one_time", "township_name"]).sort_index()
#filter by variable columns
data = data[VARIABLES]
data

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_terminals,sum_roaming_terminals,sum_phonecalls
one_time,township_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-09-15 00:00:00,Avenidas Novas,260700.0,17678.0,2262.0
2021-09-15 01:00:00,Avenidas Novas,276675.0,17533.0,994.0
2021-09-15 02:00:00,Avenidas Novas,284563.0,15280.0,725.0
2021-09-15 03:00:00,Avenidas Novas,279563.0,12424.0,393.0
2021-09-15 04:00:00,Avenidas Novas,281460.0,10789.0,290.0
...,...,...,...,...
2021-11-30 19:00:00,Avenidas Novas,391367.0,12886.0,17569.0
2021-11-30 20:00:00,Avenidas Novas,352361.0,11026.0,13738.0
2021-11-30 21:00:00,Avenidas Novas,388246.0,13957.0,16368.0
2021-11-30 22:00:00,Avenidas Novas,360169.0,13943.0,8635.0


In [3]:
# motif discovery
import stumpy
from stumpy import config

config.STUMPY_EXCL_ZONE_DENOM = STUMPY_EXCL_ZONE_DENOM  

for m in SUBSQUENCES_LENGTHS:
    X = data.to_numpy()
    print(X.shape)
    out = stumpy.mstump(X, m, normalize=NORMALIZE)
    np.save(
        RESULTS_MOTIF_DIR,
        out,
        allow_pickle=True,
    )

2025-01-01 17:32:39,732 - INFO - init


(1848, 13)


In [4]:
mp_stats_table = pd.read_csv(
    results_path
    + "/table_motifs_normalize={}_min_neighbors={}_max_distance={}_cutoff={}_max_matches={}_max_motifs={}_{}.csv".format(
        normalize,
        min_neighbors,
        max_distance,
        cutoff,
        max_matches,
        max_motifs,
        i[0],
    )
)
motif_lengths = mp_stats_table["m"].unique()
motif_stats_table = pd.DataFrame(
    columns=[
        "m",
        "#motifs",
        "avg_n_matches",
        "avg_n_features",
        "avg_probability",
        "avg_pvalue",
        "#sig_motifs(<0.01)",
        "significant",
        "#sig_hochberg",
    ]
)
for m in motif_lengths:
    table = mp_stats_table[mp_stats_table["m"] == m]
    if table.empty:
        continue
    n_motifs = table.shape[0]
    n_sig_motifs_0001 = table[table["p-value"] < 0.001].shape[0]
    n_sig_motifs_hochberg = table[table["Sig_Hochber"]].shape[0]
    avg_n_matches = (
        round(table["#Matches"].mean(), 2),
        round(table["#Matches"].std(), 3),
    )
    avg_n_features = round(table["k"].mean(), 2), round(table["k"].std(), 3)
    avg_probability = table["P"].mean(), table["P"].std()
    avg_pvalue = table["p-value"].mean(), table["p-value"].std()

    stats_df = {
        "m": m,
        "#motifs": n_motifs,
        "#sig_motifs(<0.001)": n_sig_motifs_0001,
        "significant": (n_sig_motifs_0001 * 100) / n_motifs,
        "avg_n_matches": avg_n_matches,
        "avg_n_features": avg_n_features,
    }

    motif_stats_table = (
        pd.DataFrame.from_records([stats_df])
        if motif_stats_table.empty
        else pd.concat(
            [motif_stats_table, pd.DataFrame.from_records([stats_df])],
            ignore_index=True,
        )
    )

print(motif_stats_table.to_latex(index=False, float_format="%.3f"))

NameError: name 'top_resids' is not defined