Import data

In [None]:
import numpy as np
import pandas as pd

from data_provider.m4 import M4Dataset

seasonal_patterns = 'Monthly'
m4 = M4Dataset.load(training=True, dataset_file='./dataset/m4')
training_values = np.array(
    [v[~np.isnan(v)] for v in m4.values[m4.groups == seasonal_patterns]])  # split different frequencies
ids = np.array([i for i in m4.ids[m4.groups == seasonal_patterns]])
data = [ts for ts in training_values]
flat_data = np.concatenate(data)

print('Dataset size: ', len(data))
print('Flat Dataset size: ', len(flat_data))


def create_bin_count_table(array, bin_ranges):
    # Initialize bins
    bins = [-np.inf] + bin_ranges + [np.inf]

    # Calculate histogram bin counts
    bin_counts, _ = np.histogram(array, bins=bins)

    # Create a DataFrame for the table
    bin_labels = [f"Bin {i + 1}" for i in range(len(bin_counts))]
    histogram_df = pd.DataFrame({
        'Bin': bin_labels,
        'Range': [f"<= {bin_ranges[0]}" if i == 0 else
                  f">= {bin_ranges[-1]}" if i == len(bin_counts) - 1 else
                  f"{bin_ranges[i - 1]} < n <= {bin_ranges[i]}" for i in range(len(bin_counts))],
        'Count': bin_counts
    })

    return histogram_df


Manual Discretization

In [15]:

# Calculate histogram bin counts without plotting
bin_counts, bin_edges = np.histogram(flat_data, bins=50)

# Create a table of bin counts
histogram_table = np.column_stack((bin_edges[:-1], bin_edges[1:], bin_counts))

# Convert the table to a more readable format
histogram_df = pd.DataFrame(histogram_table, columns=['Bin Start', 'Bin End', 'Count'])

ranges = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000,
          2500, 3000, 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, 16000, 18000, 20000, 30000]

histogram_df = create_bin_count_table(flat_data, ranges)
histogram_df


# discretizer = Discretization(ranges)
# print('Descretizing data')
# discretized_data = [[discretizer.discretize(value) for value in element] for element in data]


Unnamed: 0,Bin,Range,Count
0,Bin 1,<= 50,180
1,Bin 2,50 < n <= 100,2281
2,Bin 3,100 < n <= 150,6018
3,Bin 4,150 < n <= 200,10214
4,Bin 5,200 < n <= 250,15506
5,Bin 6,250 < n <= 300,17748
6,Bin 7,300 < n <= 350,21369
7,Bin 8,350 < n <= 400,23509
8,Bin 9,400 < n <= 450,27758
9,Bin 10,450 < n <= 500,30186


In [16]:
import numpy as np
from scipy.stats import norm
from pyts.approximation import SymbolicAggregateApproximation

# SAX transformation
n_bins = 25
sax = SymbolicAggregateApproximation(n_bins=n_bins, strategy='normal')
X_sax = sax.fit_transform(np.array(flat_data).reshape(1, -1))

# Compute gaussian bins
bins = norm.ppf(np.linspace(0, 1, n_bins + 1)[1:-1])

# Show the results for the first time series
bottom_bool = np.r_[True, X_sax[0, 1:] > X_sax[0, :-1]]

print(bins)

# Count unique values
unique_values, counts = np.unique(X_sax, return_counts=True)

# Displaying unique values and their counts
for value, count in zip(unique_values, counts):
    print(f"{value}: {count}")


[-1.75068607 -1.40507156 -1.17498679 -0.99445788 -0.84162123 -0.70630256
 -0.58284151 -0.4676988  -0.35845879 -0.2533471  -0.15096922 -0.05015358
  0.05015358  0.15096922  0.2533471   0.35845879  0.4676988   0.58284151
  0.70630256  0.84162123  0.99445788  1.17498679  1.40507156  1.75068607]
y: 10382411


SAX

In [2]:
import numpy as np
from saxpy.znorm import znorm
from saxpy.sax import ts_to_string
from saxpy.alphabet import cuts_for_asize

dat_znorm = znorm(flat_data)
#dat_paa_3 = paa(dat_znorm, len(dat_znorm))
print(dat_znorm)

sax_flat_data = ts_to_string(dat_znorm, cuts_for_asize(20))
char_to_int = lambda c: ord(c) - ord('a')
sax_flat_data = [char_to_int(char) for char in sax_flat_data]
sax_flat_data = np.array(sax_flat_data)

original_shapes = [array.shape for array in data]
reshaped_data = []
start = 0
for shape in original_shapes:
    end = start + shape[0]
    reshaped_data.append(sax_flat_data[start:end].tolist())
    start = end

print(len(reshaped_data))
print(len(data))

[1.17443501 1.28305097 1.35132386 ... 0.32102389 0.31171395 0.29309408]
48000
48000


In [3]:
from dBG.FeatureGraph import FeatureGraph
from dBG.utils.Substitute.LogOdds import LogOdds

sub_matrix = LogOdds(reshaped_data)

dbg = FeatureGraph(4, reshaped_data, approximate=True, substitute=sub_matrix, similarity_threshold=0.003)
print(dbg)
features = dbg.generate_features(0.01)



[[ 2.99682385e+00  1.16848379e+00  1.09520267e-01  1.63611648e-02
   1.68007753e-02 -3.05617866e-02 -3.59940505e-02 -1.34990554e-02
  -6.59260889e-02 -1.08264769e-01  9.77156076e-03 -9.32387260e-02
  -1.07838657e-01 -1.74006425e-01 -2.15598869e-01 -3.15904512e-01
  -4.82235013e-01 -8.80356960e-01 -3.44550832e-01]
 [ 1.16848379e+00  5.24753056e-01  8.90816683e-02  9.91470468e-03
   1.70700978e-03 -2.15162331e-02 -1.09521769e-02  1.48254580e-02
  -1.09501127e-02 -3.30419973e-02  2.53840553e-02 -2.10369950e-02
  -1.83074506e-02 -4.21250866e-02 -4.44119362e-02 -7.66257194e-02
  -1.53006621e-01 -3.55150202e-01 -4.22421241e-01]
 [ 1.09520267e-01  8.90816683e-02  2.50508185e-02  1.88642005e-03
  -1.81128981e-03 -5.98343428e-03 -2.20213247e-03  5.33482347e-03
   4.73507459e-04 -4.44096702e-03  8.85831501e-03 -6.03861548e-04
   1.88559952e-03 -1.59638561e-03 -7.68944430e-04 -5.46548396e-03
  -2.01140097e-02 -5.53210557e-02 -1.15316810e-01]
 [ 1.63611648e-02  9.91470468e-03  1.88642005e-03  2.51

100%|██████████| 57807/57807 [10:12<00:00, 94.35it/s]  


FeatureGraph with 5774 nodes and 57807 edges
Threshold: 102219.43094306503
Turns: 1398.0
1 14,14,13,19,14,14,15,9,12,14,15,9,14,12,15,9,14,14,15,19,14,14,13,9,14,12,9,16,14,12,9,6,12,14,17,17,12,14,17,6,10,10,17,18,14,14,16,18,14,14,17,18,14,6,12,13,12,6,10,15,14,6,6,13,14,15,6,10,12,18,17,12,12,9,6,14,12,17,6,17,17,17,6,6,17,17,6,17,6,18,6,17,14,6,12,18,12,17,12,17,12,18,12,6,14,16,12,17,14,16,10,17,15,15,15,17,15,13,12,9,15,14,14,19,16,12,12,9,17,10,13,17,16,10,13,17,15,13,10,17,10,17,14,16,12,6,12,13,14,9,12,15,12,9,12,16,12,18,12,16,12,17,12,16,12,6,10,6,17,10,12,18,16,10,12,18,15,12,14,9,17,14,12,18,15,10,15,18,17,14,13,18,17,14,15,18,16,14,13,18,16,14,12,18,16,14,15,18,17,10,13,12,9,12,15,10,6,14,13,14,6,12,6,10,17,12,13,10,17,15,13,13,17,15,10,13,17,17,10,13,16,16,10,13,12,17,13,13,15,17,13,13,12,9,13,10,12,6,13,10,12,18,6,14,10,13,6,14,10,6,15,10,12,18,13,10,13,18,12,12,18,6,12,14,16,17,12,14,16,6,15,14,12,9,13,12,14,9,16,12,14,18,13,15,14,17,15,15,10,18,12,15,10,17,15,12,10,17

In [4]:
# 231
sim_thr = 0.003

larger_than_number = sub_matrix.similarity_matrix > sim_thr
len(sub_matrix.similarity_matrix[larger_than_number])

112

Remove similar features

In [8]:
from Levenshtein import distance as lev

print(len(features))


def tuple_to_string(t):
    return ''.join(chr(i) for i in t)


unique_features = []
for feat1 in features:
    is_similar = False
    for feat2 in unique_features:
        sim_thr = max(len(feat1), len(feat2)) * 0.5
        if lev(tuple_to_string(feat1), tuple_to_string(feat2)) < sim_thr:
            is_similar = True
            break
    if not is_similar:
        unique_features.append(feat1)

print(len(unique_features))


3308
962


Allignment Alg

In [14]:

import swalign
from tqdm.contrib.concurrent import process_map  # Import process_map from tqdm.contrib.concurrent

match = 2
mismatch = -1
scoring = swalign.NucleotideScoringMatrix(match, mismatch)
sw = swalign.LocalAlignment(scoring)  # you can also choose gap penalties, etc...


def tuple_to_string(t):
    return ''.join(chr(i) for i in t)


def process_row(row):
    row_str = tuple_to_string(row)
    csv_row = [row]  # Start with the original int tuple
    for feature in unique_features:
        feature_str = tuple_to_string(feature)
        score = sw.align(row_str, feature_str).score
        csv_row.append(score)
    return csv_row


# Use process_map for parallel processing with a progress bar
results = list(process_map(process_row, reshaped_data, max_workers=None, chunksize=1, total=len(reshaped_data)))



  0%|          | 0/48000 [00:00<?, ?it/s]

Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/concurrent/futures/process.py", line 394, in _queue_management_worker
    work_item.future.set_exception(bpe)
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 547, in set_exception
    raise InvalidStateError('{}: {!r}'.format(self._state, self))
concurrent.futures._base.InvalidStateError: CANCELLED: <Future at 0x7f0267d1d910 state=cancelled>

KeyboardInterrupt



In [15]:
import pickle

with open('dataset/Disc/reshaped_data.pkl', 'wb') as file:
    # Use pickle to serialize and save the data
    pickle.dump(reshaped_data, file)

with open('dataset/Disc/features.pkl', 'wb') as file:
    # Use pickle to serialize and save the data
    pickle.dump(features, file)


In [2]:
import pandas as pd

lev_features = pd.read_csv('dataset/Disc/all_features_weighted_lev_dist.csv')
sampled_df = lev_features.sample(frac=0.1)

print(lev_features.shape)
print(sampled_df.shape)

sampled_df = sampled_df.drop('Data', axis=1)
correlation_matrix = sampled_df.corr()

(48000, 3309)
(4800, 3309)


In [5]:
k = 50

selected_features = []
remaining_features = correlation_matrix.columns.tolist()

# Iteratively select features
for _ in range(k):
    min_corr = float('inf')
    best_feature = None

    for feature in remaining_features:
        # Calculate the average correlation of the feature with the already selected features
        if selected_features:
            avg_corr = correlation_matrix.loc[selected_features, feature].abs().mean()
        else:
            # For the first feature, use the sum of correlations with all other features
            avg_corr = correlation_matrix[feature].abs().sum() - 1  # subtract self-correlation

        if avg_corr < min_corr:
            min_corr = avg_corr
            best_feature = feature

    # Add the best feature to the selected list and remove it from the remaining list
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

for feat in selected_features:
    print(feat)
print(len(selected_features))
with open('dataset/Disc/unique_features.txt', 'w') as file:
    for feature in selected_features:
        file.write(feature + '\n')

(14, 14, 13, 19, 14, 14, 15, 9, 12, 14, 15, 9, 14, 12, 15, 9, 14, 14, 15, 19, 14, 14, 13, 9, 14, 12, 9, 16, 14, 12, 9, 6, 12, 14, 17, 17, 12, 14, 17, 6, 10, 10, 17, 18, 14, 14, 16, 18, 14, 14, 17, 18, 14, 6, 12, 13, 12, 6, 10, 15, 14, 6, 6, 13, 14, 15, 6, 10, 12, 18, 17, 12, 12, 9, 6, 14, 12, 17, 6, 17, 17, 17, 6, 6, 17, 17, 6, 17, 6, 18, 6, 17, 14, 6, 12, 18, 12, 17, 12, 17, 12, 18, 12, 6, 14, 16, 12, 17, 14, 16, 10, 17, 15, 15, 15, 17, 15, 13, 12, 9, 15, 14, 14, 19, 16, 12, 12, 9, 17, 10, 13, 17, 16, 10, 13, 17, 15, 13, 10, 17, 10, 17, 14, 16, 12, 6, 12, 13, 14, 9, 12, 15, 12, 9, 12, 16, 12, 18, 12, 16, 12, 17, 12, 16, 12, 6, 10, 6, 17, 10, 12, 18, 16, 10, 12, 18, 15, 12, 14, 9, 17, 14, 12, 18, 15, 10, 15, 18, 17, 14, 13, 18, 17, 14, 15, 18, 16, 14, 13, 18, 16, 14, 12, 18, 16, 14, 15, 18, 17, 10, 13, 12, 9, 12, 15, 10, 6, 14, 13, 14, 6, 12, 6, 10, 17, 12, 13, 10, 17, 15, 13, 13, 17, 15, 10, 13, 17, 17, 10, 13, 16, 16, 10, 13, 12, 17, 13, 13, 15, 17, 13, 13, 12, 9, 13, 10, 12, 6, 13, 

In [2]:
import joblib
from collections import Counter
from data_provider.m4 import M4Dataset
import numpy as np
import warnings
import networkx as nx
from dBG.utils.Substitute.LogOdds import LogOdds
from dBG.FeatureGraph import FeatureGraph
from tqdm import tqdm

warnings.filterwarnings('ignore')

similarity_threshold = {
    'Yearly': 0.08,
    'Quarterly': 0.025,
    'Monthly': 0.003,
    'Weekly': 0,
    'Daily': -0.2,
    'Hourly': -0.4
}

seasonal_patterns = ['Monthly', 'Quarterly', 'Yearly', 'Weekly', 'Daily', 'Hourly']


def bin_descretize(sequence, discretizer):
    flat_data = np.concatenate(sequence)
    discrete_time_sequence = discretizer.transform(flat_data.reshape(-1, 1))

    discrete_time_sequence = discrete_time_sequence.flatten().astype(int)
    # Count the frequency of each value in the discretized data
    distribution = Counter(discrete_time_sequence)

    # Convert the distribution to a more readable format
    print(dict(distribution))
    original_shapes = [array.shape for array in sequence]
    reshaped_data = []
    start = 0
    for shape in original_shapes:
        end = start + shape[0]
        reshaped_data.append(discrete_time_sequence[start:end].tolist())
        start = end
    return reshaped_data


for pattern in seasonal_patterns:
    print(f'Reading {pattern} data...')
    m4 = M4Dataset.load(training=True, dataset_file='dataset/m4')
    training_values = np.array([v[~np.isnan(v)] for v in m4.values[m4.groups == pattern]])
    data = [ts for ts in training_values]
    discretizer = joblib.load(f'dataset/Discretizer/20Disc/{pattern}_discretizer_model.joblib')
    data = bin_descretize(data, discretizer)
    print(len(data))
    print('Dataset size: ', len(data))
    sub_matrix = LogOdds(data)
    dbg = FeatureGraph(k=3, sequences=data, approximate=True, substitute=sub_matrix,
                       similarity_threshold=similarity_threshold[pattern])
    G = dbg.graph
    print(G)
    node_label_mapping = {node: i for i, node in enumerate(G.nodes)}
    G = nx.relabel_nodes(G, node_label_mapping)
    print('Saving graph...')
    with open(f'dataset/Graphs/{pattern}_edges.txt', 'w') as f:
        for u, v, data in tqdm(G.edges(data=True)):
            for i in range(int(G[u][v].get('weight', 1))):
                f.write(f"{u} {v}\n")

    # Save the node label mapping to a separate file
    mapping_file_path = f'dataset/Graphs/{pattern}_nodes.joblib'
    joblib.dump(node_label_mapping, mapping_file_path)
    print(f'Node label mapping saved to {mapping_file_path}')


Reading Monthly data...
{17: 520174, 16: 519108, 15: 518079, 14: 521394, 13: 516848, 12: 522100, 18: 519121, 11: 525181, 10: 510618, 9: 521020, 8: 516684, 19: 519121, 7: 522145, 6: 516271, 5: 518945, 3: 519159, 4: 519120, 0: 519116, 1: 518110, 2: 520097}
48000
Dataset size:  48000
[[ 5.30371054e-01  2.43530455e-01  4.57027760e-02 -2.40980339e-03
  -3.95334943e-03  6.69156042e-03  2.47943080e-02 -2.64933962e-03
  -6.66393315e-03 -2.05549240e-02  3.87498303e-03 -3.89956774e-03
  -3.65951758e-02  1.79223632e-02 -2.40418337e-02 -4.74105874e-02
  -8.51720611e-02 -2.23952973e-01 -4.83459589e-01 -3.60057484e-01]
 [ 2.43530455e-01  1.18495776e-01  2.96231629e-02  2.73154327e-03
  -1.26652341e-03  3.37084830e-04  6.82837829e-03 -3.96938623e-03
  -5.49180515e-03 -1.28273533e-02  4.06998977e-04 -2.51057367e-03
  -1.71737698e-02  9.54161044e-03 -9.15971245e-03 -1.99310695e-02
  -3.81457679e-02 -1.02547702e-01 -1.93563331e-01 -1.63268086e-01]
 [ 4.57027760e-02  2.96231629e-02  3.47344869e-02  1.652

100%|██████████| 7459/7459 [00:18<00:00, 405.63it/s]


DiGraph with 400 nodes and 7459 edges
Saving graph...


100%|██████████| 7459/7459 [00:02<00:00, 3280.28it/s] 


Node label mapping saved to dataset/Graphs/Monthly_nodes.joblib
Reading Quarterly data...
{16: 110701, 15: 111052, 17: 110420, 6: 110873, 7: 110798, 9: 110650, 5: 110445, 8: 110704, 3: 110744, 4: 110773, 2: 110599, 1: 110706, 18: 110996, 10: 110734, 11: 110734, 12: 110705, 13: 110381, 14: 110682, 19: 110706, 0: 110705}
24000
Dataset size:  24000
[[ 1.00508860e+00  6.87472638e-01  4.20650675e-01  1.04834500e-01
   6.81534593e-03  7.63060924e-03 -3.28117729e-02 -4.68399687e-02
  -5.23596024e-02 -4.59476108e-02 -4.42614276e-02 -5.35205032e-03
  -5.66504683e-02 -1.29755540e-01 -1.97135244e-01 -2.67673652e-01
  -3.73441061e-01 -6.00308886e-01 -1.03913791e+00 -1.49603456e+00]
 [ 6.87472638e-01  4.87690154e-01  3.23857380e-01  1.11797464e-01
   1.65838803e-02  7.19700329e-03 -1.95726017e-02 -2.75389251e-02
  -3.08845153e-02 -2.78145233e-02 -2.41791094e-02 -4.58407754e-04
  -2.60401270e-02 -7.60388407e-02 -1.27362926e-01 -1.79582960e-01
  -2.46577872e-01 -3.98202419e-01 -7.07793906e-01 -1.1050

100%|██████████| 5823/5823 [00:03<00:00, 1828.55it/s]


DiGraph with 400 nodes and 5823 edges
Saving graph...


100%|██████████| 5823/5823 [00:00<00:00, 11996.72it/s]


Node label mapping saved to dataset/Graphs/Quarterly_nodes.joblib
Reading Yearly data...
{14: 35915, 15: 36056, 16: 36096, 17: 36021, 8: 36057, 9: 35992, 6: 36020, 5: 36022, 4: 36027, 3: 36122, 7: 35991, 2: 35918, 10: 36038, 11: 36042, 12: 36023, 13: 36022, 18: 36027, 19: 36023, 0: 36023, 1: 36023}
23000
Dataset size:  23000
[[ 2.22199127e+00  1.22024585e+00  5.34022094e-01  1.56130076e-01
  -5.85513818e-02 -2.04999406e-01 -2.62208558e-01 -3.22230030e-01
  -4.32431248e-01 -3.87862093e-01 -4.66817511e-01 -4.15838029e-01
  -4.87977952e-01 -5.05225471e-01 -5.87026367e-01 -6.55805945e-01
  -7.67810516e-01 -8.24218287e-01 -1.22056287e+00 -1.90752156e+00]
 [ 1.22024585e+00  1.04591266e+00  5.30080620e-01  1.83549399e-01
   1.71192284e-02 -1.01425440e-01 -1.89375710e-01 -2.11844786e-01
  -2.64692147e-01 -2.41394423e-01 -2.63400785e-01 -2.33378400e-01
  -2.76923291e-01 -2.91202340e-01 -3.40124648e-01 -3.85567895e-01
  -4.33544519e-01 -4.42756641e-01 -6.95188831e-01 -1.51726769e+00]
 [ 5.340220

100%|██████████| 3799/3799 [00:08<00:00, 438.32it/s]


DiGraph with 390 nodes and 3799 edges
Saving graph...


100%|██████████| 3799/3799 [00:00<00:00, 24387.43it/s]


Node label mapping saved to dataset/Graphs/Yearly_nodes.joblib
Reading Weekly data...
{3: 18346, 2: 18345, 4: 18338, 5: 18353, 6: 18342, 7: 18347, 8: 18347, 9: 18346, 10: 18346, 11: 18342, 12: 18346, 13: 18348, 14: 18339, 15: 18351, 16: 18347, 17: 18345, 18: 18346, 19: 18346, 0: 18345, 1: 18347}
359
Dataset size:  359
[[ 9.39275139e-01  4.86212464e-01  5.02266674e-01  3.06712962e-01
   2.83057141e-01  1.75716118e-02  1.11690834e-01  5.37483722e-02
  -2.12439981e-01 -1.68848615e-01 -2.68011922e-01 -1.38990185e-01
  -1.64887061e-01 -2.13215975e-01 -2.16176060e-01 -2.82824286e-01
  -4.85613753e-01 -5.53201119e-01 -6.51789392e-01 -1.44231466e+00]
 [ 4.86212464e-01  2.52196879e-01  2.59967989e-01  1.78625161e-01
   1.56319599e-01  1.36474717e-01  1.54693609e-01  9.15745050e-02
   4.33254115e-04 -2.72156170e-02 -7.99748922e-02 -1.46480887e-01
  -1.05944972e-01 -1.77503514e-01 -1.95856430e-01 -2.23768953e-01
  -2.76987803e-01 -3.82079908e-01 -3.96913634e-01 -7.13607966e-01]
 [ 5.02266674e-01 

100%|██████████| 2263/2263 [00:11<00:00, 190.73it/s]


DiGraph with 323 nodes and 2263 edges
Saving graph...


100%|██████████| 2263/2263 [00:00<00:00, 26945.64it/s]


Node label mapping saved to dataset/Graphs/Weekly_nodes.joblib
Reading Daily data...
{2: 498244, 1: 498206, 3: 498225, 4: 498250, 5: 498238, 6: 498234, 7: 498204, 8: 498231, 0: 498233, 9: 498247, 10: 498205, 11: 498262, 12: 498238, 13: 498242, 19: 498238, 18: 498246, 17: 498267, 16: 498190, 15: 498278, 14: 498180}
4227
Dataset size:  4227
[[ 5.19400361e-01  1.84223113e-02  6.62834623e-03  9.42135370e-02
   9.23378753e-02  2.19849343e-01  4.26699448e-01  5.64831104e-01
   5.01549797e-01  3.36252067e-01  1.29140333e-01  6.65411316e-02
  -1.33778414e-02 -1.89389804e-01 -3.20717512e-01 -5.11440418e-01
  -8.15939367e-01 -8.95935935e-01 -9.52384510e-01 -9.83131873e-01]
 [ 1.84223113e-02  8.06023789e-02  2.91911978e-03 -2.85184006e-02
  -3.18655607e-02 -4.65380255e-02 -5.68084509e-02 -3.39843376e-02
  -1.39848902e-02 -1.04893781e-02  5.05853821e-04  8.36538960e-03
   1.54918870e-02  4.46466181e-02  6.57530271e-02  8.10011284e-02
   3.97368918e-02 -1.28241913e-02 -4.64044568e-02 -7.20587885e-0

100%|██████████| 1003/1003 [00:04<00:00, 234.86it/s]


DiGraph with 312 nodes and 1003 edges
Saving graph...


100%|██████████| 1003/1003 [00:02<00:00, 456.06it/s]


Node label mapping saved to dataset/Graphs/Daily_nodes.joblib
Reading Hourly data...
{15: 17696, 14: 17725, 17: 17688, 16: 17666, 18: 17677, 19: 17675, 13: 17851, 12: 17660, 11: 17594, 10: 18057, 0: 17308, 2: 17080, 9: 17598, 1: 17496, 5: 17296, 4: 17762, 8: 17918, 7: 17837, 3: 18566, 6: 17350}
414
Dataset size:  414
[[ 4.02934535e+00  1.68191615e+00 -1.26609198e-01 -8.15191065e-01
  -6.28406118e-01 -1.81644635e+00 -1.94646832e+00 -2.11493817e+00
  -2.36016609e+00 -1.19099453e+00  3.21525599e-01 -8.07267107e-01
  -1.68325170e+00 -1.29914194e+00 -9.42815080e-01 -6.22612461e-01
  -2.51019282e-01  7.08224228e-02  3.25420804e-01  2.45730111e-01]
 [ 1.68191615e+00  2.87218777e+00  1.52843665e+00  2.54193101e-01
  -1.81943499e-01 -1.61139787e+00 -1.85016939e+00 -1.78066481e+00
  -1.89808392e+00 -8.43437537e-01  4.36863764e-01 -6.90611555e-01
  -1.58895902e+00 -1.28612708e+00 -9.71105814e-01 -5.50792099e-01
  -4.33455099e-01 -1.22012815e-02  1.67375333e-01  2.59293840e-01]
 [-1.26609198e-01  

100%|██████████| 2142/2142 [00:11<00:00, 185.82it/s]


DiGraph with 248 nodes and 2142 edges
Saving graph...


100%|██████████| 2142/2142 [00:00<00:00, 22529.61it/s]

Node label mapping saved to dataset/Graphs/Hourly_nodes.joblib





In [1]:
import subprocess
import time

# Path to the directory you want to clean
directory_path = '/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/struc2vec-master/pickles'

seasonal_patterns = ['Daily', 'Hourly', 'Yearly', 'Quarterly', 'Monthly', 'Weekly']

for pattern in seasonal_patterns:
    print(f'Reading {pattern} data...')

    # Define the command and parameters
    command = 'venv/bin/python'
    script_path = 'struc2vec-master/src/main.py'
    params = [
        '--input',
        f'/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/Graphs/{pattern}_edges.txt',
        '--output',
        f'/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/graph_emb/{pattern}.emb',
        '--weighted',
        '--directed',
        '--workers', '24',
        '--dimensions', '32',
        '--OPT1', 'true',
        '--OPT1', 'true',
        '--OPT3', 'true'
    ]

    # Combine command and parameters
    full_command = [command, script_path] + params

    start = time.time()
    # Run the command
    subprocess.run(full_command)
    end = time.time()

    print(f'Time Cost {(end - start):.2f}s')



Reading Daily data...
Namespace(OPT1=True, OPT2=False, OPT3=True, dimensions=32, directed=True, input='/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/Graphs/Daily_edges.txt', iter=5, num_walks=10, output='/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/graph_emb/Daily.emb', undirected=True, until_layer=None, unweighted=True, walk_length=80, weighted=True, window_size=10, workers=24)
True True
Time Cost 16.65s
Reading Hourly data...
Namespace(OPT1=True, OPT2=False, OPT3=True, dimensions=32, directed=True, input='/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/Graphs/Hourly_edges.txt', iter=5, num_walks=10, output='/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/graph_emb/Hourly.emb', undirected=True, until_layer=None, unweighted=True, walk_length=80, weighted=True, window_size=10, workers=24)
True True
Time Cost 6.95s
Reading Yearly data...
Namespace(OPT1=True, OPT2=False, OPT3=True,

In [3]:
 with open('/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/graph_emb/Yearly.emb',
           'r') as file:
    first_line = file.readline()
    specified_dimensions = list(map(int, first_line.split()))

print(f)


<_io.TextIOWrapper name='dataset/Graphs/Hourly.txt' mode='w' encoding='UTF-8'>


In [5]:
from sklearn.preprocessing import KBinsDiscretizer
import joblib
from collections import Counter
from data_provider.m4 import M4Dataset
import numpy as np
import warnings
import networkx as nx
from dBG.utils.Substitute.LogOdds import LogOdds
from dBG.FeatureGraph import FeatureGraph
from tqdm import tqdm
import pandas as pd
 
warnings.filterwarnings('ignore')
 
similarity_threshold = {
    'Yearly': 0.08,
    'Quarterly': 0.025,
    'Monthly': 0.003,
    'Weekly': 0,
    'Daily': -0.2,
    'Hourly': -0.4
}
 
seasonal_patterns = ['Monthly', 'Quarterly', 'Yearly', 'Weekly', 'Daily', 'Hourly']

sample = {
    'Monthly': 2, 
    'Quarterly': 65, 
    'Yearly': 72, 
    'Weekly': 3, 
    'Daily': 10, 
    'Hourly': 124
}
 
def bin_descretize(sequence, discretizer):
    flat_data = np.concatenate(sequence)
    discrete_time_sequence = discretizer.fit_transform(flat_data.reshape(-1, 1))
 
    discrete_time_sequence = discrete_time_sequence.flatten().astype(int)
    # Count the frequency of each value in the discretized data
    distribution = Counter(discrete_time_sequence)
 
    # Convert the distribution to a more readable format
    print(dict(distribution))
    original_shapes = [array.shape for array in sequence]
    reshaped_data = []
    start = 0
    for shape in original_shapes:
        end = start + shape[0]
        reshaped_data.append(discrete_time_sequence[start:end].tolist())
        start = end
    return reshaped_data
 
 # 578
for pattern in seasonal_patterns:
    print(f'Reading {pattern} data...')
    m4 = M4Dataset.load(training=True, dataset_file='dataset/m4')
    training_values = [v[~np.isnan(v)] for v in m4.values[m4.groups == pattern]]
    data = [ts for ts in training_values]
    discretizer = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='quantile')
    data = bin_descretize(data, discretizer)
    # joblib.dump(discretizer, f'dataset/Discretizer/20Disc/{pattern}_discretizer_model.joblib')
    # data = data[:sample[pattern]]
    print(len(data))
    print('Dataset size: ', len(data))
    sub_matrix = LogOdds(data)
    dbg = FeatureGraph(k=5, sequences=data, approximate=True, substitute=sub_matrix,
                       similarity_threshold=similarity_threshold[pattern])
    G = dbg.graph
    print(G)

    exit(-1)
 
    # Export Node List
    node_data = pd.DataFrame([[t] for t in G.nodes()], columns=['Node'])
    node_data['Max'] = node_data['Node'].apply(lambda x: max(x))
    node_data['Node'] = node_data['Node'].apply(lambda x: ' '.join(map(str, x)))
    node_data.to_csv(f'dataset/Cythospace/{pattern}_nodelist.csv', index=False)
 
    # Export Edge List
    # Get edge list with all attributes
    edge_data = G.edges(data=True)
    
    # Convert edge list to pandas DataFrame dynamically
    # This will create a list of dictionaries where each dictionary corresponds to an edge and its attributes
    edges_list = [{'Source': u, 'Target': v, **data} for u, v, data in edge_data]
    
    # Now convert this list of dictionaries to a DataFrame
    edge_df = pd.DataFrame(edges_list)
    edge_df = edge_df.drop('substitute_edges', axis=1)
    edge_df['Max'] = edge_df['tuple'].apply(lambda x: max(x))
    edge_df['Source'] = edge_df['Source'].apply(lambda x: ' '.join(map(str, x)))
    edge_df['Target'] = edge_df['Target'].apply(lambda x: ' '.join(map(str, x)))
    edge_df['tuple'] = edge_df['tuple'].apply(lambda x: ' '.join(map(str, x)))

    # Save the DataFrame to CSV
    edge_df.to_csv(f'dat|aset/Cythospace/{pattern}_edgelist.csv', index=False)



Reading Monthly data...
{17: 520174, 16: 519108, 15: 518079, 14: 521394, 13: 516848, 12: 522100, 18: 519121, 11: 525181, 10: 510618, 9: 521020, 8: 516684, 19: 519121, 7: 522145, 6: 516271, 5: 518945, 3: 519159, 4: 519120, 0: 519116, 1: 518110, 2: 520097}
48000
Dataset size:  48000
[[ 5.30371054e-01  2.43530455e-01  4.57027760e-02 -2.40980339e-03
  -3.95334943e-03  6.69156042e-03  2.47943080e-02 -2.64933962e-03
  -6.66393315e-03 -2.05549240e-02  3.87498303e-03 -3.89956774e-03
  -3.65951758e-02  1.79223632e-02 -2.40418337e-02 -4.74105874e-02
  -8.51720611e-02 -2.23952973e-01 -4.83459589e-01 -3.60057484e-01]
 [ 2.43530455e-01  1.18495776e-01  2.96231629e-02  2.73154327e-03
  -1.26652341e-03  3.37084830e-04  6.82837829e-03 -3.96938623e-03
  -5.49180515e-03 -1.28273533e-02  4.06998977e-04 -2.51057367e-03
  -1.71737698e-02  9.54161044e-03 -9.15971245e-03 -1.99310695e-02
  -3.81457679e-02 -1.02547702e-01 -1.93563331e-01 -1.63268086e-01]
 [ 4.57027760e-02  2.96231629e-02  3.47344869e-02  1.652

 37%|███▋      | 83236/222889 [29:33<49:35, 46.94it/s]  


KeyboardInterrupt: 