Import data

In [None]:
import numpy as np
import pandas as pd

from data_provider.m4 import M4Dataset

seasonal_patterns = 'Monthly'
m4 = M4Dataset.load(training=True, dataset_file='./dataset/m4')

# Keep only the series of the requested frequency and drop their NaN entries.
# The result is kept as a plain list of 1-D arrays: the trimmed series may have
# different lengths, and wrapping such a ragged list in np.array(...) raises
# ValueError on NumPy >= 1.24.
pattern_mask = m4.groups == seasonal_patterns
training_values = [v[~np.isnan(v)] for v in m4.values[pattern_mask]]
ids = np.array(list(m4.ids[pattern_mask]))
data = list(training_values)

# All observations of all selected series, back to back, in series order.
flat_data = np.concatenate(data)

print('Dataset size: ', len(data))
print('Flat Dataset size: ', len(flat_data))


def create_bin_count_table(array, bin_ranges):
    """Count how many values of ``array`` fall between consecutive cut points.

    An open-ended bin is added below the first and above the last cut point,
    so ``len(bin_ranges) + 1`` bins are reported.  ``np.histogram`` treats each
    bin as half-open ``[left, right)``: a value equal to a cut point is counted
    in the bin that *starts* at that cut point, and the ``Range`` labels are
    written accordingly.

    Args:
        array: Array-like of numeric values to bin.
        bin_ranges: Increasing sequence of cut points (list, tuple or 1-D
            array).  May be empty, in which case a single bin is reported.

    Returns:
        pandas.DataFrame with one row per bin and the columns ``Bin``
        (``"Bin 1"``, ``"Bin 2"``, ...), ``Range`` (human-readable interval)
        and ``Count`` (number of values in the bin).
    """
    # list(...) so that tuples and numpy arrays concatenate instead of
    # broadcasting when the infinite outer edges are attached.
    cuts = list(bin_ranges)
    bins = [-np.inf] + cuts + [np.inf]

    bin_counts, _ = np.histogram(array, bins=bins)

    if cuts:
        range_labels = (
            [f"< {cuts[0]}"]
            + [f"{lo} <= n < {hi}" for lo, hi in zip(cuts[:-1], cuts[1:])]
            + [f">= {cuts[-1]}"]
        )
    else:
        # No cut points: the single bin spans the whole real line.
        range_labels = ["all values"]

    bin_labels = [f"Bin {i + 1}" for i in range(len(bin_counts))]
    histogram_df = pd.DataFrame({
        'Bin': bin_labels,
        'Range': range_labels,
        'Count': bin_counts
    })

    return histogram_df


Manual Discretization

In [None]:

# Equal-width overview: split the full value range of flat_data into 50 bins
# and count the observations in each (no plot is drawn).
bin_counts, bin_edges = np.histogram(flat_data, bins=50)

# One row per bin: [left edge, right edge, count]
histogram_table = np.column_stack((bin_edges[:-1], bin_edges[1:], bin_counts))

# Same table as a labelled DataFrame.
# NOTE(review): this histogram_df is overwritten a few lines below before it is displayed.
histogram_df = pd.DataFrame(histogram_table, columns=['Bin Start', 'Bin End', 'Count'])

# Hand-picked cut points; the spacing grows with the magnitude of the values.
ranges = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000,
          2500, 3000, 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, 16000, 18000, 20000, 30000]

# Counts per hand-picked bin (plus one open-ended bin on each side); this is the table the cell displays.
histogram_df = create_bin_count_table(flat_data, ranges)
histogram_df


# discretizer = Discretization(ranges)
# print('Descretizing data')
# discretized_data = [[discretizer.discretize(value) for value in element] for element in data]


In [None]:
import numpy as np
from scipy.stats import norm
from pyts.approximation import SymbolicAggregateApproximation

# SAX transformation: the concatenated data is passed as a single sample
# (shape (1, n_values)) and discretized into n_bins symbols with pyts' 'normal' strategy.
n_bins = 25
sax = SymbolicAggregateApproximation(n_bins=n_bins, strategy='normal')
X_sax = sax.fit_transform(np.array(flat_data).reshape(1, -1))

# Standard-normal quantiles at equal-probability spacing: the n_bins - 1 interior cut points.
bins = norm.ppf(np.linspace(0, 1, n_bins + 1)[1:-1])

# True at position 0 and wherever a symbol compares greater than its predecessor.
# NOTE(review): bottom_bool is not used anywhere else in the visible cells.
bottom_bool = np.r_[True, X_sax[0, 1:] > X_sax[0, :-1]]

print(bins)

# Distinct symbols in the SAX output and how often each one occurs
unique_values, counts = np.unique(X_sax, return_counts=True)

# One "symbol: count" line per distinct symbol
for value, count in zip(unique_values, counts):
    print(f"{value}: {count}")


SAX

In [None]:
import numpy as np
from saxpy.znorm import znorm
from saxpy.sax import ts_to_string
from saxpy.alphabet import cuts_for_asize

# z-normalize the concatenated data before symbolizing it.
dat_znorm = znorm(flat_data)
print(dat_znorm)

# Symbolize with a 20-letter alphabet, then map each letter to its 0-based
# index ('a' -> 0, 'b' -> 1, ...).
char_to_int = lambda c: ord(c) - ord('a')
sax_flat_data = np.array(list(map(char_to_int, ts_to_string(dat_znorm, cuts_for_asize(20)))))

# Cut the flat symbol array back into one list per original series, using the
# series lengths as consecutive slice boundaries.
original_shapes = [array.shape for array in data]
offsets = np.cumsum([0] + [shape[0] for shape in original_shapes])
reshaped_data = [sax_flat_data[lo:hi].tolist() for lo, hi in zip(offsets[:-1], offsets[1:])]

print(len(reshaped_data))
print(len(data))

In [None]:
from dBG.FeatureGraph import FeatureGraph
from dBG.utils.Substitute.LogOdds import LogOdds

# Substitution model built from the discretized sequences (project-local LogOdds class).
sub_matrix = LogOdds(reshaped_data)

# Feature graph over the discretized sequences.
# NOTE(review): the positional arguments are presumably (k, sequences), matching the
# keyword form FeatureGraph(k=..., sequences=...) used in later cells — confirm.
dbg = FeatureGraph(4, reshaped_data, approximate=True, substitute=sub_matrix, similarity_threshold=0.003)
print(dbg)
# NOTE(review): the meaning of the 0.01 argument is not visible here — see FeatureGraph.generate_features.
features = dbg.generate_features(0.01)



In [None]:
# 231  (NOTE(review): presumably the count this cell produced on an earlier run — confirm)
sim_thr = 0.003

# Boolean mask of similarity-matrix entries strictly above the threshold ...
larger_than_number = sub_matrix.similarity_matrix > sim_thr
# ... and the number of entries the mask selects (displayed as the cell output).
len(sub_matrix.similarity_matrix[larger_than_number])

Remove similar features

In [None]:
from Levenshtein import distance as lev

# Number of candidate features before near-duplicates are filtered out.
print(len(features))


def tuple_to_string(t):
    """Encode a sequence of integer code points as a string, one character per element."""
    return ''.join(map(chr, t))


# Greedy de-duplication: walk the features in order and keep one only if its
# Levenshtein distance to every feature kept so far is at least half the length
# of the longer of the two.
#
# The string form of each kept feature is cached so it is encoded once rather
# than once per comparison, and the per-pair distance limit is computed inline
# so the module-level `sim_thr` (the similarity threshold from the previous
# cell) is no longer overwritten.
unique_features = []
unique_feature_strings = []
for feat1 in features:
    feat1_str = tuple_to_string(feat1)
    is_similar = any(
        lev(feat1_str, kept_str) < max(len(feat1), len(kept)) * 0.5
        for kept, kept_str in zip(unique_features, unique_feature_strings)
    )
    if not is_similar:
        unique_features.append(feat1)
        unique_feature_strings.append(feat1_str)

print(len(unique_features))


Alignment Algorithm

In [None]:

import swalign
from tqdm.contrib.concurrent import process_map  # Import process_map from tqdm.contrib.concurrent

# Scores handed to swalign: +2 for a matching position, -1 for a mismatch.
match = 2
mismatch = -1
scoring = swalign.NucleotideScoringMatrix(match, mismatch)
# Local aligner built on that scoring; gap penalties are left at swalign's defaults.
sw = swalign.LocalAlignment(scoring)  # you can also choose gap penalties, etc...


def tuple_to_string(t):
    """Encode a sequence of integer code points as a string, one character per element."""
    return ''.join(map(chr, t))


def process_row(row):
    """Score one discretized sequence against every entry of ``unique_features``.

    Both the sequence and each feature are encoded with ``tuple_to_string``
    and aligned with the module-level aligner ``sw``.

    Returns:
        A list whose first element is ``row`` itself, followed by one
        alignment score per feature, in ``unique_features`` order.
    """
    row_str = tuple_to_string(row)
    scores = [sw.align(row_str, tuple_to_string(feature)).score for feature in unique_features]
    return [row] + scores


# Use process_map for parallel processing with a progress bar.
# Each entry of `results` is the list returned by process_row for one sequence of reshaped_data.
results = list(process_map(process_row, reshaped_data, max_workers=None, chunksize=1, total=len(reshaped_data)))



In [None]:
import pickle

# Serialize the discretized sequences and the extracted features with pickle,
# one file each.
for path, payload in (('dataset/Disc/reshaped_data.pkl', reshaped_data),
                      ('dataset/Disc/features.pkl', features)):
    with open(path, 'wb') as file:
        pickle.dump(payload, file)


In [None]:
import pandas as pd

lev_features = pd.read_csv('dataset/Disc/all_features_weighted_lev_dist.csv')

# Work on a 10% row sample. The seed is fixed (0, as used for the random_state
# arguments elsewhere in this notebook) so the correlation matrix, and the
# features selected from it, are the same on every run.
sampled_df = lev_features.sample(frac=0.1, random_state=0)

print(lev_features.shape)
print(sampled_df.shape)

# 'Data' is dropped so that only the remaining columns enter the correlation.
sampled_df = sampled_df.drop('Data', axis=1)
correlation_matrix = sampled_df.corr()

In [None]:
k = 50

# Only the magnitude of the correlation matters below; take it once instead of
# on every lookup inside the loops.
abs_corr = correlation_matrix.abs()

selected_features = []
remaining_features = correlation_matrix.columns.tolist()

# Greedy selection of up to k features. The first pick is the feature with the
# smallest total absolute correlation to all others; every later pick is the
# feature with the smallest mean absolute correlation to those already chosen.
# The number of rounds is capped at the number of available features:
# otherwise a round with nothing left to choose from would call remove(None)
# and raise ValueError.
for _ in range(min(k, len(remaining_features))):
    min_corr = float('inf')
    best_feature = None

    for feature in remaining_features:
        if selected_features:
            # Mean absolute correlation with the already selected features
            avg_corr = abs_corr.loc[selected_features, feature].mean()
        else:
            # First pick: sum over all features, minus the self-correlation of 1
            avg_corr = abs_corr[feature].sum() - 1

        # Strict '<' keeps the earliest feature on ties.
        if avg_corr < min_corr:
            min_corr = avg_corr
            best_feature = feature

    if best_feature is None:
        # No candidate had a comparable score (e.g. all NaN): stop selecting.
        break

    # Move the winner from the candidate pool to the selection
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

for feat in selected_features:
    print(feat)
print(len(selected_features))
with open('dataset/Disc/unique_features.txt', 'w') as file:
    for feature in selected_features:
        file.write(feature + '\n')

In [None]:
import joblib
from collections import Counter
from data_provider.m4 import M4Dataset
import numpy as np
import warnings
import networkx as nx
from dBG.utils.Substitute.LogOdds import LogOdds
from dBG.FeatureGraph import FeatureGraph
from tqdm import tqdm

# NOTE(review): this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')

# Per-frequency value passed as `similarity_threshold` to FeatureGraph below.
similarity_threshold = {
    'Yearly': 0.08,
    'Quarterly': 0.025,
    'Monthly': 0.003,
    'Weekly': 0,
    'Daily': -0.2,
    'Hourly': -0.4
}

# Frequencies to process, in this order.
seasonal_patterns = ['Monthly', 'Quarterly', 'Yearly', 'Weekly', 'Daily', 'Hourly']


def bin_descretize(sequence, discretizer):
    """Discretize a collection of series with an already fitted discretizer.

    All series are concatenated, passed through ``discretizer.transform`` as a
    single column, cast to ``int`` and cut back into one list per input
    series.  The frequency of every resulting code is printed as a dict.

    Args:
        sequence: List of 1-D numpy arrays (one per series).
        discretizer: Object exposing ``transform`` on an ``(n, 1)`` array.

    Returns:
        List of lists of int codes, one inner list per input series, each as
        long as the corresponding series.
    """
    stacked = np.concatenate(sequence)
    codes = discretizer.transform(stacked.reshape(-1, 1)).flatten().astype(int)

    # How often each code occurs across all series
    print(dict(Counter(codes)))

    # Series lengths turned into consecutive slice boundaries of the flat code array
    lengths = [series.shape[0] for series in sequence]
    bounds = np.cumsum([0] + lengths)
    return [codes[lo:hi].tolist() for lo, hi in zip(bounds[:-1], bounds[1:])]


# The M4 training set is loaded with the same arguments for every frequency,
# so it is read once here instead of once per loop iteration.
m4 = M4Dataset.load(training=True, dataset_file='dataset/m4')

for pattern in seasonal_patterns:
    print(f'Reading {pattern} data...')
    # NaN-stripped series of this frequency, kept as a plain list: the series
    # may differ in length and np.array(...) on a ragged list raises
    # ValueError on NumPy >= 1.24.
    training_values = [v[~np.isnan(v)] for v in m4.values[m4.groups == pattern]]
    # NOTE: joblib.load unpickles arbitrary objects — only load model files you trust.
    discretizer = joblib.load(f'dataset/Discretizer/20Disc/{pattern}_discretizer_model.joblib')
    data = bin_descretize(training_values, discretizer)
    print(len(data))
    print('Dataset size: ', len(data))
    sub_matrix = LogOdds(data)
    dbg = FeatureGraph(k=3, sequences=data, approximate=True, substitute=sub_matrix,
                       similarity_threshold=similarity_threshold[pattern])
    G = dbg.graph
    print(G)
    # Replace the node objects with consecutive integer ids for the edge-list file.
    node_label_mapping = {node: i for i, node in enumerate(G.nodes)}
    G = nx.relabel_nodes(G, node_label_mapping)
    print('Saving graph...')
    with open(f'dataset/Graphs/{pattern}_edges.txt', 'w') as f:
        # The loop variable is named _attrs (not `data`) so the discretized
        # sequences held in `data` are not clobbered by an edge-attribute dict.
        for u, v, _attrs in tqdm(G.edges(data=True)):
            # An edge is written once per unit of its integer weight (default 1).
            f.write(f"{u} {v}\n" * int(G[u][v].get('weight', 1)))

    # Save the node label mapping to a separate file
    mapping_file_path = f'dataset/Graphs/{pattern}_nodes.joblib'
    joblib.dump(node_label_mapping, mapping_file_path)
    print(f'Node label mapping saved to {mapping_file_path}')


In [None]:
import subprocess
import time

# Path to the directory you want to clean
# NOTE(review): nothing in this cell reads or cleans this directory.
directory_path = '/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/struc2vec-master/pickles'

seasonal_patterns = ['Daily', 'Hourly', 'Yearly', 'Quarterly', 'Monthly', 'Weekly']

for pattern in seasonal_patterns:
    print(f'Reading {pattern} data...')

    # Interpreter and script are relative paths, so the cell must be run from the project root.
    command = 'venv/bin/python'
    script_path = 'struc2vec-master/src/main.py'
    params = [
        '--input',
        f'/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/Graphs/{pattern}_edges.txt',
        '--output',
        f'/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/graph_emb/{pattern}.emb',
        '--weighted',
        '--directed',
        '--workers', '24',
        '--dimensions', '32',
        '--OPT1', 'true',
        # Was a second '--OPT1'; struc2vec's three optimisation switches are
        # OPT1/OPT2/OPT3, so the repeated flag left OPT2 disabled.
        # NOTE(review): confirm OPT2 was meant to be on.
        '--OPT2', 'true',
        '--OPT3', 'true'
    ]

    # Argument list form: no shell is involved, so the spaces in the paths need no quoting.
    full_command = [command, script_path] + params

    start = time.time()
    completed = subprocess.run(full_command)
    end = time.time()

    # Report a failed run instead of silently moving on to the next frequency.
    if completed.returncode != 0:
        print(f'struc2vec exited with code {completed.returncode} for {pattern}')

    print(f'Time Cost {(end - start):.2f}s')



In [None]:
 with open('/run/media/lumpus/HDD Storage/PycharmProjects/Time-Series-Library/dataset/graph_emb/Yearly.emb',
           'r') as file:
    first_line = file.readline()
    specified_dimensions = list(map(int, first_line.split()))

print(f)


In [None]:
from sklearn.preprocessing import KBinsDiscretizer
import joblib
from collections import Counter
from data_provider.m4 import M4Dataset
import numpy as np
import warnings
import networkx as nx
from dBG.utils.Substitute.LogOdds import LogOdds
from dBG.FeatureGraph import FeatureGraph
from tqdm import tqdm
import pandas as pd
 
# NOTE(review): this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')
 
# Per-frequency value passed as `similarity_threshold` to FeatureGraph below.
similarity_threshold = {
    'Yearly': 0.08,
    'Quarterly': 0.025,
    'Monthly': 0.003,
    'Weekly': 0,
    'Daily': -0.2,
    'Hourly': -0.4
}
 
# Frequencies to process, in this order.
seasonal_patterns = ['Monthly', 'Quarterly', 'Yearly', 'Weekly', 'Daily', 'Hourly']

# Per-frequency series counts; only referenced by the commented-out
# `data = data[:sample[pattern]]` line in the loop below.
sample = {
    'Monthly': 2, 
    'Quarterly': 65, 
    'Yearly': 72, 
    'Weekly': 3, 
    'Daily': 10, 
    'Hourly': 124
}
 
def bin_descretize(sequence, discretizer):
    """Fit a discretizer on a collection of series and discretize them.

    All series are concatenated, passed through ``discretizer.fit_transform``
    as a single column (so the discretizer is fitted as a side effect), cast
    to ``int`` and cut back into one list per input series.  The frequency of
    every resulting code is printed as a dict.

    Args:
        sequence: List of 1-D numpy arrays (one per series).
        discretizer: Object exposing ``fit_transform`` on an ``(n, 1)`` array.

    Returns:
        List of lists of int codes, one inner list per input series, each as
        long as the corresponding series.
    """
    stacked = np.concatenate(sequence)
    codes = discretizer.fit_transform(stacked.reshape(-1, 1)).flatten().astype(int)

    # How often each code occurs across all series
    print(dict(Counter(codes)))

    # Series lengths turned into consecutive slice boundaries of the flat code array
    lengths = [series.shape[0] for series in sequence]
    bounds = np.cumsum([0] + lengths)
    return [codes[lo:hi].tolist() for lo, hi in zip(bounds[:-1], bounds[1:])]
 
 # 578
# The M4 training set is loaded with the same arguments for every frequency,
# so it is read once here instead of once per loop iteration.
m4 = M4Dataset.load(training=True, dataset_file='dataset/m4')

for pattern in seasonal_patterns:
    print(f'Reading {pattern} data...')
    # NaN-stripped series of this frequency (plain list: lengths may differ).
    training_values = [v[~np.isnan(v)] for v in m4.values[m4.groups == pattern]]
    # A fresh 20-bin quantile discretizer per frequency; it is fitted inside
    # bin_descretize via fit_transform.
    discretizer = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='quantile')
    data = bin_descretize(training_values, discretizer)
    # joblib.dump(discretizer, f'dataset/Discretizer/20Disc/{pattern}_discretizer_model.joblib')
    # data = data[:sample[pattern]]
    print(len(data))
    print('Dataset size: ', len(data))
    sub_matrix = LogOdds(data)
    dbg = FeatureGraph(k=5, sequences=data, approximate=True, substitute=sub_matrix,
                       similarity_threshold=similarity_threshold[pattern])
    G = dbg.graph
    print(G)

    # A leftover `exit(-1)` used to sit here; it stopped the run after the
    # first graph was printed, so none of the export code below was reached.

    # Export Node List: one row per node, with the largest element of the node
    # tuple and the tuple itself rendered as space-separated text.
    node_data = pd.DataFrame([[t] for t in G.nodes()], columns=['Node'])
    node_data['Max'] = node_data['Node'].apply(lambda x: max(x))
    node_data['Node'] = node_data['Node'].apply(lambda x: ' '.join(map(str, x)))
    node_data.to_csv(f'dataset/Cythospace/{pattern}_nodelist.csv', index=False)

    # Export Edge List: one record per edge, carrying source, target and every
    # edge attribute as its own column.
    edges_list = [{'Source': u, 'Target': v, **attrs} for u, v, attrs in G.edges(data=True)]

    edge_df = pd.DataFrame(edges_list)
    edge_df = edge_df.drop('substitute_edges', axis=1)
    edge_df['Max'] = edge_df['tuple'].apply(lambda x: max(x))
    # Render the tuple-valued columns as space-separated text for the CSV.
    edge_df['Source'] = edge_df['Source'].apply(lambda x: ' '.join(map(str, x)))
    edge_df['Target'] = edge_df['Target'].apply(lambda x: ' '.join(map(str, x)))
    edge_df['tuple'] = edge_df['tuple'].apply(lambda x: ' '.join(map(str, x)))

    # Same directory as the node list (the path previously read 'dat|aset/...').
    edge_df.to_csv(f'dataset/Cythospace/{pattern}_edgelist.csv', index=False)



In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


# The 'seaborn-*' style sheets were renamed to 'seaborn-v0_8-*' in Matplotlib 3.6
# and the old names were removed in 3.8: prefer the new name, fall back to the
# old one on older releases.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

# One panel per frequency on a 2 x 3 grid, flattened for simple indexing.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Number of k-means clusters to fit for each frequency.
clusters = {
    "Weekly": 6,
    "Daily": 14,
    "Quarterly": 6,
    "Hourly": 14,
    "Yearly": 14,
    "Monthly": 3
}

seasons = ["Weekly", "Daily", "Quarterly", "Hourly", "Yearly", "Monthly"]

# Seeds NumPy's global RNG; KMeans and TSNE below are additionally seeded
# through their own random_state arguments.
np.random.seed(0)

# Width of the embedding vectors stored in the .emb files.
n_dims = 32

for i, season in enumerate(seasons):
    # Skip the header line and the first column of every row (presumably the
    # node id — confirm); keep the n_dims embedding values.
    embeddings = np.loadtxt(f'dataset/graph_emb/{season}.emb', skiprows=1, usecols=range(1, n_dims + 1))

    # Apply K-means clustering on the full-dimensional embeddings
    n_clusters = clusters[season]
    kmeans = KMeans(n_init='auto', n_clusters=n_clusters, random_state=0).fit(embeddings)

    # Apply t-SNE for dimensionality reduction to 2D for visualization
    tsne = TSNE(n_components=2, random_state=0)
    data_2d = tsne.fit_transform(embeddings)

    # One scatter call per cluster so each cluster gets its own colour
    ax = axes[i]
    for j in range(n_clusters):
        in_cluster = kmeans.labels_ == j
        ax.scatter(data_2d[in_cluster, 0], data_2d[in_cluster, 1], label=f'Cluster {j+1}', alpha=0.7, edgecolor='w', s=50)
    ax.set_title(f'{season}',fontsize=24)
    # t-SNE axes carry no interpretable units, so the ticks are hidden
    ax.set_xticks([])
    ax.set_yticks([])


fig.tight_layout(pad=3.0)
# plt.tight_layout()
#plt.savefig('Images/scatter.pdf')
plt.show()
