In [None]:
import numpy as np
import pickle
import sys

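# Compatibility shim for pickles whose NumPy module paths do not match the installed NumPy.
# NumPy 2.0 renamed the internal package numpy.core to numpy._core, so pickle files
# written under NumPy 2.x reference numpy._core.* modules.
# The custom unpickler below maps those references to numpy.core.* so that such files
# can still be loaded in an environment where numpy._core is not available.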

class NumpyCompatibilityUnpickler(pickle.Unpickler):
    """Unpickler that tolerates the ``numpy.core`` / ``numpy._core`` rename.

    Each global referenced by the pickle stream is first resolved exactly as
    written. Only if that lookup fails (the module cannot be imported or the
    attribute is missing) is the module path retried with the NumPy core
    package prefix swapped (``numpy._core`` <-> ``numpy.core``).

    Resolving the original path first means a working lookup is never
    rewritten, and the prefix is only matched on a package boundary, so an
    unrelated module such as ``numpy._corefoo`` is left untouched.

    NOTE(security): like ``pickle.load``, this can execute arbitrary code
    while loading. Only use it on files from a trusted source.
    """

    # (prefix found in the pickle, prefix to retry with) pairs.
    _MODULE_ALIASES = (
        ("numpy._core", "numpy.core"),
        ("numpy.core", "numpy._core"),
    )

    @classmethod
    def _alias_for(cls, module):
        """Return the alternate module path for ``module``, or ``None`` if it has none."""
        for old_prefix, new_prefix in cls._MODULE_ALIASES:
            if module == old_prefix or module.startswith(old_prefix + "."):
                return new_prefix + module[len(old_prefix):]
        return None

    def find_class(self, module, name):
        """Resolve ``module.name``, retrying under the alternate NumPy core package.

        Args:
            module: Dotted module path recorded in the pickle stream.
            name: Attribute name to fetch from that module.

        Returns:
            The resolved object.

        Raises:
            ImportError, AttributeError: If the lookup fails and ``module`` has
                no alternate path (the original error is re-raised), or if the
                retry under the alternate path fails as well.
        """
        try:
            return super().find_class(module, name)
        except (ImportError, AttributeError):
            alias = self._alias_for(module)
            if alias is None:
                # Not a NumPy core module: keep the original error untouched.
                raise
            return super().find_class(alias, name)

from sequenzo.define_sequence_data import SequenceData
from sequenzo.multidomain.cat import compute_cat_distance_matrix
import pandas as pd 

# Read the dataset through the compatibility unpickler so NumPy module-path
# differences between the writing and the reading environment do not break loading.
with open("dataset.pkl", "rb") as dataset_file:
    data = NumpyCompatibilityUnpickler(dataset_file).load()

# The multivariate sequences are stored under the "X" key.
sequences_mv = data["X"]

In [5]:
# Show the loaded array; NumPy abbreviates the repr of large arrays with "...".
# NOTE(review): displaying sequences_mv.shape may be more informative than the full repr.
sequences_mv

array([[[0, 1, 1, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 1, 1],
        [0, 1, 1, ..., 0, 0, 1],
        ...,
        [1, 0, 1, ..., 1, 0, 0],
        [0, 1, 1, ..., 1, 1, 0],
        [1, 0, 1, ..., 1, 0, 0]],

       [[0, 1, 0, ..., 1, 0, 1],
        [0, 1, 1, ..., 1, 0, 1],
        [0, 1, 1, ..., 0, 0, 1],
        ...,
        [0, 0, 1, ..., 1, 0, 0],
        [1, 0, 0, ..., 1, 0, 0],
        [1, 0, 1, ..., 0, 0, 1]],

       [[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 0, 1],
        [0, 1, 1, ..., 0, 1, 0],
        ...,
        [1, 1, 0, ..., 0, 0, 1],
        [1, 1, 1, ..., 1, 0, 1],
        [0, 1, 0, ..., 1, 1, 0]],

       ...,

       [[0, 1, 1, ..., 0, 1, 0],
        [0, 1, 1, ..., 1, 0, 0],
        [0, 1, 0, ..., 1, 0, 1],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 1],
        [0, 0, 1, ..., 0, 1, 1]],

       [[0, 0, 0, ..., 1, 1, 1],
        [0, 1, 0, ..., 1, 1, 0],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 0, ..., 

In [2]:
# Comparison setup: how many sequences to keep and how many channels to use.
N = 10
V = 50
C_list = [2] * V
T = sequences_mv[0].shape[0]
N_use = min(N, len(sequences_mv))

# Time columns are labelled "1".."T"; sequence IDs are 0..N_use-1.
time_cols = [str(t) for t in range(1, T + 1)]
ids = list(range(N_use))

# Build one SequenceData object per channel.
channels_seq = []
for channel in range(V):
    # One row per kept sequence, one column per time point; values shifted by +1.
    per_sequence = [sequences_mv[i][:, channel] for i in range(N_use)]
    states_matrix = np.stack(per_sequence, axis=0).astype(int) + 1

    channel_df = pd.DataFrame(states_matrix, columns=time_cols)
    channel_df.insert(0, "ID", ids)
    channel_df[time_cols] = channel_df[time_cols].astype(str)  # "1"/"2"

    channels_seq.append(
        SequenceData(
            data=channel_df,
            time=time_cols,
            states=["1", "2"],
            id_col="ID",
        )
    )

print(channels_seq)

# Keyword arguments for the multichannel distance computation; the per-channel
# options are lists with one entry per channel.
cat_options = dict(
    channels=channels_seq,
    method="OM",
    sm=["TRATE"] * V,
    indel="auto",
    link="sum",
    norm="none",
    full_matrix=True,
    what="diss",
    cweight=[1.0] * V,
    with_missing=[False] * V,
    ch_sep="+",
)
result_cat = compute_cat_distance_matrix(**cat_options)

# Normalise the result to a plain NumPy array whatever type was returned.
if hasattr(result_cat, "to_numpy"):
    D_sequenzo = result_cat.to_numpy()
else:
    D_sequenzo = np.asarray(result_cat)


[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequence

In [3]:
print(D_sequenzo)
print("\n")

# Read the reference distance matrix through the compatibility unpickler
# before printing it.
with open("dist_om_traminer.pkl", "rb") as traminer_file:
    D_traminer = NumpyCompatibilityUnpickler(traminer_file).load()

print(D_traminer)

[[  0.         536.76840252 542.12020336 555.64576295 530.41348217
  560.88439283 531.50792389 551.68618196 506.27587748 558.57655325]
 [536.76840252   0.         564.78652633 546.38774323 532.95017967
  561.27664941 552.91594381 543.95558138 512.06368322 560.37486052]
 [542.12020336 564.78652633   0.         536.41300674 533.80730971
  526.39038469 545.71292554 525.61626625 522.78419994 551.3408659 ]
 [555.64576295 546.38774323 536.41300674   0.         534.33063364
  555.76066091 553.47085318 554.32114918 464.07280762 550.55135416]
 [530.41348217 532.95017967 533.80730971 534.33063364   0.
  540.08607202 508.25765755 537.94943189 532.4541857  538.50027668]
 [560.88439283 561.27664941 526.39038469 555.76066091 540.08607202
    0.         533.6277659  559.0869077  537.03650881 537.10361707]
 [531.50792389 552.91594381 545.71292554 553.47085318 508.25765755
  533.6277659    0.         555.13616526 513.56805646 541.98970783]
 [551.68618196 543.95558138 525.61626625 554.32114918 537.94943

In [4]:
# Reload the reference distance matrix via the compatibility unpickler.
with open("dist_om_traminer.pkl", "rb") as traminer_file:
    D_traminer = NumpyCompatibilityUnpickler(traminer_file).load()
    
def upper_triangle_flat(M):
    """Return the strict upper triangle of a 2-D matrix as a 1-D array.

    The diagonal is excluded (``k=1``). Values are returned in row-major
    order of their positions in ``M``.

    Args:
        M: A 2-D array or any array-like that ``np.asarray`` can convert
            (for example a nested list or a DataFrame).

    Returns:
        A 1-D ``numpy.ndarray`` holding the entries above the diagonal.

    Raises:
        ValueError: If ``M`` is not 2-dimensional.
    """
    # Coerce first so that array-likes support tuple-of-index-arrays indexing.
    M = np.asarray(M)
    return M[np.triu_indices_from(M, k=1)]

# Compare the two distance matrices on their strict upper-triangle entries only.
v1 = upper_triangle_flat(D_sequenzo)
v2 = upper_triangle_flat(D_traminer)

# The off-diagonal entry of the 2x2 correlation matrix is the Pearson coefficient.
corr_matrix = np.corrcoef(v1, v2)
corr = corr_matrix[0, 1]

print(f"[corr] Pearson correlation between Sequenzo and Traminer = {corr:.6f}")

[corr] Pearson correlation between Sequenzo and Traminer = 0.392804
