In [1]:
import numpy as np
import pickle
import sys

# Fix for NumPy 2.x compatibility with pickle files created with NumPy 1.x
# NumPy 2.0+ renamed numpy.core to numpy._core, but pickle files may reference numpy._core
# We need to create a custom unpickler that redirects numpy._core references to numpy.core

class NumpyCompatibilityUnpickler(pickle.Unpickler):
    """Custom unpickler that fixes numpy._core module references for NumPy 2.x compatibility"""
    def find_class(self, module, name):
        # Redirect numpy._core.* references to numpy.core.*
        if module.startswith('numpy._core'):
            module = module.replace('numpy._core', 'numpy.core')
        
        # Ensure numpy.core modules are accessible
        if module == 'numpy.core.multiarray':
            import numpy.core.multiarray
            return getattr(numpy.core.multiarray, name)
        elif module == 'numpy.core._multiarray_umath':
            import numpy.core._multiarray_umath
            return getattr(numpy.core._multiarray_umath, name)
        
        return super().find_class(module, name)

from sequenzo.define_sequence_data import SequenceData
from sequenzo.multidomain.cat import compute_cat_distance_matrix
import pandas as pd 

# Load pickle file using custom unpickler for NumPy 2.x compatibility
with open("dataset.pkl", "rb") as f:
    unpickler = NumpyCompatibilityUnpickler(f)
    data = unpickler.load()

sequences_mv = data["X"]

In [2]:
sequences_mv

array([[[0, 1, 1, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 1, 1],
        [0, 1, 1, ..., 0, 0, 1],
        ...,
        [1, 0, 1, ..., 1, 0, 0],
        [0, 1, 1, ..., 1, 1, 0],
        [1, 0, 1, ..., 1, 0, 0]],

       [[0, 1, 0, ..., 1, 0, 1],
        [0, 1, 1, ..., 1, 0, 1],
        [0, 1, 1, ..., 0, 0, 1],
        ...,
        [0, 0, 1, ..., 1, 0, 0],
        [1, 0, 0, ..., 1, 0, 0],
        [1, 0, 1, ..., 0, 0, 1]],

       [[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 0, 1],
        [0, 1, 1, ..., 0, 1, 0],
        ...,
        [1, 1, 0, ..., 0, 0, 1],
        [1, 1, 1, ..., 1, 0, 1],
        [0, 1, 0, ..., 1, 1, 0]],

       ...,

       [[0, 1, 1, ..., 0, 1, 0],
        [0, 1, 1, ..., 1, 0, 0],
        [0, 1, 0, ..., 1, 0, 1],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 1],
        [0, 0, 1, ..., 0, 1, 1]],

       [[0, 0, 0, ..., 1, 1, 1],
        [0, 1, 0, ..., 1, 1, 0],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 0, ..., 

In [2]:
N = 10                          
V = 50
C_list = [2] * V  
T = sequences_mv[0].shape[0]
time_cols = [str(t) for t in range(1, T + 1)]
N_use = min(N, len(sequences_mv))

ids = list(range(N_use))

channels_seq = []
for v in range(V):
    mat = np.stack([sequences_mv[i][:, v] for i in range(N_use)], axis=0).astype(int) + 1
    df_v = pd.DataFrame(mat, columns=time_cols)
    df_v.insert(0, "ID", ids)
    df_v[time_cols] = df_v[time_cols].astype(str)  # "1"/"2"
    seq_v = SequenceData(
        data=df_v,
        time=time_cols,
        states=["1", "2"],
        id_col="ID",
    )
    channels_seq.append(seq_v)

print(channels_seq)

result_cat = compute_cat_distance_matrix(
    channels=channels_seq,
    method="OM",
    sm=["TRATE"] * V,          
    indel="auto",              
    link="sum",                
    norm="none",               
    full_matrix=True,
    what="diss",
    cweight=[1.0] * V,
    with_missing=[False] * V,  
    ch_sep="+"
)
D_sequenzo = result_cat.to_numpy() if hasattr(result_cat, "to_numpy") else np.asarray(result_cat)


[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 10
[>] Number of time points: 20
[>] Min/Max sequence length: 20 / 20
[>] States: ['1', '2']
[>] Labels: ['1', '2']
[>] Weights: Not provided

[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequence

In [4]:
print(D_sequenzo)
print("\n")

# Load D_traminer before printing using custom unpickler for NumPy 2.x compatibility
with open("dist_om_traminer.pkl", "rb") as f:
    unpickler = NumpyCompatibilityUnpickler(f)
    D_traminer = unpickler.load()

print(D_traminer)

[[  0.         551.438464   590.7011487  558.23320021 569.50106608
  523.98719994 597.83316371 552.43455992 545.32027421 588.18970496]
 [551.438464     0.         563.99885467 479.81157105 565.7535393
  570.25897148 568.55825371 558.1578998  508.94951632 589.03445094]
 [590.7011487  563.99885467   0.         540.75665436 591.88421763
  569.41351047 568.18946827 551.76331639 537.47650411 569.35206041]
 [558.23320021 479.81157105 540.75665436   0.         568.69462841
  553.26117698 577.60373211 565.74508872 481.45051188 549.96750797]
 [569.50106608 565.7535393  591.88421763 568.69462841   0.
  472.57457624 550.19533146 598.56026128 582.9601302  542.81149401]
 [523.98719994 570.25897148 569.41351047 553.26117698 472.57457624
    0.         557.13888219 578.74768414 563.05138514 561.39741823]
 [597.83316371 568.55825371 568.18946827 577.60373211 550.19533146
  557.13888219   0.         606.49532806 548.92176249 538.15183142]
 [552.43455992 558.1578998  551.76331639 565.74508872 598.560261

In [5]:
with open("dist_om_traminer.pkl", "rb") as f : 
    unpickler = NumpyCompatibilityUnpickler(f)
    D_traminer = unpickler.load()
    
def upper_triangle_flat(M):
    """Retourne les valeurs du triangle supérieur strict sous forme 1D."""
    return M[np.triu_indices_from(M, k=1)]

v1 = upper_triangle_flat(D_sequenzo)
v2 = upper_triangle_flat(D_traminer)


corr = np.corrcoef(v1, v2)[0, 1]

print(f"[corr] Pearson correlation between Sequenzo and Traminer = {corr:.6f}")

[corr] Pearson correlation between Sequenzo and Traminer = 1.000000


In [2]:
# 验证 1：R 和 Python 的 cat sequence data 是否相同 --> ✅

import pandas as pd
import numpy as np

r_cat_sequence_data = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/cat_sequence_data_R.csv")
r_cat_sequence_data = r_cat_sequence_data.to_numpy()

python_cat_sequence_data = np.loadtxt("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/cat_sequence_data_python.csv", delimiter=",", dtype=str)

np.array_equal(r_cat_sequence_data, python_cat_sequence_data)

True

In [None]:
# 验证 2：R 和 Python 的 cat substmat_list 是否相同 --> ✅
# 验证 3: R 和 Python 的 cat sm 是否相同 --> ✅（只不过由于索引不同，顺序有所改变，但值仍然一一对应）

import pandas as pd
import numpy as np

r_cat_sequence_data = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/cat_sequence_data_R.csv")
r_cat_sequence_data = r_cat_sequence_data.to_numpy()

python_cat_sequence_data = np.loadtxt("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/cat_sequence_data_python.csv", delimiter=",", dtype=str)

np.array_equal(r_cat_sequence_data, python_cat_sequence_data)