In [1]:
import pandas as pd
import math
import gc
import pickle
from scipy.linalg import norm
from sklearn.preprocessing import normalize
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
import os
from itertools import combinations
import numpy as np
from random import seed, sample
import random
import numpy as np
from collections import Counter
import networkx as nx
import scipy.stats as stats
from scipy.stats import sem
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.decomposition import PCA

import hypernetx as hnx  # Hypergraph library
import itertools
from scipy.linalg import eigvalsh

#sys.path.append('/home/ll16598/Documents/Altered_States_Reddit/model_pipeline/__pycache__')
#from quality import reconst_qual, topic_diversity, coherence_centroid, coherence_pairwise #written for this jupyter notebook

In [52]:
import gudhi as gd
import gudhi.representations
def compute_persistence_diagram(data):
    # Compute the Rips complex
    rips_complex = gd.RipsComplex(points=data, max_edge_length=ML)
    # Construct a simplex tree
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=3)
    # Compute persistent homology
    persistence = simplex_tree.persistence()
    # Plot persistence diagram`
    gd.plot_persistence_diagram(persistence)
    gd.plot_persistence_barcode(persistence)

    plt.show()
    return persistence

def add_geometric_centroid(data):
    """
    Given an (N, D) array 'data' of N points in D dimensions,
    computes the mean (geometric centroid) and appends it
    as an extra row at the end.

    Returns:
      data_with_centroid: an (N+1, D) array,
        where the last row is the centroid.
      centroid_index: the integer index of the new centroid row.
    """
    centroid = np.mean(data, axis=0)             # shape (D,)
    data_with_centroid = np.vstack([data, centroid])
    centroid_index = data_with_centroid.shape[0] - 1
    return data_with_centroid, centroid_index

def build_centroid_distance_matrix(data_with_centroid, centroid_index, large_val=1e6):
    """
    Creates a distance matrix where only edges from the 'centroid_index'
    to other points have the real Euclidean distance.
    All other pairwise distances are set to 'large_val'.
    """
    N = data_with_centroid.shape[0]
    dist_matrix = np.full((N, N), large_val, dtype=float)

    # Diagonal = 0
    np.fill_diagonal(dist_matrix, 0.0)

    # Compute distances between centroid and each other point
    for i in range(N):
        if i == centroid_index:
            continue
        # Real distance from centroid -> i
        dist = cosine_similarity(data_with_centroid[centroid_index].reshape(1,-1),\
                                 data_with_centroid[i].reshape(1,-1))
        dist_matrix[centroid_index, i] = dist
        dist_matrix[i, centroid_index] = dist

    return dist_matrix

def compute_persistence_centroid(data, max_edge_length=3.0, plotting=True):
    """
    1) Compute the geometric centroid of 'data' and append it as an extra row.
    2) Build a distance matrix such that only the centroid can connect to other points.
    3) Construct the Rips complex using the custom distance matrix.
    4) Compute and plot the persistence diagram and barcode.

    Returns:
      persistence: The list of (dim, (birth, death)) intervals from GUDHI
    """
    # 1) Add the centroid
    data_with_centroid, centroid_index = add_geometric_centroid(data)

    # 2) Build the custom distance matrix
    dist_matrix = build_centroid_distance_matrix(data_with_centroid, centroid_index, large_val=1e6)

    # 3) Create RipsComplex from the distance matrix
    rips_complex = gd.RipsComplex(distance_matrix=dist_matrix, max_edge_length=max_edge_length)
    
    if SPARSE:
        rips_complex = gd.RipsComplex(distance_matrix=dist_matrix, max_edge_length=max_edge_length, sparse=sparse_param)
    else:
        rips_complex = gd.RipsComplex(distance_matrix=dist_matrix, max_edge_length=max_edge_length)
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=dims_simplex)
    del dist_matrix  # Delete objects
    gc.collect()  # Force garbage collection to free memory
    # 4) Compute persistent homology
    persistence = simplex_tree.persistence()
    if not plotting:
        return rips_complex, simplex_tree, persistence
    # 5) Plot the persistence diagram & barcode
    gd.plot_persistence_diagram(persistence)
    gd.plot_persistence_barcode(persistence)
    plt.show()

    return rips_complex, simplex_tree, persistence


def get_alive_components_over_scales(births, deaths, step=0.025):
    """
    Given lists of (birth, death) intervals for a particular homology dimension,
    compute how many such features (e.g., connected components if D=0, loops if D=1, etc.)
    are 'alive' at increments of 'step' from 0 up to max(deaths).

    Returns:
    - scales: list of scale values (0, 0.025, 0.05, ...)
    - alive_counts: corresponding list of how many features are alive at each scale
    """
    if len(births) == 0:
        # No intervals => no features
        return [], []
    
    max_death = max(deaths)
    scales = np.arange(0, max_death + 1e-9, step)
    
    alive_counts = []
    for s in scales:
        # Count intervals that are alive: birth <= s < death
        count_alive = sum(1 for (b, d) in zip(births, deaths) if b <= s < d)
        alive_counts.append(count_alive)
    
    return list(scales), alive_counts

def get_rips_time_centroid(df, embeddings='sentence_embeddings_pca', step=0.025, D=0):
    """
    For each row in df, build a Rips complex, extract dimension-D intervals
    (e.g., D=0 => connected components, D=1 => loops, etc.),
    then compute how many such features are 'alive' at increments of 'step'.

    Creates two new columns in df:
    - f"scales_dim{D}": The scale values
    - f"alive_dim{D}": The counts of alive features at each scale
    """
    # Copy the DataFrame to avoid SettingWithCopy warnings
    df = df.copy()
    
    # Prepare two new columns (lists)
    df[f'centroid_scales_dim{D}'] = None
    df[f'centroid_alive_dim{D}'] = None
    df[f'rt_centroid'] = None

    for idx, row in df.iterrows():
#         if idx in [26, 27, 28]:
#             print('warning: skipping a computationally intensive sample')
#             continue
        # Get the embeddings for this row
        embed = row[embeddings]
        if not isinstance(embed, (list, np.ndarray)) or len(embed) == 0:
            continue
        
        # Build the Rips Complex
        rips_complex, simplex_tree,persistence= compute_persistence_centroid(embed,max_edge_length=ML,plotting=False)
        births_dimD = []
        deaths_dimD = []
        
        for dim, (b, d) in persistence:
            if dim == D and d != float('inf'):  # ignoring infinite intervals
                births_dimD.append(b)
                deaths_dimD.append(d)

        # Compute how many features are alive at each scale
        scales, alive_components = get_alive_components_over_scales(births_dimD, deaths_dimD, step=step)
        df.at[idx, f"rt_centroid"] = max(deaths_dimD)
        # Store these lists in the new columns
        df.at[idx, f"centroid_scales_dim{D}"] = scales
        df.at[idx, f"centroid_alive_dim{D}"] = alive_components
        del rips_complex, simplex_tree, persistence, births_dimD, deaths_dimD, scales, alive_components  # Delete objects
        gc.collect()  # Force garbage collection to free memory
        
        #time.sleep(0.2)  # Pause for 0.5 seconds
    return df




import networkx as nx
import numpy as np
import community  # for Louvain modularity detection (python-louvain)
from networkx.algorithms import approximation

import collections
import itertools

def ff3(x):
    return x*(x-1)*(x-2)

def avg_tetr_cc(g):
    tetrahedra = itertools.islice(itertools.groupby(
        nx.enumerate_all_cliques(g), len), 3, 4)
    try:
        tetrahedra = next(tetrahedra)[1]
    except StopIteration:
        return 0
    cnts = collections.Counter(itertools.chain(*tetrahedra))
    return 6 * sum(cnt / ff3(g.degree[v]) for v, cnt in cnts.items()) / len(g)



def compute_graph_metrics(G):
    """
    Computes various network metrics for a given graph G, including Laplacian eigenvalues.
    
    Metrics:
    - Shortest Path (Weighted & Unweighted)
    - Number of Triangles
    - Number of Tetrahedra (4-cliques)
    - Modularity using Louvain (Weighted)
    - Clustering Coefficient
    - Max & Mean Degree
    - Max & Mean Betweenness Centrality
    - Max & Mean Strength (Weighted Degree)
    - Second Smallest Laplacian Eigenvalue (Fiedler Value)
    - Largest Laplacian Eigenvalue

    Parameters:
    - G (networkx.Graph): A 3-skeleton graph with weighted edges.

    Returns:
    - Dictionary with computed graph metrics.
    """
    metrics = {
        "shortest_path_unweighted": np.nan,
        "nodes":np.nan,
        "shortest_path_weighted": np.nan,
        "num_triangles": np.nan,
        "num_tetrahedra": np.nan, #maybe also area of these
        "modularity_louvain": np.nan,
        "clustering_coefficient": np.nan,
        "max_degree": np.nan,
        "mean_degree": np.nan,
        "max_betweenness": np.nan,
        "mean_betweenness": np.nan,
        "max_strength": np.nan,
        "mean_strength": np.nan,
        "fiedler_value": np.nan,
        "largest_laplacian_eigenvalue": np.nan
    }

    if not G or G.number_of_nodes() < 2:
        return metrics

    # Sorted nodes
    sorted_nodes = sorted(G.nodes())

    # Shortest Path (Unweighted & Weighted)
    first_node, last_node = sorted_nodes[0], sorted_nodes[-1]
    if nx.has_path(G, first_node, last_node):
        metrics["shortest_path_unweighted"] = nx.shortest_path_length(G, source=first_node, target=last_node)
        metrics["shortest_path_weighted"] = nx.shortest_path_length(G, source=first_node, target=last_node, weight='weight')

    # Number of triangles (3-cliques)
    metrics["num_triangles"] = sum(nx.triangles(G).values()) // 3  # Each triangle counted 3 times

    # Number of tetrahedra (4-cliques)
    metrics["num_tetrahedra"] = avg_tetr_cc(G)  # Ensure avg_tetr_cc is defined


    # Clustering Coefficient
    metrics["clustering_coefficient"] = nx.average_clustering(G, weight='weight')

    # Degree (Max & Mean)
    degrees = dict(G.degree())
    metrics["max_degree"] = max(degrees.values())
    metrics["mean_degree"] = np.mean(list(degrees.values()))

    # Betweenness Centrality (Max & Mean)
    betweenness = nx.betweenness_centrality(G, weight='weight')
    metrics["max_betweenness"] = max(betweenness.values())
    metrics["mean_betweenness"] = np.mean(list(betweenness.values()))
    for u, v, data in G.edges(data=True):
        original_weight = data.get('weight', 1)  # default to 1 if no weight provided
        # Avoid division by zero:
        if original_weight != 0:
            data['inv_weight'] = 1 / original_weight
        else:
            data['inv_weight'] = 0  # or some default value that makes sense for your case

    # Louvain Modularity (Weighted)
    comms = nx.community.louvain_communities(G, weight='inv_weight')
    metrics["modularity_louvain"] = nx.community.modularity(G, comms, weight='inv_weight')

    # Strength (Weighted Degree) (Max & Mean)
    strength = {node: sum(G[node][nbr].get('inv_weight', 1) for nbr in G[node]) for node in G.nodes()}
    metrics["max_strength"] = max(strength.values())
    metrics["mean_strength"] = np.mean(list(strength.values()))
    metrics['nodes']=len(G.nodes())
    # Compute Laplacian Eigenvalues
    L = nx.laplacian_matrix(G, weight='inv_weight').toarray()  # Convert sparse matrix to dense NumPy array
    eigenvalues = eigvalsh(L)  # Compute eigenvalues

    if len(eigenvalues) > 1:  # Ensure there are at least two eigenvalues
        metrics["fiedler_value"] = eigenvalues[1]  # Second smallest eigenvalue (λ₂)
        metrics["largest_laplacian_eigenvalue"] = eigenvalues[-1]  # Largest eigenvalue (λ_max)

    return metrics

def compute_distribution_stats(births, deaths, persistences):
    """
    Given arrays/lists of births, deaths, and persistences, compute summary stats.
    Returns a dict of named metrics.
    """
    births = np.array(births, dtype=float)
    deaths = np.array(deaths, dtype=float)
    pers   = np.array(persistences, dtype=float)
    
    if len(pers) == 0:
        return dict.fromkeys([
            'birth_rate','death_rate','mean_persistence','max_persistence',
            'std_persistence','skewness','kurtosis','entropy'
        ], np.nan)
    
    birth_rate = births.mean()
    death_rate = deaths.mean()
    mean_persistence = pers.mean()
    max_persistence  = pers.max()
    std_persistence  = pers.std(ddof=1)
    skewness = stats.skew(pers, bias=False)
    kurt = stats.kurtosis(pers, bias=False)
    number=len(pers)
    # If you truly want entropy of the raw "pers" values (not a histogram):
    # be aware that stats.entropy(pers) is not standard (it’s for discrete pmf).
    # Typically you'd do a histogram-based approach, but for demonstration:
    #   ent = stats.entropy(pers)
    # Or, a histogram-based approach:
    #   hist, _ = np.histogram(pers, bins='auto', density=True)
    #   ent = stats.entropy(hist) if np.any(hist > 0) else 0.0
    
    ent = stats.entropy(pers)  # Just following your snippet, though it's unusual
    
    return {
        'birth_rate': birth_rate,
        'death_rate': death_rate,
        'mean_persistence': mean_persistence,
        'max_persistence': max_persistence,
        'std_persistence': std_persistence,
        'skewness': skewness,
        'kurtosis': kurt,
        'entropy': ent,
        'number':number
    }



def visualize_rips_simplicial_complex(embed, dataset_name, entry, max_edge_length=3):
    """
    1) Builds a Rips complex (via GUDHI) from a set of high-dimensional points.
    2) Extracts simplices (up to dimension 2) from the simplex tree.
       - Edges (1-simplices) and triangles (2-simplices).
    3) Uses PCA to reduce the points to 3D.
    4) Plots a 3D visualization:
       - Nodes are shown as a scatter plot.
       - Edges are drawn as lines.
       - Triangles are drawn as filled polygons (using Poly3DCollection).
    
    Parameters:
    -----------
    embed : np.ndarray of shape (N, D)
        The high-dimensional point cloud.
    max_edge_length : float
        The maximum edge length used in the Rips complex.
    """
    # 1) Build the Rips complex and create the simplex tree
    rips_complex = gd.RipsComplex(points=embed, max_edge_length=max_edge_length)
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=dims_simplex)
    
    # 2) Extract simplices:
    edges = []
    triangles = []
    
    # get_skeleton(2) returns all simplices up to dimension 2
    for simplex, fvalue in simplex_tree.get_skeleton(4):
        if len(simplex) == 2:
            # 1-simplices: edges
            edges.append(simplex)
        elif len(simplex) == 3:
            # 2-simplices: triangles
            triangles.append(simplex)
    # 3) Use PCA to reduce the point cloud to 3D
    pca = PCA(n_components=3)
    coords_3d = pca.fit_transform(embed)  # shape (N, 3)
    n_points = coords_3d.shape[0]
    
    # Prepare colormap for nodes (using 'magma_r')
    norm = plt.Normalize(vmin=0, vmax=n_points - 1)
    cmap = plt.get_cmap('plasma_r')
    node_colors = cmap(norm(np.arange(n_points)))
    
    # 4) Create the 3D plot
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    
    # Plot nodes
    sc = ax.scatter(coords_3d[:, 0], coords_3d[:, 1], coords_3d[:, 2],
                    c=node_colors, s=30, alpha=0.9)
    
    # Plot edges as lines
    for edge in edges:
        i, j = edge
        x_vals = [coords_3d[i, 0], coords_3d[j, 0]]
        y_vals = [coords_3d[i, 1], coords_3d[j, 1]]
        z_vals = [coords_3d[i, 2], coords_3d[j, 2]]
        # Optionally, color edge based on one endpoint's index or the average.
        avg_idx = int(np.mean(edge))
        edge_color = cmap(norm(avg_idx))
        ax.plot(x_vals, y_vals, z_vals, color=edge_color, alpha=0.8, linewidth=1.5)
    
    # Plot triangles as filled faces
    face_polys = []
    face_colors = []
    for tri in triangles:
        # Get the 3 vertices for this triangle
        pts = [coords_3d[idx] for idx in tri]
        face_polys.append(pts)
        # Color can be computed from the average index of the triangle's vertices
        avg_idx = int(np.mean(tri))
        face_colors.append(cmap(norm(avg_idx)))
    
    # Create a Poly3DCollection for the triangles with a set transparency (alpha)
    poly_collection = Poly3DCollection(face_polys, alpha=0.3, edgecolor='k')
    poly_collection.set_facecolor(face_colors)
    ax.add_collection3d(poly_collection)
    
    # Set title and labels
    ax.set_title(f"", pad=20)
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.set_zlabel("PCA 3")
    
    # Add colorbar for node indices
    sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax, pad=0.1)
    cbar.set_label("Node Index")
        # Define three different viewing angles
    angles = [(15, 180), (30, 90), (45, 0)]  # (elevation, azimuth) in degrees
    dir_fig_save=working_dir+f'rips_skeletons/{dataset_name}_{window}_{step}/'
    os.makedirs(dir_fig_save, exist_ok=True)

    # Save figures from different angles
    for i, (elev, azim) in enumerate(angles):
        ax.view_init(elev=elev, azim=azim)  # Set camera angle
        filename = dir_fig_save+f"{entry}_{i}.png"
        plt.savefig(filename, dpi=300, bbox_inches='tight')  # Save figure
       # print(f"Saved: {filename}")

    #plt.show()



def compute_euler_characteristic(simplex_tree, max_dim=4):
    """
    Compute the Euler characteristic of a simplicial complex represented by a GUDHI simplex tree.
    
    Parameters:
      simplex_tree: A GUDHI simplex tree containing simplices up to dimension max_dim.
      max_dim: Maximum dimension to consider (e.g., 3 for tetrahedra).
    
    Returns:
      euler: The Euler characteristic computed as 
             f0 - f1 + f2 - f3 + ... (up to max_dim).
    """
    if not simplex_tree:
        return None
    # Dictionary to store counts for each dimension
    simplex_counts = {}
    for d in range(max_dim + 1):
        # In GUDHI, a d-simplex is a simplex with d+1 vertices.
        simplices_d = [simplex for simplex, filt in simplex_tree.get_skeleton(d) if len(simplex) == d + 1]
        simplex_counts[d] = len(simplices_d)
    #    print(f"Number of {d}-simplices (f_{d}): {simplex_counts[d]}")
    
    # Euler characteristic: sum_{d=0}^{max_dim} (-1)^d * f_d
    euler = sum(((-1) ** d) * simplex_counts[d] for d in range(max_dim + 1))
    return euler

def get_rips_complex_G(df, embedding=str('sentence_embeddings')):
    df['graph']=None
    df['density']=None
    df['edges']=None
    df['tris']=None
    df['tetra']=None
    df['penta']=None

    for idx, row in df.iterrows():
        G=nx.Graph()
        
        H = {}  # Hypergraph as a dictionary: {hyperedge_id: [vertices]}
        embed = row[embedding] 
        rips_complex = row['rt_rips']
        if not rips_complex:
            continue
        simplex_tree =  rips_complex.create_simplex_tree(max_dimension=dims_simplex)

        edges2 = []
        triangles = []
        tetrahedrons = []
        fives=[]
        # get_skeleton(2) returns all simplices up to dimension 2
        for simplex, fvalue in simplex_tree.get_skeleton(4):
            if len(simplex) >= 2:
                for (i, j) in itertools.combinations(simplex, 2):
                    G.add_edge(i, j, weight=fvalue)
            if len(simplex) == 2:
                edges2.append(simplex)
            elif len(simplex) == 3:
                triangles.append(simplex)
            elif len(simplex) == 4:
                tetrahedrons.append(simplex)
            elif len(simplex) == 5:
                fives.append(simplex)

                    
        print(len(G.nodes()), 'nodes')
        print(len(embed), 'embed len')
        df['edges'].loc[idx]=len(edges2)
        df['tris'].loc[idx]=len(triangles)
        df['tetra'].loc[idx]=len(tetrahedrons)
        df['penta'].loc[idx]=len(tetrahedrons)
        df['graph'].loc[idx]=G
        df['density'].loc[idx]=nx.density(G)

        
    return df#df

    ####
import psutil

# Create a psutil Process object for memory checks
process = psutil.Process()
memory_threshold_mb=7000
def get_rips_time(df, embeddings='sentence_embeddings', step=0.025):
    """
    For each row in df, build a Rips complex, extract dimension-D intervals
    (e.g., D=0 => connected components, D=1 => loops, etc.),
    then compute how many such features are 'alive' at increments of 'step'.

    Creates two new columns in df:
    - f"scales_dim{D}": The scale values
    - f"alive_dim{D}": The counts of alive features at each scale
    """
    # Copy the DataFrame to avoid SettingWithCopy warnings
    df = df.copy()
    
    # Prepare two new columns (lists)
    df[f'scales_dim0'] = None
    df[f'alive_dim0'] = None
    df[f'scales_dim1'] = None
    df[f'alive_dim1'] = None
    df[f'scales_dim2'] = None
    df[f'alive_dim2'] = None
    df['rt'] = None
    df["rt_rips"]=None
    df["rips"]=None
    for idx, row in df.iterrows():
        # Get the embeddings for this row
        embed = row[embeddings]
        if not isinstance(embed, (list, np.ndarray)) or len(embed) == 0:
            continue
        
        # Build the Rips Complex
        if SPARSE:
            rips_complex = gd.RipsComplex(points=embed, max_edge_length=3, sparse=sparse_param)
        else:
            rips_complex = gd.RipsComplex(points=embed, max_edge_length=3)
            
        current_mem_mb = process.memory_info().rss / (1024 * 1024)
    #    print('rips mem',idx, current_mem_mb)
        simplex_tree = rips_complex.create_simplex_tree(max_dimension=dims_simplex)
        current_mem_mb = process.memory_info().rss / (1024 * 1024)
     #   print('simplex mem', current_mem_mb)
        if current_mem_mb > memory_threshold_mb:
            continue
        # Extract dimension-D intervals from persistence
        persistence = simplex_tree.persistence()
        
        current_mem_mb = process.memory_info().rss / (1024 * 1024)
   #     print('persistence mem', current_mem_mb)
        if current_mem_mb > memory_threshold_mb:
            continue
        for D in [0,1,2]:
            births_dimD = []
            deaths_dimD = []

            for dim, (b, d) in persistence:
                if dim == D and d != float('inf'):  # ignoring infinite intervals
                    births_dimD.append(b)
                    deaths_dimD.append(d)

            # Compute how many features are alive at each scale
            scales, alive_components = get_alive_components_over_scales(births_dimD, deaths_dimD, step=step)
            if len(deaths_dimD)>0:
                df.at[idx, f"rt"] = max(deaths_dimD)
            # Store these lists in the new columns
            df.at[idx, f"scales_dim{D}"] = scales
            df.at[idx, f"alive_dim{D}"] = alive_components
        rips_complex_max = gd.RipsComplex(points=embed, max_edge_length=df["rt"].loc[idx], \
                                          sparse=sparse_param)
        current_mem_mb = process.memory_info().rss / (1024 * 1024)
      #  print('rips max mem', current_mem_mb)
#         if current_mem_mb > memory_threshold_mb:
#             continue
        #simplex_tree_max = rips_complex.create_simplex_tree(max_dimension=dims_simplex)
        df.at[idx, f"rt_rips"] = rips_complex_max
        df.at[idx, f"rips"] = rips_complex

    return df

import math

def count_new_simplices_by_dimension_in_bins(simplex_tree, dimensions=(2, 3, 4, 5), bin_size=0.025):
    # Initialize a dictionary for each dimension of interest.
    counts = {dim: {} for dim in dimensions}
    
    # Get the (simplex, filtration) pairs from the simplex tree.
    simplex_filtration = simplex_tree.get_filtration()
    
    for simplex, filt in simplex_filtration:
        # Determine the dimension of the simplex.
        dim = len(simplex) - 1
        # Only process simplices with a dimension in the provided set.
        if dim in dimensions:
            bin_start = math.floor(filt / bin_size) * bin_size
            counts[dim][bin_start] = counts[dim].get(bin_start, 0) + 1
            
    return counts
def get_simplices_over_time(df, max_dimension=4,simplex_tree_type='rt_simplex_tree'):
    """
    For each row in the dataframe (which contains a simplex tree in the 'simplex_tree' column),
    compute the simplex counts at different filtration values for dimensions 2, 3, and 4.
    
    The filtration values and the corresponding counts are stored in separate columns.
    
    New columns added:
      - simplex_time_dim{D}_filtration: the list of filtration values.
      - simplex_time_dim{D}_count: the list of simplex counts for that dimension.
      
    Parameters:
        df: pandas DataFrame that contains a column 'simplex_tree'
        max_dimension: maximum dimension to be passed to count_simplices (not used in this snippet,
                       but can be used if you want to generalize further).
    
    Returns:
        The dataframe with additional columns.
    """
    
    # For each dimension, create two new columns for the filtration values and counts.
    for D in [2, 3, 4]:
        df[f'simplex_time_dim{D}_filtration'] = None
        df[f'simplex_time_dim{D}_count'] = None

    # Process each row in the dataframe.
    for idx, row in df.iterrows():
        # Calculate the counts for the current simplex tree.
        simplex_counts = count_new_simplices_by_dimension_in_bins(row[simplex_tree_type])
        for D in [2,3,4]:
            # Get the dictionary for the current dimension D.
            # (If there are no simplices of this dimension, we set empty lists.)
            if D in simplex_counts and simplex_counts[D]:
                # Sort the bins so that the lists are ordered by increasing filtration value.
                bins = sorted(simplex_counts[D].keys())
                counts_list = [simplex_counts[D][b] for b in bins]
            else:
                bins, counts_list = [], []
            df.at[idx, f'simplex_time_dim{D}_filtration'] = bins
            df.at[idx, f'simplex_time_dim{D}_count'] = counts_list

    return df


In [None]:
# 3) Define a helper function to transform a single row's embeddings
dir_array='/home/ll16598/Documents/POSTDOC/TDA/TDA_cluster/utt_span_vectors'
pooling_method='mean'
if pooling_method=='max':
    ML=5
else:
    ML=3
embeddings='sentence_embeddings'
reduce_dims=True
SPARSE=True
sparse_param=0.5
dims_simplex=3
chunk_size=5
df_monologs=pd.read_csv(f'{dir_atom_dfs}/df_monolog_{threshold}.csv')
df_SER2=pd.read_csv(f'{dir_atom_dfs}/df_SER2_{threshold}.csv')
df_PEM=pd.read_csv(f'{dir_atom_dfs}/df_PEM.csv')
df_SER_MA=pd.read_csv(f'{dir_atom_dfs}/SER1.csv')
preprocessed_df_dir='/home/ll16598/Documents/POSTDOC/preprocessed_dfs'
df_MASM=pd.read_csv(preprocessed_df_dir + '/MASM.csv')
df_DEI=pd.read_csv(preprocessed_df_dir + '/cleaned_DEI.csv')

print(f'using {pooling_method} pooling')
import gc
import shutil
from tqdm import tqdm
df_names=['PEM_df','SER_monologs', 'SER_IPSP', 'SER1', 'MASM', 'cleaned_DEI']
dfs1=[df_PEM, df_monologs, df_SER2, df_SER_MA, df_MASM, df_DEI]
if len(df_names)!=len(dfs1):
    raise Exception('MISMATCH IN NAMES/dfs') 
data_save_dir=working_dir+'TDA_output/'
os.makedirs(data_save_dir, exist_ok=True)

completed_files=os.listdir(data_save_dir)
for span in [1, 2,3]:
    layers='last'
    dfs=dfs1.copy()
    for df_no, df_monolog in enumerate(dfs):

        df_name=df_names[df_no]
#             if newfilename in completed_files:
#                 print(f'Already completed {newfilename}')
#                 continue
        data_save_dir_name=working_dir+f'TDA_output/{df_name}_{span}_{pooling_method}/'
        if os.path.exists(data_save_dir_name):
            shutil.rmtree(data_save_dir_name)  # Deletes the entire directory and its contents
        os.makedirs(data_save_dir_name, exist_ok=True)

        with open(f'{dir_array}/utterance_{span}_{df_name}_{pooling_method}_back_sentence_embeddings_arrays.pkl', 'rb') as f:
            embeds= pickle.load(f)
            if pooling_method=='max':
                embeds=[np.array(i) for i in embeds]
        newfilename=f'{df_name}_utterance_distance_results.csv'


        if len(embeds)!=len(df_monolog):
            raise Exception('MISMATCH IN LENGTH')

        df_monolog['sentence_embeddings'] = embeds


        print(len(embeds))
        print(len(df_monolog))
        if len(embeds)!=len(df_monolog):
            raise Exception('MISMATCH IN LENGTH')

        df_monolog['sentence_embeddings'] = embeds
        df_monolog['length'] = [len(i) for i in embeds]
        if test_mode:
            df_monolog=df_monolog[0:10]
        df_monolog = df_monolog[
            df_monolog["sentence_embeddings"].apply(
                lambda x: (
                    not isinstance(x, float)               # exclude floats
                    and isinstance(x, (list, tuple, np.ndarray))  # must be list/tuple/np.ndarray
                    and len(x) >= 3                        # length >= 3
                )
            )]

        # 4) Apply that function to each row -> produce a new column
        if reduce_dims:
            all_vecs = []
            for row in df_monolog['sentence_embeddings']:
                arr = np.array(row)  
                all_vecs.append(arr)
            big_matrix = np.concatenate(all_vecs, axis=0)
            pca = PCA(n_components=50)
            pca.fit(big_matrix)
            def transform_embeddings(emb_list):
                emb_array = np.array(emb_list)   # shape (k_i, 384)
                emb_pca = pca.transform(emb_array)  # shape (k_i, 50)
                return emb_pca
            df_monolog['sentence_embeddings'] = df_monolog['sentence_embeddings'].apply(transform_embeddings)
        df_monolog['token_embeddings']=None
        print('performing TDA on ',df_name, ' span: ', span)#, 'step: ', step)

        for fi in tqdm(range(0, len(df_monolog), chunk_size), desc="Processing Chunks"):
            if fi+chunk_size>=len(df_monolog):
                df_subset=df_monolog[fi:].reset_index(drop=True)
            else:
                df_subset=df_monolog[fi:fi+chunk_size].reset_index(drop=True)


           # drugs=list(set(df_subset['Drug']))
            #Participants=list(set(df_subset['Participant']))
            df_subset=get_rips_time(df_subset,embeddings=embeddings)
           # df_subset=get_rips_time_centroid(df_subset,embeddings=embeddings)
            #df_monolog=get_simplices_over_time(df_monolog,simplex_tree_type='simplex_tree')
            #print('got RIPS')
            df_with_graph=get_rips_complex_G(df_subset)
            #print('got G')
            #df_with_graph['euler'] = df_with_graph['rt_simplex_tree'].apply(lambda st: compute_euler_characteristic(st, max_dim=4))
            # Apply the function to each graph in df_with_graph
            graph_metrics = df_with_graph['graph'].apply(compute_graph_metrics)
            graph_metrics_df = pd.DataFrame(graph_metrics.tolist())
            df_with_graph = pd.concat([df_with_graph, graph_metrics_df], axis=1)
                        # Create a new DataFrame

            # We'll accumulate new rows in a list of dicts
            new_rows = []
            dimensions = [0, 1, 2]

            for idx, row in df_with_graph.iterrows():
                embed = row[embeddings]  # Adjust as needed
                # We’ll store births, deaths, pers LENGTHS in a dict keyed by dimension
                dim_dict = {
                    dim: {'births': [], 'deaths': [], 'pers': []}
                    for dim in dimensions
                }



                # Build the Rips Complex for *this row only*
                rips_complex =row['rips']
                try:
                    simplex_tree = \
                    rips_complex.create_simplex_tree(max_dimension=dims_simplex)
                except Exception as e:
                    continue
                persistence = simplex_tree.persistence()

                # Collect intervals by dimension
                for dim, (b, d) in persistence:
                    if d == float('inf'):
                        continue
                    if dim in dimensions:
                        dim_dict[dim]['births'].append(b)
                        dim_dict[dim]['deaths'].append(d)
                        dim_dict[dim]['pers'].append(d - b)


                row_dict = row.to_dict()  # Start with original row's columns

                for dim in dimensions:
                    bdp = dim_dict[dim]
                    stats_dict = compute_distribution_stats(bdp['births'], bdp['deaths'], bdp['pers'])
                    # prefix each stat key with dim
                    for stat_key, stat_val in stats_dict.items():
                        row_dict[f"{stat_key}_dim{dim}"] = stat_val

                # Add row_dict to new_rows
                new_rows.append(row_dict)

            # Create a new DataFrame
            df_with_tda = pd.DataFrame(new_rows)
            df_with_tda.to_csv(data_save_dir_name + f'{df_name}_{fi}_{span}_TDA_results.csv')
            del df_subset
            del df_with_graph
            gc.collect()
        print(f'completed! {df_name} span: {span}')
        list_files=sorted(os.listdir(data_save_dir_name))
        ddfs=[]
        for f in list_files:
            ddfs.append(pd.read_csv(data_save_dir_name+f))
        data=pd.concat(ddfs)
        data.to_csv(data_save_dir+f'{df_name}_{span}_{pooling_method}_utterance_TDA_results.csv')

using mean pooling
130
130
performing TDA on  PEM_df  span:  1


Processing Chunks:   0%|                                 | 0/26 [00:00<?, ?it/s]

82 nodes
82 embed len
75 nodes
77 embed len
71 nodes
71 embed len
68 nodes
68 embed len
52 nodes
52 embed len


Processing Chunks:   4%|▉                        | 1/26 [00:33<13:53, 33.35s/it]

58 nodes
58 embed len
87 nodes
91 embed len
70 nodes
71 embed len
65 nodes
66 embed len
55 nodes
55 embed len


Processing Chunks:   8%|█▉                       | 2/26 [01:22<17:06, 42.79s/it]

67 nodes
68 embed len
51 nodes
51 embed len
94 nodes
99 embed len
76 nodes
76 embed len
76 nodes
77 embed len


Processing Chunks:  12%|██▉                      | 3/26 [02:36<21:53, 57.10s/it]

78 nodes
78 embed len
43 nodes
43 embed len
88 nodes
90 embed len
88 nodes
89 embed len
54 nodes
56 embed len


Processing Chunks:  15%|███▊                     | 4/26 [03:24<19:32, 53.31s/it]

103 nodes
104 embed len
92 nodes
93 embed len
110 nodes
111 embed len
78 nodes
86 embed len
67 nodes
68 embed len


Processing Chunks:  19%|████▊                    | 5/26 [05:19<26:27, 75.59s/it]

62 nodes
65 embed len
54 nodes
54 embed len
63 nodes
64 embed len
77 nodes
80 embed len
69 nodes
71 embed len


Processing Chunks:  23%|█████▊                   | 6/26 [05:59<21:10, 63.52s/it]

66 nodes
69 embed len
120 nodes
123 embed len
63 nodes
64 embed len
44 nodes
47 embed len
85 nodes
88 embed len


Processing Chunks:  27%|██████▋                  | 7/26 [08:14<27:27, 86.72s/it]

82 nodes
83 embed len
68 nodes
69 embed len
89 nodes
98 embed len
17 nodes
17 embed len
59 nodes
59 embed len


Processing Chunks:  31%|███████▋                 | 8/26 [08:59<22:02, 73.50s/it]

89 nodes
90 embed len
101 nodes
101 embed len
75 nodes
77 embed len
78 nodes
79 embed len
33 nodes
33 embed len


Processing Chunks:  35%|████████▋                | 9/26 [10:22<21:42, 76.60s/it]

94 nodes
98 embed len
69 nodes
70 embed len
128 nodes
128 embed len
111 nodes
112 embed len
57 nodes
57 embed len


Processing Chunks:  38%|████████▊              | 10/26 [13:25<29:09, 109.37s/it]

133 nodes
140 embed len
85 nodes
85 embed len
82 nodes
84 embed len
126 nodes
128 embed len
59 nodes
63 embed len


Processing Chunks:  42%|█████████▋             | 11/26 [18:26<42:02, 168.18s/it]

56 nodes
56 embed len
51 nodes
52 embed len
95 nodes
99 embed len
62 nodes
62 embed len
102 nodes
102 embed len


Processing Chunks:  46%|██████████▌            | 12/26 [19:43<32:43, 140.22s/it]

54 nodes
56 embed len
54 nodes
54 embed len
58 nodes
63 embed len
74 nodes
75 embed len
55 nodes
56 embed len


Processing Chunks:  50%|███████████▌           | 13/26 [20:10<22:57, 105.92s/it]

59 nodes
59 embed len
44 nodes
44 embed len
76 nodes
83 embed len
49 nodes
49 embed len
91 nodes
93 embed len


Processing Chunks:  54%|████████████▍          | 14/26 [22:03<21:37, 108.11s/it]

67 nodes
67 embed len
67 nodes
68 embed len
96 nodes
99 embed len
64 nodes
65 embed len
60 nodes
60 embed len


Processing Chunks:  58%|█████████████▊          | 15/26 [23:19<18:02, 98.45s/it]

120 nodes
126 embed len
76 nodes
76 embed len
88 nodes
90 embed len
51 nodes
51 embed len
93 nodes
94 embed len


Processing Chunks:  62%|██████████████▏        | 16/26 [26:15<20:19, 121.95s/it]

72 nodes
72 embed len
100 nodes
100 embed len
107 nodes
108 embed len
82 nodes
83 embed len
87 nodes
88 embed len


Processing Chunks:  65%|███████████████        | 17/26 [31:10<26:05, 173.92s/it]

72 nodes
73 embed len
88 nodes
92 embed len
70 nodes
71 embed len
85 nodes
88 embed len
83 nodes
87 embed len


Processing Chunks:  69%|███████████████▉       | 18/26 [33:06<20:50, 156.34s/it]

55 nodes
55 embed len
65 nodes
66 embed len
60 nodes
62 embed len
81 nodes
84 embed len
84 nodes
88 embed len


Processing Chunks:  73%|████████████████▊      | 19/26 [34:15<15:11, 130.24s/it]

42 nodes
43 embed len
65 nodes
69 embed len
47 nodes
47 embed len
79 nodes
79 embed len
83 nodes
87 embed len


Processing Chunks:  77%|█████████████████▋     | 20/26 [35:27<11:16, 112.79s/it]

67 nodes
67 embed len
84 nodes
84 embed len
62 nodes
63 embed len
74 nodes
79 embed len
57 nodes
62 embed len


Processing Chunks:  81%|██████████████████▌    | 21/26 [36:43<08:27, 101.57s/it]

88 nodes
89 embed len
65 nodes
67 embed len
107 nodes
107 embed len
93 nodes
93 embed len
69 nodes
71 embed len


Processing Chunks:  85%|███████████████████▍   | 22/26 [39:21<07:53, 118.48s/it]

63 nodes
66 embed len
87 nodes
87 embed len
61 nodes
64 embed len
65 nodes
65 embed len
95 nodes
95 embed len


Processing Chunks:  88%|████████████████████▎  | 23/26 [41:04<05:42, 114.02s/it]

38 nodes
38 embed len
43 nodes
45 embed len
91 nodes
93 embed len
89 nodes
90 embed len
118 nodes
119 embed len


Processing Chunks:  92%|█████████████████████▏ | 24/26 [44:31<04:44, 142.01s/it]

54 nodes
55 embed len
74 nodes
82 embed len
44 nodes
45 embed len
86 nodes
87 embed len
88 nodes
90 embed len


Processing Chunks:  96%|██████████████████████ | 25/26 [46:08<02:08, 128.32s/it]

79 nodes
80 embed len
83 nodes
87 embed len
111 nodes
111 embed len
72 nodes
72 embed len
59 nodes
59 embed len


Processing Chunks: 100%|███████████████████████| 26/26 [48:45<00:00, 112.51s/it]


completed! PEM_df span: 1
101
101
performing TDA on  SER_monologs  span:  1


Processing Chunks:   0%|                                 | 0/20 [00:00<?, ?it/s]

16 nodes
18 embed len
34 nodes
34 embed len
25 nodes
25 embed len
21 nodes
21 embed len
33 nodes
37 embed len


Processing Chunks:   5%|█▎                       | 1/20 [00:02<00:51,  2.72s/it]

33 nodes
36 embed len
32 nodes
32 embed len
38 nodes
38 embed len
33 nodes
33 embed len
53 nodes
54 embed len


Processing Chunks:  10%|██▌                      | 2/20 [00:10<01:46,  5.89s/it]

54 nodes
54 embed len
26 nodes
27 embed len
35 nodes
35 embed len
31 nodes
31 embed len
33 nodes
33 embed len


Processing Chunks:  15%|███▊                     | 3/20 [00:16<01:40,  5.94s/it]

49 nodes
49 embed len
34 nodes
37 embed len
27 nodes
27 embed len
34 nodes
36 embed len
49 nodes
50 embed len


Processing Chunks:  20%|█████                    | 4/20 [00:25<01:54,  7.13s/it]

57 nodes
59 embed len
54 nodes
57 embed len
18 nodes
18 embed len
8 nodes
8 embed len
56 nodes
57 embed len


Processing Chunks:  25%|██████▎                  | 5/20 [00:38<02:17,  9.17s/it]

69 nodes
70 embed len
64 nodes
68 embed len
47 nodes
47 embed len
28 nodes
28 embed len
43 nodes
44 embed len


Processing Chunks:  30%|███████▌                 | 6/20 [01:02<03:19, 14.26s/it]

35 nodes
38 embed len
38 nodes
39 embed len
40 nodes
42 embed len
21 nodes
21 embed len
33 nodes
43 embed len


Processing Chunks:  35%|████████▊                | 7/20 [01:06<02:21, 10.91s/it]

59 nodes
60 embed len
47 nodes
50 embed len
54 nodes
54 embed len
44 nodes
44 embed len
50 nodes
50 embed len


Processing Chunks:  40%|██████████               | 8/20 [01:25<02:39, 13.28s/it]

47 nodes
47 embed len
44 nodes
45 embed len
109 nodes
109 embed len
50 nodes
50 embed len
57 nodes
57 embed len


Processing Chunks:  45%|███████████▎             | 9/20 [02:41<06:01, 32.89s/it]

66 nodes
66 embed len
32 nodes
32 embed len
27 nodes
27 embed len
28 nodes
30 embed len
24 nodes
24 embed len


Processing Chunks:  50%|████████████            | 10/20 [03:03<04:55, 29.58s/it]

41 nodes
41 embed len
36 nodes
36 embed len
44 nodes
45 embed len
27 nodes
28 embed len
53 nodes
53 embed len


Processing Chunks:  55%|█████████████▏          | 11/20 [03:14<03:35, 23.90s/it]

42 nodes
42 embed len
68 nodes
70 embed len
62 nodes
70 embed len
99 nodes
101 embed len
15 nodes
15 embed len


Processing Chunks:  60%|██████████████▍         | 12/20 [04:38<05:37, 42.18s/it]

33 nodes
33 embed len
40 nodes
40 embed len
37 nodes
38 embed len
41 nodes
41 embed len
11 nodes
11 embed len


Processing Chunks:  65%|███████████████▌        | 13/20 [04:43<03:36, 30.88s/it]

14 nodes
15 embed len
11 nodes
11 embed len
24 nodes
24 embed len
18 nodes
18 embed len
21 nodes
21 embed len


Processing Chunks:  70%|████████████████▊       | 14/20 [04:44<02:11, 21.96s/it]

26 nodes
26 embed len
27 nodes
28 embed len
20 nodes
20 embed len
40 nodes
40 embed len
31 nodes
31 embed len


Processing Chunks:  75%|██████████████████      | 15/20 [04:47<01:20, 16.11s/it]

24 nodes
28 embed len
37 nodes
37 embed len
35 nodes
35 embed len
39 nodes
39 embed len
40 nodes
40 embed len


Processing Chunks:  80%|███████████████████▏    | 16/20 [04:52<00:51, 12.80s/it]

36 nodes
37 embed len
40 nodes
40 embed len
30 nodes
30 embed len
26 nodes
27 embed len
23 nodes
23 embed len


Processing Chunks:  85%|████████████████████▍   | 17/20 [04:56<00:30, 10.13s/it]

35 nodes
35 embed len
12 nodes
12 embed len
16 nodes
16 embed len
39 nodes
40 embed len
24 nodes
24 embed len


Processing Chunks:  90%|█████████████████████▌  | 18/20 [04:58<00:15,  7.80s/it]

63 nodes
63 embed len
19 nodes
19 embed len
25 nodes
25 embed len
17 nodes
17 embed len
24 nodes
26 embed len


Processing Chunks:  95%|██████████████████████▊ | 19/20 [05:06<00:07,  8.00s/it]

28 nodes
29 embed len
38 nodes
38 embed len
17 nodes
18 embed len
41 nodes
42 embed len
39 nodes
40 embed len


Processing Chunks: 100%|████████████████████████| 20/20 [05:13<00:00, 15.68s/it]


completed! SER_monologs span: 1
107
107
performing TDA on  SER_IPSP  span:  1


Processing Chunks:   0%|                                 | 0/22 [00:00<?, ?it/s]

37 nodes
37 embed len
59 nodes
59 embed len
45 nodes
46 embed len
53 nodes
53 embed len
36 nodes
37 embed len


Processing Chunks:   5%|█▏                       | 1/22 [00:12<04:12, 12.05s/it]

37 nodes
40 embed len
52 nodes
53 embed len
44 nodes
54 embed len
58 nodes
62 embed len
57 nodes
59 embed len


Processing Chunks:   9%|██▎                      | 2/22 [00:34<06:01, 18.08s/it]

46 nodes
46 embed len
77 nodes
82 embed len
40 nodes
40 embed len
55 nodes
55 embed len
45 nodes
47 embed len


Processing Chunks:  14%|███▍                     | 3/22 [00:57<06:26, 20.34s/it]

53 nodes
53 embed len
50 nodes
52 embed len
47 nodes
47 embed len
46 nodes
48 embed len
74 nodes
83 embed len


Processing Chunks:  18%|████▌                    | 4/22 [01:54<10:30, 35.03s/it]

60 nodes
61 embed len
45 nodes
45 embed len
35 nodes
39 embed len
36 nodes
36 embed len
39 nodes
44 embed len


Processing Chunks:  23%|█████▋                   | 5/22 [02:05<07:26, 26.25s/it]

65 nodes
67 embed len
64 nodes
66 embed len
70 nodes
71 embed len
47 nodes
53 embed len
64 nodes
66 embed len


Processing Chunks:  27%|██████▊                  | 6/22 [02:48<08:31, 31.94s/it]

44 nodes
47 embed len
33 nodes
36 embed len
37 nodes
38 embed len
20 nodes
21 embed len
33 nodes
34 embed len


Processing Chunks:  32%|███████▉                 | 7/22 [02:53<05:47, 23.17s/it]

59 nodes
60 embed len
32 nodes
34 embed len
26 nodes
26 embed len
49 nodes
52 embed len
53 nodes
53 embed len


Processing Chunks:  36%|█████████                | 8/22 [03:17<05:25, 23.24s/it]

43 nodes
43 embed len
58 nodes
59 embed len
47 nodes
47 embed len
74 nodes
75 embed len
56 nodes
58 embed len


Processing Chunks:  41%|██████████▏              | 9/22 [03:59<06:19, 29.23s/it]

72 nodes
76 embed len
77 nodes
79 embed len
79 nodes
83 embed len
60 nodes
61 embed len
57 nodes
57 embed len


Processing Chunks:  45%|██████████▉             | 10/22 [05:31<09:43, 48.66s/it]

44 nodes
44 embed len
35 nodes
36 embed len
49 nodes
51 embed len
37 nodes
39 embed len
38 nodes
39 embed len


Processing Chunks:  50%|████████████            | 11/22 [05:45<06:58, 38.08s/it]

18 nodes
20 embed len
42 nodes
46 embed len
39 nodes
39 embed len
45 nodes
46 embed len
37 nodes
39 embed len


Processing Chunks:  55%|█████████████           | 12/22 [05:53<04:50, 29.00s/it]

57 nodes
58 embed len
40 nodes
40 embed len
58 nodes
63 embed len
58 nodes
58 embed len
78 nodes
80 embed len


Processing Chunks:  59%|██████████████▏         | 13/22 [06:47<05:27, 36.39s/it]

100 nodes
100 embed len
16 nodes
16 embed len
52 nodes
56 embed len
57 nodes
58 embed len
35 nodes
35 embed len


Processing Chunks:  64%|███████████████▎        | 14/22 [07:51<05:58, 44.80s/it]

50 nodes
50 embed len
20 nodes
20 embed len
25 nodes
25 embed len
25 nodes
28 embed len
19 nodes
19 embed len


Processing Chunks:  68%|████████████████▎       | 15/22 [08:06<04:09, 35.68s/it]

69 nodes
78 embed len
90 nodes
99 embed len
75 nodes
84 embed len
25 nodes
26 embed len
23 nodes
25 embed len


Processing Chunks:  73%|█████████████████▍      | 16/22 [09:14<04:33, 45.64s/it]

23 nodes
23 embed len
59 nodes
60 embed len
59 nodes
60 embed len
49 nodes
50 embed len
74 nodes
74 embed len


Processing Chunks:  77%|██████████████████▌     | 17/22 [10:05<03:55, 47.06s/it]

75 nodes
78 embed len
82 nodes
85 embed len
50 nodes
50 embed len
61 nodes
63 embed len
40 nodes
50 embed len


Processing Chunks:  82%|███████████████████▋    | 18/22 [11:11<03:31, 52.79s/it]

25 nodes
25 embed len
23 nodes
23 embed len
55 nodes
56 embed len
56 nodes
59 embed len
63 nodes
68 embed len


Processing Chunks:  86%|████████████████████▋   | 19/22 [11:32<02:09, 43.17s/it]

35 nodes
35 embed len
28 nodes
28 embed len
30 nodes
30 embed len
53 nodes
54 embed len
71 nodes
74 embed len


Processing Chunks:  91%|█████████████████████▊  | 20/22 [12:00<01:17, 38.83s/it]

69 nodes
69 embed len
30 nodes
30 embed len
44 nodes
44 embed len
49 nodes
53 embed len
43 nodes
46 embed len


Processing Chunks:  95%|██████████████████████▉ | 21/22 [12:23<00:34, 34.00s/it]

45 nodes
49 embed len
62 nodes
64 embed len


Processing Chunks: 100%|████████████████████████| 22/22 [12:32<00:00, 34.21s/it]


completed! SER_IPSP span: 1
108
108
performing TDA on  SER1  span:  1


Processing Chunks:   0%|                                 | 0/22 [00:00<?, ?it/s]

8 nodes
8 embed len
4 nodes
4 embed len
6 nodes
6 embed len
8 nodes
8 embed len
10 nodes
10 embed len


Processing Chunks:   5%|█▏                       | 1/22 [00:00<00:14,  1.50it/s]

15 nodes
15 embed len
12 nodes
12 embed len
11 nodes
11 embed len
12 nodes
12 embed len
7 nodes
7 embed len


Processing Chunks:   9%|██▎                      | 2/22 [00:01<00:18,  1.10it/s]

6 nodes
7 embed len
8 nodes
9 embed len
5 nodes
9 embed len
10 nodes
10 embed len
10 nodes
10 embed len


Processing Chunks:  14%|███▍                     | 3/22 [00:02<00:17,  1.09it/s]

12 nodes
12 embed len
7 nodes
9 embed len
11 nodes
11 embed len
7 nodes
7 embed len
5 nodes
5 embed len


Processing Chunks:  18%|████▌                    | 4/22 [00:03<00:17,  1.00it/s]

7 nodes
7 embed len
6 nodes
6 embed len
8 nodes
8 embed len
7 nodes
7 embed len
8 nodes
8 embed len


Processing Chunks:  23%|█████▋                   | 5/22 [00:04<00:16,  1.02it/s]

7 nodes
7 embed len
10 nodes
10 embed len
15 nodes
15 embed len
5 nodes
5 embed len
9 nodes
9 embed len


Processing Chunks:  27%|██████▊                  | 6/22 [00:05<00:15,  1.06it/s]

9 nodes
9 embed len
8 nodes
9 embed len
11 nodes
11 embed len
8 nodes
8 embed len
7 nodes
7 embed len


Processing Chunks:  32%|███████▉                 | 7/22 [00:06<00:13,  1.13it/s]

10 nodes
10 embed len
7 nodes
8 embed len
8 nodes
8 embed len
13 nodes
13 embed len
8 nodes
8 embed len


Processing Chunks:  36%|█████████                | 8/22 [00:07<00:12,  1.11it/s]

10 nodes
10 embed len
9 nodes
9 embed len
6 nodes
6 embed len
14 nodes
15 embed len
20 nodes
20 embed len


Processing Chunks:  41%|██████████▏              | 9/22 [00:08<00:13,  1.07s/it]

12 nodes
12 embed len
23 nodes
23 embed len
21 nodes
21 embed len
11 nodes
11 embed len
12 nodes
12 embed len


Processing Chunks:  45%|██████████▉             | 10/22 [00:10<00:14,  1.21s/it]

9 nodes
9 embed len
11 nodes
11 embed len
16 nodes
17 embed len
8 nodes
8 embed len
6 nodes
6 embed len


Processing Chunks:  50%|████████████            | 11/22 [00:11<00:12,  1.10s/it]

11 nodes
12 embed len
7 nodes
7 embed len
10 nodes
10 embed len
6 nodes
6 embed len
6 nodes
6 embed len


Processing Chunks:  55%|█████████████           | 12/22 [00:12<00:11,  1.11s/it]

8 nodes
8 embed len
9 nodes
9 embed len
19 nodes
19 embed len
11 nodes
11 embed len
9 nodes
9 embed len


Processing Chunks:  59%|██████████████▏         | 13/22 [00:13<00:10,  1.20s/it]

10 nodes
13 embed len
4 nodes
4 embed len
10 nodes
10 embed len
8 nodes
9 embed len
4 nodes
4 embed len


Processing Chunks:  64%|███████████████▎        | 14/22 [00:14<00:09,  1.20s/it]

7 nodes
7 embed len
8 nodes
8 embed len
11 nodes
11 embed len
8 nodes
8 embed len
15 nodes
15 embed len


Processing Chunks:  68%|████████████████▎       | 15/22 [00:16<00:08,  1.18s/it]

7 nodes
8 embed len
6 nodes
6 embed len
10 nodes
11 embed len
5 nodes
6 embed len
3 nodes
3 embed len


Processing Chunks:  73%|█████████████████▍      | 16/22 [00:16<00:06,  1.07s/it]

6 nodes
6 embed len
7 nodes
8 embed len
5 nodes
5 embed len
8 nodes
8 embed len
5 nodes
5 embed len


Processing Chunks:  77%|██████████████████▌     | 17/22 [00:17<00:05,  1.01s/it]

10 nodes
10 embed len
5 nodes
5 embed len
6 nodes
6 embed len
9 nodes
9 embed len
8 nodes
9 embed len


Processing Chunks:  82%|███████████████████▋    | 18/22 [00:18<00:03,  1.06it/s]

8 nodes
8 embed len
8 nodes
8 embed len
9 nodes
9 embed len
10 nodes
10 embed len
10 nodes
10 embed len


Processing Chunks:  86%|████████████████████▋   | 19/22 [00:19<00:03,  1.02s/it]

9 nodes
9 embed len
8 nodes
8 embed len
9 nodes
9 embed len
7 nodes
7 embed len
9 nodes
9 embed len


Processing Chunks:  91%|█████████████████████▊  | 20/22 [00:20<00:02,  1.10s/it]

10 nodes
10 embed len
8 nodes
8 embed len
9 nodes
9 embed len
8 nodes
10 embed len
10 nodes
10 embed len


Processing Chunks:  95%|██████████████████████▉ | 21/22 [00:22<00:01,  1.14s/it]

5 nodes
7 embed len
12 nodes
12 embed len
11 nodes
13 embed len


Processing Chunks: 100%|████████████████████████| 22/22 [00:22<00:00,  1.04s/it]


completed! SER1 span: 1
66
66
performing TDA on  MASM  span:  1


Processing Chunks:   0%|                                 | 0/14 [00:00<?, ?it/s]

37 nodes
56 embed len
49 nodes
56 embed len
63 nodes
72 embed len
52 nodes
54 embed len
36 nodes
38 embed len


Processing Chunks:   7%|█▊                       | 1/14 [00:14<03:05, 14.26s/it]

60 nodes
67 embed len
50 nodes
57 embed len
38 nodes
38 embed len
54 nodes
58 embed len
41 nodes
49 embed len


Processing Chunks:  14%|███▌                     | 2/14 [00:27<02:45, 13.82s/it]

34 nodes
44 embed len
35 nodes
41 embed len
78 nodes
83 embed len
80 nodes
91 embed len
80 nodes
90 embed len


Processing Chunks:  21%|█████▎                   | 3/14 [01:25<06:13, 34.00s/it]

36 nodes
42 embed len
50 nodes
56 embed len
57 nodes
62 embed len
64 nodes
76 embed len
50 nodes
54 embed len


Processing Chunks:  29%|███████▏                 | 4/14 [01:47<04:50, 29.06s/it]

63 nodes
70 embed len
55 nodes
72 embed len
56 nodes
72 embed len
78 nodes
85 embed len
66 nodes
74 embed len


Processing Chunks:  36%|████████▉                | 5/14 [02:22<04:42, 31.39s/it]

46 nodes
52 embed len
29 nodes
29 embed len
18 nodes
21 embed len
34 nodes
40 embed len
34 nodes
38 embed len


Processing Chunks:  43%|██████████▋              | 6/14 [02:26<02:55, 21.93s/it]

21 nodes
23 embed len
47 nodes
53 embed len
51 nodes
59 embed len
25 nodes
26 embed len
50 nodes
52 embed len


Processing Chunks:  50%|████████████▌            | 7/14 [02:34<02:00, 17.27s/it]

26 nodes
30 embed len
64 nodes
73 embed len
50 nodes
55 embed len
131 nodes
155 embed len
129 nodes
145 embed len


Processing Chunks:  57%|██████████████▎          | 8/14 [06:58<09:35, 95.84s/it]

57 nodes
60 embed len
48 nodes
50 embed len
59 nodes
66 embed len
46 nodes
51 embed len
43 nodes
47 embed len


Processing Chunks:  64%|████████████████         | 9/14 [07:18<06:01, 72.34s/it]

54 nodes
57 embed len
45 nodes
47 embed len
35 nodes
35 embed len
40 nodes
45 embed len
85 nodes
98 embed len


Processing Chunks:  71%|█████████████████▏      | 10/14 [07:47<03:55, 58.90s/it]

65 nodes
77 embed len
48 nodes
57 embed len
32 nodes
32 embed len
46 nodes
49 embed len
45 nodes
54 embed len


Processing Chunks:  79%|██████████████████▊     | 11/14 [08:04<02:18, 46.15s/it]

45 nodes
50 embed len
34 nodes
36 embed len
67 nodes
78 embed len
43 nodes
47 embed len
73 nodes
81 embed len


Processing Chunks:  86%|████████████████████▌   | 12/14 [08:36<01:23, 41.80s/it]

46 nodes
53 embed len
42 nodes
51 embed len
43 nodes
45 embed len
67 nodes
72 embed len
46 nodes
50 embed len


Processing Chunks:  93%|██████████████████████▎ | 13/14 [08:52<00:33, 33.87s/it]

53 nodes
54 embed len


Processing Chunks: 100%|████████████████████████| 14/14 [08:55<00:00, 38.26s/it]


completed! MASM span: 1
73
73
performing TDA on  cleaned_DEI  span:  1


Processing Chunks:   0%|                                 | 0/15 [00:00<?, ?it/s]

14 nodes
16 embed len
13 nodes
13 embed len
7 nodes
7 embed len
20 nodes
20 embed len
9 nodes
9 embed len


Processing Chunks:   7%|█▋                       | 1/15 [00:01<00:25,  1.81s/it]

14 nodes
14 embed len
18 nodes
18 embed len
16 nodes
17 embed len
10 nodes
10 embed len
21 nodes
21 embed len


Processing Chunks:  13%|███▎                     | 2/15 [00:03<00:22,  1.77s/it]

23 nodes
23 embed len
19 nodes
19 embed len
14 nodes
14 embed len
9 nodes
9 embed len
9 nodes
9 embed len


Processing Chunks:  20%|█████                    | 3/15 [00:05<00:19,  1.66s/it]

13 nodes
13 embed len
10 nodes
10 embed len
10 nodes
11 embed len
13 nodes
13 embed len
9 nodes
9 embed len


Processing Chunks:  27%|██████▋                  | 4/15 [00:05<00:14,  1.36s/it]

15 nodes
15 embed len
17 nodes
23 embed len
11 nodes
11 embed len
12 nodes
12 embed len
11 nodes
11 embed len


Processing Chunks:  33%|████████▎                | 5/15 [00:07<00:12,  1.28s/it]

5 nodes
7 embed len
14 nodes
14 embed len
7 nodes
7 embed len
13 nodes
13 embed len
14 nodes
14 embed len


Processing Chunks:  40%|██████████               | 6/15 [00:08<00:10,  1.22s/it]

11 nodes
11 embed len
8 nodes
8 embed len
12 nodes
13 embed len
11 nodes
11 embed len
11 nodes
11 embed len


Processing Chunks:  47%|███████████▋             | 7/15 [00:09<00:10,  1.27s/it]

13 nodes
14 embed len
15 nodes
15 embed len
9 nodes
9 embed len
23 nodes
23 embed len
16 nodes
16 embed len


Processing Chunks:  53%|█████████████▎           | 8/15 [00:11<00:09,  1.35s/it]

8 nodes
8 embed len
11 nodes
11 embed len
9 nodes
12 embed len
14 nodes
14 embed len
7 nodes
7 embed len


Processing Chunks:  60%|███████████████          | 9/15 [00:12<00:07,  1.21s/it]

18 nodes
18 embed len
18 nodes
18 embed len
15 nodes
15 embed len
25 nodes
25 embed len
20 nodes
20 embed len


Processing Chunks:  67%|████████████████        | 10/15 [00:13<00:06,  1.31s/it]

6 nodes
6 embed len
20 nodes
20 embed len
14 nodes
14 embed len
11 nodes
11 embed len
17 nodes
17 embed len


Processing Chunks:  73%|█████████████████▌      | 11/15 [00:14<00:05,  1.27s/it]

13 nodes
13 embed len
13 nodes
14 embed len
10 nodes
10 embed len
6 nodes
6 embed len
16 nodes
16 embed len


Processing Chunks:  80%|███████████████████▏    | 12/15 [00:16<00:03,  1.27s/it]

19 nodes
19 embed len
13 nodes
13 embed len
8 nodes
8 embed len
20 nodes
20 embed len
12 nodes
12 embed len


Processing Chunks:  87%|████████████████████▊   | 13/15 [00:17<00:02,  1.29s/it]

10 nodes
10 embed len
13 nodes
13 embed len
15 nodes
15 embed len
22 nodes
22 embed len
15 nodes
15 embed len


Processing Chunks:  93%|██████████████████████▍ | 14/15 [00:18<00:01,  1.25s/it]

13 nodes
13 embed len
13 nodes
13 embed len
10 nodes
10 embed len


Processing Chunks: 100%|████████████████████████| 15/15 [00:19<00:00,  1.30s/it]


completed! cleaned_DEI span: 1
130
130
performing TDA on  PEM_df  span:  2


Processing Chunks:   0%|                                 | 0/26 [00:00<?, ?it/s]

81 nodes
81 embed len
76 nodes
76 embed len
70 nodes
70 embed len
67 nodes
67 embed len
51 nodes
51 embed len


Processing Chunks:   4%|▉                        | 1/26 [01:01<25:46, 61.85s/it]

57 nodes
57 embed len
90 nodes
90 embed len
70 nodes
70 embed len
65 nodes
65 embed len
54 nodes
54 embed len


Processing Chunks:   8%|█▉                       | 2/26 [02:15<27:35, 69.00s/it]

67 nodes
67 embed len
50 nodes
50 embed len
98 nodes
98 embed len
75 nodes
75 embed len
76 nodes
76 embed len


Processing Chunks:  12%|██▊                     | 3/26 [04:34<38:33, 100.59s/it]

77 nodes
77 embed len
42 nodes
42 embed len
89 nodes
89 embed len
88 nodes
88 embed len
55 nodes
55 embed len


Processing Chunks:  15%|███▋                    | 4/26 [06:29<39:01, 106.41s/it]

103 nodes
103 embed len
92 nodes
92 embed len
110 nodes
110 embed len
85 nodes
85 embed len
67 nodes
67 embed len


Processing Chunks:  19%|████▌                   | 5/26 [09:38<47:40, 136.23s/it]

64 nodes
64 embed len
53 nodes
53 embed len
63 nodes
63 embed len
79 nodes
79 embed len
70 nodes
70 embed len


Processing Chunks:  23%|█████▌                  | 6/26 [10:38<36:49, 110.45s/it]

68 nodes
68 embed len
122 nodes
122 embed len
63 nodes
63 embed len
46 nodes
46 embed len
87 nodes
87 embed len


Processing Chunks:  27%|██████▍                 | 7/26 [12:53<37:25, 118.20s/it]

82 nodes
82 embed len
68 nodes
68 embed len
97 nodes
97 embed len
16 nodes
16 embed len
58 nodes
58 embed len


Processing Chunks:  31%|███████▍                | 8/26 [14:28<33:18, 111.00s/it]

89 nodes
89 embed len
100 nodes
100 embed len
76 nodes
76 embed len
78 nodes
78 embed len
32 nodes
32 embed len


Processing Chunks:  35%|████████▎               | 9/26 [16:07<30:20, 107.07s/it]

97 nodes
97 embed len
69 nodes
69 embed len
127 nodes
127 embed len
111 nodes
111 embed len
56 nodes
56 embed len


Processing Chunks:  38%|████████▊              | 10/26 [20:56<43:34, 163.40s/it]

139 nodes
139 embed len
84 nodes
84 embed len
83 nodes
83 embed len
127 nodes
127 embed len
62 nodes
62 embed len


Processing Chunks:  42%|████████▉            | 11/26 [31:39<1:17:30, 310.03s/it]

55 nodes
55 embed len
51 nodes
51 embed len
98 nodes
98 embed len
61 nodes
61 embed len
101 nodes
101 embed len


Processing Chunks:  46%|██████████▌            | 12/26 [33:40<58:58, 252.78s/it]

55 nodes
55 embed len
53 nodes
53 embed len
62 nodes
62 embed len
74 nodes
74 embed len
55 nodes
55 embed len


Processing Chunks:  50%|███████████▌           | 13/26 [34:32<41:35, 191.95s/it]

58 nodes
58 embed len
43 nodes
43 embed len
82 nodes
82 embed len
48 nodes
48 embed len
92 nodes
92 embed len


Processing Chunks:  54%|████████████▍          | 14/26 [35:51<31:33, 157.83s/it]

66 nodes
66 embed len
67 nodes
67 embed len
98 nodes
98 embed len
64 nodes
64 embed len
59 nodes
59 embed len


Processing Chunks:  58%|█████████████▎         | 15/26 [37:26<25:27, 138.85s/it]

124 nodes
125 embed len
75 nodes
75 embed len
89 nodes
89 embed len
50 nodes
50 embed len
93 nodes
93 embed len


Processing Chunks:  62%|██████████████▏        | 16/26 [41:13<27:31, 165.20s/it]

71 nodes
71 embed len
99 nodes
99 embed len
107 nodes
107 embed len
82 nodes
82 embed len
87 nodes
87 embed len


Processing Chunks:  65%|███████████████        | 17/26 [43:33<23:40, 157.86s/it]

72 nodes
72 embed len
91 nodes
91 embed len
70 nodes
70 embed len
87 nodes
87 embed len
86 nodes
86 embed len


Processing Chunks:  69%|███████████████▉       | 18/26 [45:16<18:49, 141.23s/it]

54 nodes
54 embed len
65 nodes
65 embed len
61 nodes
61 embed len
83 nodes
83 embed len
87 nodes
87 embed len


Processing Chunks:  73%|████████████████▊      | 19/26 [46:00<13:03, 111.90s/it]

42 nodes
42 embed len
68 nodes
68 embed len
46 nodes
46 embed len
78 nodes
78 embed len
86 nodes
86 embed len


Processing Chunks:  77%|██████████████████▍     | 20/26 [46:46<09:13, 92.19s/it]

66 nodes
66 embed len
83 nodes
83 embed len
62 nodes
62 embed len
78 nodes
78 embed len
61 nodes
61 embed len


Processing Chunks:  81%|███████████████████▍    | 21/26 [47:24<06:20, 76.00s/it]

88 nodes
88 embed len
66 nodes
66 embed len
106 nodes
106 embed len
92 nodes
92 embed len
70 nodes
70 embed len


Processing Chunks:  85%|████████████████████▎   | 22/26 [49:04<05:32, 83.23s/it]

65 nodes
65 embed len
86 nodes
86 embed len
63 nodes
63 embed len
64 nodes
64 embed len
94 nodes
94 embed len


Processing Chunks:  88%|█████████████████████▏  | 23/26 [50:04<03:48, 76.18s/it]

37 nodes
37 embed len
42 nodes
44 embed len
92 nodes
92 embed len
89 nodes
89 embed len
118 nodes
118 embed len


Processing Chunks:  92%|██████████████████████▏ | 24/26 [51:35<02:41, 80.62s/it]

54 nodes
54 embed len
80 nodes
81 embed len
44 nodes
44 embed len
86 nodes
86 embed len
89 nodes
89 embed len


Processing Chunks:  96%|███████████████████████ | 25/26 [52:38<01:15, 75.37s/it]

79 nodes
79 embed len
86 nodes
86 embed len
110 nodes
110 embed len
71 nodes
71 embed len
58 nodes
58 embed len


Processing Chunks: 100%|███████████████████████| 26/26 [54:49<00:00, 126.51s/it]


completed! PEM_df span: 2
101
101
performing TDA on  SER_monologs  span:  2


Processing Chunks:   0%|                                 | 0/20 [00:00<?, ?it/s]

17 nodes
17 embed len
33 nodes
33 embed len
24 nodes
24 embed len
20 nodes
20 embed len
36 nodes
36 embed len


Processing Chunks:   5%|█▎                       | 1/20 [00:01<00:20,  1.10s/it]

35 nodes
35 embed len
31 nodes
31 embed len
37 nodes
37 embed len
32 nodes
32 embed len
53 nodes
53 embed len


Processing Chunks:  10%|██▌                      | 2/20 [00:03<00:36,  2.04s/it]

53 nodes
53 embed len
25 nodes
26 embed len
34 nodes
34 embed len
30 nodes
30 embed len
32 nodes
32 embed len


Processing Chunks:  15%|███▊                     | 3/20 [00:07<00:49,  2.94s/it]

48 nodes
48 embed len
36 nodes
36 embed len
26 nodes
26 embed len
35 nodes
35 embed len
49 nodes
49 embed len


Processing Chunks:  20%|█████                    | 4/20 [00:12<00:56,  3.51s/it]

58 nodes
58 embed len
56 nodes
56 embed len
17 nodes
17 embed len
7 nodes
7 embed len
56 nodes
56 embed len


Processing Chunks:  25%|██████▎                  | 5/20 [00:18<01:07,  4.50s/it]

69 nodes
69 embed len
67 nodes
67 embed len
46 nodes
46 embed len
27 nodes
27 embed len
43 nodes
43 embed len


Processing Chunks:  30%|███████▌                 | 6/20 [00:34<01:59,  8.51s/it]

37 nodes
37 embed len
38 nodes
38 embed len
41 nodes
41 embed len
20 nodes
20 embed len
42 nodes
42 embed len


Processing Chunks:  35%|████████▊                | 7/20 [00:36<01:23,  6.39s/it]

59 nodes
59 embed len
49 nodes
49 embed len
53 nodes
53 embed len
43 nodes
43 embed len
49 nodes
49 embed len


Processing Chunks:  40%|██████████               | 8/20 [00:44<01:22,  6.91s/it]

46 nodes
46 embed len
44 nodes
44 embed len
108 nodes
108 embed len
49 nodes
49 embed len
56 nodes
56 embed len


Processing Chunks:  45%|███████████▎             | 9/20 [01:14<02:33, 13.98s/it]

65 nodes
65 embed len
31 nodes
31 embed len
26 nodes
26 embed len
29 nodes
29 embed len
23 nodes
23 embed len


Processing Chunks:  50%|████████████            | 10/20 [01:19<01:52, 11.30s/it]

40 nodes
40 embed len
35 nodes
35 embed len
44 nodes
44 embed len
27 nodes
27 embed len
52 nodes
52 embed len


Processing Chunks:  55%|█████████████▏          | 11/20 [01:24<01:22,  9.21s/it]

41 nodes
41 embed len
69 nodes
69 embed len
67 nodes
69 embed len
100 nodes
100 embed len
12 nodes
14 embed len


Processing Chunks:  60%|██████████████▍         | 12/20 [01:53<02:03, 15.47s/it]

32 nodes
32 embed len
39 nodes
39 embed len
37 nodes
37 embed len
40 nodes
40 embed len
10 nodes
10 embed len


Processing Chunks:  65%|███████████████▌        | 13/20 [01:58<01:25, 12.26s/it]

14 nodes
14 embed len
10 nodes
10 embed len
23 nodes
23 embed len
17 nodes
17 embed len
20 nodes
20 embed len


Processing Chunks:  70%|████████████████▊       | 14/20 [01:59<00:52,  8.74s/it]

25 nodes
25 embed len
27 nodes
27 embed len
19 nodes
19 embed len
39 nodes
39 embed len
30 nodes
30 embed len


Processing Chunks:  75%|██████████████████      | 15/20 [02:00<00:32,  6.56s/it]

27 nodes
27 embed len
36 nodes
36 embed len
34 nodes
34 embed len
38 nodes
38 embed len
39 nodes
39 embed len


Processing Chunks:  80%|███████████████████▏    | 16/20 [02:03<00:21,  5.34s/it]

36 nodes
36 embed len
39 nodes
39 embed len
29 nodes
29 embed len
26 nodes
26 embed len
22 nodes
22 embed len


Processing Chunks:  85%|████████████████████▍   | 17/20 [02:05<00:12,  4.25s/it]

34 nodes
34 embed len
11 nodes
11 embed len
15 nodes
15 embed len
39 nodes
39 embed len
23 nodes
23 embed len


Processing Chunks:  90%|█████████████████████▌  | 18/20 [02:06<00:06,  3.29s/it]

62 nodes
62 embed len
18 nodes
18 embed len
24 nodes
24 embed len
16 nodes
16 embed len
25 nodes
25 embed len


Processing Chunks:  95%|██████████████████████▊ | 19/20 [02:08<00:03,  3.07s/it]

23 nodes
28 embed len
37 nodes
37 embed len
17 nodes
17 embed len
41 nodes
41 embed len
39 nodes
39 embed len


Processing Chunks: 100%|████████████████████████| 20/20 [02:10<00:00,  6.52s/it]


completed! SER_monologs span: 2
107
107
performing TDA on  SER_IPSP  span:  2


Processing Chunks:   0%|                                 | 0/22 [00:00<?, ?it/s]

36 nodes
36 embed len
58 nodes
58 embed len
45 nodes
45 embed len
52 nodes
52 embed len
36 nodes
36 embed len


Processing Chunks:   5%|█▏                       | 1/22 [00:05<02:03,  5.87s/it]

39 nodes
39 embed len
52 nodes
52 embed len
52 nodes
53 embed len
61 nodes
61 embed len
58 nodes
58 embed len


Processing Chunks:   9%|██▎                      | 2/22 [00:18<03:21, 10.07s/it]

45 nodes
45 embed len
81 nodes
81 embed len
39 nodes
39 embed len
54 nodes
54 embed len
46 nodes
46 embed len


Processing Chunks:  14%|███▍                     | 3/22 [00:55<07:02, 22.23s/it]

52 nodes
52 embed len
51 nodes
51 embed len
46 nodes
46 embed len
47 nodes
47 embed len
82 nodes
82 embed len


Processing Chunks:  18%|████▌                    | 4/22 [01:30<08:11, 27.32s/it]

60 nodes
60 embed len
44 nodes
44 embed len
38 nodes
38 embed len
35 nodes
35 embed len
42 nodes
43 embed len


Processing Chunks:  23%|█████▋                   | 5/22 [01:39<05:48, 20.50s/it]

66 nodes
66 embed len
65 nodes
65 embed len
70 nodes
70 embed len
52 nodes
52 embed len
65 nodes
65 embed len


Processing Chunks:  27%|██████▊                  | 6/22 [01:54<05:00, 18.78s/it]

46 nodes
46 embed len
35 nodes
35 embed len
37 nodes
37 embed len
20 nodes
20 embed len
33 nodes
33 embed len


Processing Chunks:  32%|███████▉                 | 7/22 [01:58<03:29, 13.95s/it]

59 nodes
59 embed len
33 nodes
33 embed len
25 nodes
25 embed len
51 nodes
51 embed len
52 nodes
52 embed len


Processing Chunks:  36%|█████████                | 8/22 [02:06<02:47, 11.95s/it]

42 nodes
42 embed len
58 nodes
58 embed len
46 nodes
46 embed len
74 nodes
74 embed len
57 nodes
57 embed len


Processing Chunks:  41%|██████████▏              | 9/22 [02:26<03:10, 14.62s/it]

75 nodes
75 embed len
78 nodes
78 embed len
82 nodes
82 embed len
60 nodes
60 embed len
56 nodes
56 embed len


Processing Chunks:  45%|██████████▉             | 10/22 [03:13<04:54, 24.52s/it]

43 nodes
43 embed len
34 nodes
35 embed len
49 nodes
50 embed len
38 nodes
38 embed len
38 nodes
38 embed len


Processing Chunks:  50%|████████████            | 11/22 [03:16<03:19, 18.10s/it]

19 nodes
19 embed len
45 nodes
45 embed len
38 nodes
38 embed len
45 nodes
45 embed len
37 nodes
38 embed len


Processing Chunks:  55%|█████████████           | 12/22 [03:21<02:18, 13.87s/it]

57 nodes
57 embed len
39 nodes
39 embed len
62 nodes
62 embed len
57 nodes
57 embed len
79 nodes
79 embed len


Processing Chunks:  59%|██████████████▏         | 13/22 [04:13<03:48, 25.40s/it]

99 nodes
99 embed len
15 nodes
15 embed len
54 nodes
55 embed len
57 nodes
57 embed len
34 nodes
34 embed len


Processing Chunks:  64%|███████████████▎        | 14/22 [04:45<03:39, 27.41s/it]

49 nodes
49 embed len
19 nodes
19 embed len
24 nodes
24 embed len
27 nodes
27 embed len
18 nodes
18 embed len


Processing Chunks:  68%|████████████████▎       | 15/22 [04:47<02:18, 19.78s/it]

77 nodes
77 embed len
98 nodes
98 embed len
83 nodes
83 embed len
25 nodes
25 embed len
24 nodes
24 embed len


Processing Chunks:  73%|█████████████████▍      | 16/22 [05:27<02:35, 25.93s/it]

22 nodes
22 embed len
59 nodes
59 embed len
58 nodes
59 embed len
49 nodes
49 embed len
73 nodes
73 embed len


Processing Chunks:  77%|██████████████████▌     | 17/22 [05:38<01:47, 21.51s/it]

77 nodes
77 embed len
84 nodes
84 embed len
49 nodes
49 embed len
62 nodes
62 embed len
49 nodes
49 embed len


Processing Chunks:  82%|███████████████████▋    | 18/22 [06:10<01:38, 24.61s/it]

24 nodes
24 embed len
22 nodes
22 embed len
55 nodes
55 embed len
58 nodes
58 embed len
67 nodes
67 embed len


Processing Chunks:  86%|████████████████████▋   | 19/22 [06:21<01:01, 20.38s/it]

34 nodes
34 embed len
27 nodes
27 embed len
29 nodes
29 embed len
53 nodes
53 embed len
73 nodes
73 embed len


Processing Chunks:  91%|█████████████████████▊  | 20/22 [06:39<00:39, 19.77s/it]

68 nodes
68 embed len
29 nodes
29 embed len
43 nodes
43 embed len
52 nodes
52 embed len
45 nodes
45 embed len


Processing Chunks:  95%|██████████████████████▉ | 21/22 [06:51<00:17, 17.47s/it]

47 nodes
48 embed len
63 nodes
63 embed len


Processing Chunks: 100%|████████████████████████| 22/22 [06:54<00:00, 18.86s/it]


completed! SER_IPSP span: 2
108
108
performing TDA on  SER1  span:  2


Processing Chunks:   0%|                                 | 0/22 [00:00<?, ?it/s]

7 nodes
7 embed len
3 nodes
3 embed len
5 nodes
5 embed len
7 nodes
7 embed len
9 nodes
9 embed len


Processing Chunks:   9%|██▎                      | 2/22 [00:00<00:08,  2.43it/s]

14 nodes
14 embed len
11 nodes
11 embed len
10 nodes
10 embed len
11 nodes
11 embed len
6 nodes
6 embed len
6 nodes
6 embed len
8 nodes
8 embed len
8 nodes
8 embed len
9 nodes
9 embed len
9 nodes
9 embed len


Processing Chunks:  14%|███▍                     | 3/22 [00:01<00:07,  2.54it/s]

10 nodes
11 embed len
7 nodes
8 embed len
10 nodes
10 embed len
6 nodes
6 embed len
4 nodes
4 embed len


Processing Chunks:  18%|████▌                    | 4/22 [00:01<00:08,  2.24it/s]

6 nodes
6 embed len
5 nodes
5 embed len
7 nodes
7 embed len
6 nodes
6 embed len
7 nodes
7 embed len


Processing Chunks:  23%|█████▋                   | 5/22 [00:02<00:07,  2.19it/s]

6 nodes
6 embed len
9 nodes
9 embed len
14 nodes
14 embed len
4 nodes
4 embed len
8 nodes
8 embed len


Processing Chunks:  27%|██████▊                  | 6/22 [00:02<00:07,  2.23it/s]

8 nodes
8 embed len
8 nodes
8 embed len
10 nodes
10 embed len
7 nodes
7 embed len
6 nodes
6 embed len


Processing Chunks:  32%|███████▉                 | 7/22 [00:03<00:06,  2.21it/s]

9 nodes
9 embed len
7 nodes
7 embed len
7 nodes
7 embed len
12 nodes
12 embed len
7 nodes
7 embed len


Processing Chunks:  36%|█████████                | 8/22 [00:03<00:06,  2.24it/s]

9 nodes
9 embed len
8 nodes
8 embed len
5 nodes
5 embed len
14 nodes
14 embed len
19 nodes
19 embed len


Processing Chunks:  41%|██████████▏              | 9/22 [00:03<00:05,  2.27it/s]

11 nodes
11 embed len
22 nodes
22 embed len
20 nodes
20 embed len
10 nodes
10 embed len
11 nodes
11 embed len


Processing Chunks:  45%|██████████▉             | 10/22 [00:04<00:06,  1.97it/s]

7 nodes
8 embed len
10 nodes
10 embed len
16 nodes
16 embed len
7 nodes
7 embed len
5 nodes
5 embed len


Processing Chunks:  50%|████████████            | 11/22 [00:05<00:05,  1.89it/s]

11 nodes
11 embed len
6 nodes
6 embed len
9 nodes
9 embed len
5 nodes
5 embed len
5 nodes
5 embed len


Processing Chunks:  55%|█████████████           | 12/22 [00:05<00:05,  1.93it/s]

5 nodes
7 embed len
8 nodes
8 embed len
18 nodes
18 embed len
10 nodes
10 embed len
8 nodes
8 embed len


Processing Chunks:  59%|██████████████▏         | 13/22 [00:06<00:05,  1.76it/s]

12 nodes
12 embed len
3 nodes
3 embed len
9 nodes
9 embed len
8 nodes
8 embed len
3 nodes
3 embed len


Processing Chunks:  64%|███████████████▎        | 14/22 [00:06<00:04,  1.87it/s]

6 nodes
6 embed len
7 nodes
7 embed len
10 nodes
10 embed len
7 nodes
7 embed len
14 nodes
14 embed len


Processing Chunks:  68%|████████████████▎       | 15/22 [00:07<00:03,  2.02it/s]

7 nodes
7 embed len
5 nodes
5 embed len
10 nodes
10 embed len
5 nodes
5 embed len
5 nodes
5 embed len


Processing Chunks:  73%|█████████████████▍      | 16/22 [00:07<00:02,  2.18it/s]

7 nodes
7 embed len
4 nodes
4 embed len
7 nodes
7 embed len
4 nodes
4 embed len
9 nodes
9 embed len


Processing Chunks:  77%|██████████████████▌     | 17/22 [00:07<00:02,  2.35it/s]

4 nodes
4 embed len
5 nodes
5 embed len
8 nodes
8 embed len
8 nodes
8 embed len
7 nodes
7 embed len


Processing Chunks:  82%|███████████████████▋    | 18/22 [00:08<00:01,  2.36it/s]

7 nodes
7 embed len
8 nodes
8 embed len
9 nodes
9 embed len
8 nodes
9 embed len
8 nodes
8 embed len


Processing Chunks:  86%|████████████████████▋   | 19/22 [00:08<00:01,  2.24it/s]

7 nodes
7 embed len
8 nodes
8 embed len
6 nodes
6 embed len
8 nodes
8 embed len
9 nodes
9 embed len


Processing Chunks:  91%|█████████████████████▊  | 20/22 [00:09<00:00,  2.23it/s]

7 nodes
7 embed len
8 nodes
8 embed len
9 nodes
9 embed len
9 nodes
9 embed len
6 nodes
6 embed len


Processing Chunks:  95%|██████████████████████▉ | 21/22 [00:09<00:00,  2.29it/s]

11 nodes
11 embed len
12 nodes
12 embed len


Processing Chunks: 100%|████████████████████████| 22/22 [00:10<00:00,  2.17it/s]


completed! SER1 span: 2
66
66
performing TDA on  MASM  span:  2


Processing Chunks:   0%|                                 | 0/14 [00:00<?, ?it/s]

50 nodes
55 embed len
55 nodes
55 embed len
71 nodes
71 embed len
53 nodes
53 embed len
37 nodes
37 embed len


Processing Chunks:   7%|█▊                       | 1/14 [00:07<01:40,  7.71s/it]

66 nodes
66 embed len
56 nodes
56 embed len
37 nodes
37 embed len
57 nodes
57 embed len
48 nodes
48 embed len


Processing Chunks:  14%|███▌                     | 2/14 [00:17<01:47,  8.92s/it]

43 nodes
43 embed len
40 nodes
40 embed len
82 nodes
82 embed len
90 nodes
90 embed len
89 nodes
89 embed len


Processing Chunks:  21%|█████▎                   | 3/14 [00:47<03:24, 18.55s/it]

41 nodes
41 embed len
55 nodes
55 embed len
61 nodes
61 embed len
75 nodes
75 embed len
53 nodes
53 embed len


Processing Chunks:  29%|███████▏                 | 4/14 [01:01<02:49, 16.94s/it]

69 nodes
69 embed len
71 nodes
71 embed len
71 nodes
71 embed len
84 nodes
84 embed len
73 nodes
73 embed len


Processing Chunks:  36%|████████▉                | 5/14 [01:23<02:47, 18.62s/it]

51 nodes
51 embed len
28 nodes
28 embed len
20 nodes
20 embed len
39 nodes
39 embed len
37 nodes
37 embed len


Processing Chunks:  43%|██████████▋              | 6/14 [01:25<01:44, 13.07s/it]

21 nodes
22 embed len
52 nodes
52 embed len
58 nodes
58 embed len
25 nodes
25 embed len
51 nodes
51 embed len


Processing Chunks:  50%|████████████▌            | 7/14 [01:31<01:13, 10.51s/it]

29 nodes
29 embed len
72 nodes
72 embed len
54 nodes
54 embed len
154 nodes
154 embed len
144 nodes
144 embed len


Processing Chunks:  57%|██████████████▎          | 8/14 [04:34<06:32, 65.43s/it]

59 nodes
59 embed len
49 nodes
49 embed len
65 nodes
65 embed len
50 nodes
50 embed len
46 nodes
46 embed len


Processing Chunks:  64%|████████████████         | 9/14 [04:41<03:56, 47.34s/it]

56 nodes
56 embed len
46 nodes
46 embed len
34 nodes
34 embed len
44 nodes
44 embed len
97 nodes
97 embed len


Processing Chunks:  71%|█████████████████▏      | 10/14 [04:56<02:29, 37.43s/it]

76 nodes
76 embed len
56 nodes
56 embed len
31 nodes
31 embed len
48 nodes
48 embed len
53 nodes
53 embed len


Processing Chunks:  79%|██████████████████▊     | 11/14 [05:04<01:24, 28.24s/it]

48 nodes
49 embed len
35 nodes
35 embed len
77 nodes
77 embed len
46 nodes
46 embed len
80 nodes
80 embed len


Processing Chunks:  86%|████████████████████▌   | 12/14 [05:19<00:48, 24.28s/it]

52 nodes
52 embed len
50 nodes
50 embed len
44 nodes
44 embed len
71 nodes
71 embed len
49 nodes
49 embed len


Processing Chunks:  93%|██████████████████████▎ | 13/14 [05:28<00:19, 19.71s/it]

53 nodes
53 embed len


Processing Chunks: 100%|████████████████████████| 14/14 [05:31<00:00, 23.70s/it]


completed! MASM span: 2
73
73
performing TDA on  cleaned_DEI  span:  2


Processing Chunks:   0%|                                 | 0/15 [00:00<?, ?it/s]

15 nodes
15 embed len
12 nodes
12 embed len
6 nodes
6 embed len
19 nodes
19 embed len
8 nodes
8 embed len


Processing Chunks:   7%|█▋                       | 1/15 [00:00<00:07,  1.96it/s]

13 nodes
13 embed len
17 nodes
17 embed len
16 nodes
16 embed len
9 nodes
9 embed len
20 nodes
20 embed len


Processing Chunks:  13%|███▎                     | 2/15 [00:01<00:06,  2.01it/s]

22 nodes
22 embed len
18 nodes
18 embed len
13 nodes
13 embed len
8 nodes
8 embed len
8 nodes
8 embed len


Processing Chunks:  20%|█████                    | 3/15 [00:01<00:05,  2.04it/s]

12 nodes
12 embed len
9 nodes
9 embed len
10 nodes
10 embed len
12 nodes
12 embed len
8 nodes
8 embed len


Processing Chunks:  27%|██████▋                  | 4/15 [00:01<00:05,  2.11it/s]

14 nodes
14 embed len
22 nodes
22 embed len
10 nodes
10 embed len
9 nodes
11 embed len
10 nodes
10 embed len


Processing Chunks:  33%|████████▎                | 5/15 [00:02<00:05,  1.97it/s]

4 nodes
6 embed len
13 nodes
13 embed len
6 nodes
6 embed len
12 nodes
12 embed len
13 nodes
13 embed len


Processing Chunks:  40%|██████████               | 6/15 [00:02<00:04,  2.02it/s]

10 nodes
10 embed len
7 nodes
7 embed len
12 nodes
12 embed len
10 nodes
10 embed len
10 nodes
10 embed len


Processing Chunks:  47%|███████████▋             | 7/15 [00:03<00:03,  2.07it/s]

13 nodes
13 embed len
14 nodes
14 embed len
8 nodes
8 embed len
22 nodes
22 embed len
15 nodes
15 embed len


Processing Chunks:  53%|█████████████▎           | 8/15 [00:04<00:03,  1.89it/s]

7 nodes
7 embed len
10 nodes
10 embed len
11 nodes
11 embed len
13 nodes
13 embed len
6 nodes
6 embed len


Processing Chunks:  60%|███████████████          | 9/15 [00:04<00:02,  2.01it/s]

17 nodes
17 embed len
17 nodes
17 embed len
14 nodes
14 embed len
24 nodes
24 embed len
19 nodes
19 embed len


Processing Chunks:  67%|████████████████        | 10/15 [00:05<00:02,  1.94it/s]

5 nodes
5 embed len
18 nodes
19 embed len
13 nodes
13 embed len
10 nodes
10 embed len
16 nodes
16 embed len


Processing Chunks:  73%|█████████████████▌      | 11/15 [00:05<00:02,  1.98it/s]

12 nodes
12 embed len
13 nodes
13 embed len
9 nodes
9 embed len
5 nodes
5 embed len
15 nodes
15 embed len


Processing Chunks:  80%|███████████████████▏    | 12/15 [00:05<00:01,  2.15it/s]

18 nodes
18 embed len
12 nodes
12 embed len
6 nodes
7 embed len
19 nodes
19 embed len
11 nodes
11 embed len


Processing Chunks:  87%|████████████████████▊   | 13/15 [00:06<00:00,  2.11it/s]

9 nodes
9 embed len
12 nodes
12 embed len
14 nodes
14 embed len
21 nodes
21 embed len
14 nodes
14 embed len


Processing Chunks:  93%|██████████████████████▍ | 14/15 [00:06<00:00,  2.17it/s]

12 nodes
12 embed len
12 nodes
12 embed len
9 nodes
9 embed len


Processing Chunks: 100%|████████████████████████| 15/15 [00:07<00:00,  2.08it/s]


completed! cleaned_DEI span: 2
130
130
performing TDA on  PEM_df  span:  3


Processing Chunks:   0%|                                 | 0/26 [00:00<?, ?it/s]

80 nodes
80 embed len
75 nodes
75 embed len
69 nodes
69 embed len
66 nodes
66 embed len
50 nodes
50 embed len


Processing Chunks:   4%|▉                        | 1/26 [00:22<09:33, 22.94s/it]

56 nodes
56 embed len
89 nodes
89 embed len
69 nodes
69 embed len
64 nodes
64 embed len
53 nodes
53 embed len


Processing Chunks:   8%|█▉                       | 2/26 [01:06<14:03, 35.14s/it]

66 nodes
66 embed len
49 nodes
49 embed len
97 nodes
97 embed len
74 nodes
74 embed len
75 nodes
75 embed len


Processing Chunks:  12%|██▉                      | 3/26 [02:18<19:50, 51.75s/it]

76 nodes
76 embed len
41 nodes
41 embed len
88 nodes
88 embed len
87 nodes
87 embed len
54 nodes
54 embed len


Processing Chunks:  15%|███▊                     | 4/26 [03:22<20:51, 56.88s/it]

102 nodes
102 embed len
91 nodes
91 embed len
109 nodes
109 embed len
84 nodes
84 embed len
66 nodes
66 embed len


Processing Chunks:  19%|████▊                    | 5/26 [05:33<29:11, 83.40s/it]

63 nodes
63 embed len
52 nodes
52 embed len
62 nodes
62 embed len
78 nodes
78 embed len
69 nodes
69 embed len


Processing Chunks:  23%|█████▊                   | 6/26 [06:07<22:13, 66.68s/it]

67 nodes
67 embed len
121 nodes
121 embed len
62 nodes
62 embed len
45 nodes
45 embed len
86 nodes
86 embed len


Processing Chunks:  27%|██████▋                  | 7/26 [07:49<24:47, 78.31s/it]

81 nodes
81 embed len
67 nodes
67 embed len
96 nodes
96 embed len
15 nodes
15 embed len
57 nodes
57 embed len


Processing Chunks:  31%|███████▋                 | 8/26 [08:35<20:21, 67.85s/it]

88 nodes
88 embed len
99 nodes
99 embed len
75 nodes
75 embed len
77 nodes
77 embed len
31 nodes
31 embed len


Processing Chunks:  35%|████████▋                | 9/26 [09:27<17:47, 62.82s/it]

96 nodes
96 embed len
68 nodes
68 embed len
126 nodes
126 embed len
110 nodes
110 embed len
55 nodes
55 embed len


Processing Chunks:  38%|████████▊              | 10/26 [13:45<32:53, 123.36s/it]

138 nodes
138 embed len
83 nodes
83 embed len
82 nodes
82 embed len
126 nodes
126 embed len
61 nodes
61 embed len


Processing Chunks:  42%|████████▉            | 11/26 [22:38<1:02:09, 248.60s/it]

54 nodes
54 embed len
50 nodes
50 embed len
97 nodes
97 embed len
60 nodes
60 embed len
100 nodes
100 embed len


Processing Chunks:  46%|██████████▌            | 12/26 [23:43<44:59, 192.85s/it]

54 nodes
54 embed len
52 nodes
52 embed len
61 nodes
61 embed len
73 nodes
73 embed len
54 nodes
54 embed len


Processing Chunks:  50%|███████████▌           | 13/26 [24:22<31:38, 146.07s/it]

57 nodes
57 embed len
42 nodes
42 embed len
81 nodes
81 embed len
47 nodes
47 embed len
91 nodes
91 embed len


Processing Chunks:  54%|████████████▍          | 14/26 [25:24<24:07, 120.61s/it]

65 nodes
65 embed len
66 nodes
66 embed len
97 nodes
97 embed len
63 nodes
63 embed len
58 nodes
58 embed len


Processing Chunks:  58%|█████████████▎         | 15/26 [26:52<20:21, 111.06s/it]

124 nodes
124 embed len
74 nodes
74 embed len
88 nodes
88 embed len
49 nodes
49 embed len
92 nodes
92 embed len


Processing Chunks:  62%|██████████████▏        | 16/26 [29:35<21:04, 126.42s/it]

70 nodes
70 embed len
98 nodes
98 embed len
106 nodes
106 embed len
81 nodes
81 embed len
86 nodes
86 embed len


Processing Chunks:  65%|███████████████        | 17/26 [31:27<18:19, 122.22s/it]

71 nodes
71 embed len
90 nodes
90 embed len
69 nodes
69 embed len
86 nodes
86 embed len
85 nodes
85 embed len


Processing Chunks:  69%|███████████████▉       | 18/26 [33:02<15:12, 114.04s/it]

53 nodes
53 embed len
64 nodes
64 embed len
60 nodes
60 embed len
82 nodes
82 embed len
86 nodes
86 embed len


Processing Chunks:  73%|█████████████████▌      | 19/26 [33:35<10:26, 89.56s/it]

41 nodes
41 embed len
67 nodes
67 embed len
45 nodes
45 embed len
77 nodes
77 embed len
85 nodes
85 embed len


Processing Chunks:  77%|██████████████████▍     | 20/26 [34:35<08:04, 80.73s/it]

65 nodes
65 embed len
82 nodes
82 embed len
61 nodes
61 embed len
77 nodes
77 embed len
60 nodes
60 embed len


Processing Chunks:  81%|███████████████████▍    | 21/26 [35:05<05:27, 65.54s/it]

87 nodes
87 embed len
65 nodes
65 embed len
105 nodes
105 embed len
91 nodes
91 embed len
69 nodes
69 embed len


Processing Chunks:  85%|████████████████████▎   | 22/26 [36:25<04:39, 69.88s/it]

64 nodes
64 embed len
85 nodes
85 embed len
62 nodes
62 embed len
63 nodes
63 embed len
93 nodes
93 embed len


Processing Chunks:  88%|█████████████████████▏  | 23/26 [37:51<03:44, 74.92s/it]

36 nodes
36 embed len
43 nodes
43 embed len
91 nodes
91 embed len
88 nodes
88 embed len
117 nodes
117 embed len


Processing Chunks:  92%|██████████████████████▏ | 24/26 [39:02<02:27, 73.56s/it]

53 nodes
53 embed len
80 nodes
80 embed len
43 nodes
43 embed len
85 nodes
85 embed len
88 nodes
88 embed len


Processing Chunks:  96%|███████████████████████ | 25/26 [39:30<00:59, 59.90s/it]

78 nodes
78 embed len
85 nodes
85 embed len
109 nodes
109 embed len
70 nodes
70 embed len
57 nodes
57 embed len


Processing Chunks: 100%|████████████████████████| 26/26 [40:36<00:00, 93.73s/it]


completed! PEM_df span: 3
101
101
performing TDA on  SER_monologs  span:  3


Processing Chunks:   0%|                                 | 0/20 [00:00<?, ?it/s]

16 nodes
16 embed len
32 nodes
32 embed len
23 nodes
23 embed len
19 nodes
19 embed len
35 nodes
35 embed len


Processing Chunks:   5%|█▎                       | 1/20 [00:01<00:28,  1.50s/it]

34 nodes
34 embed len
30 nodes
30 embed len
36 nodes
36 embed len
31 nodes
31 embed len
52 nodes
52 embed len


Processing Chunks:  10%|██▌                      | 2/20 [00:05<00:49,  2.77s/it]

52 nodes
52 embed len
25 nodes
25 embed len
33 nodes
33 embed len
29 nodes
29 embed len
31 nodes
31 embed len


Processing Chunks:  15%|███▊                     | 3/20 [00:08<00:54,  3.20s/it]

47 nodes
47 embed len
35 nodes
35 embed len
25 nodes
25 embed len
34 nodes
34 embed len
48 nodes
48 embed len


Processing Chunks:  20%|█████                    | 4/20 [00:14<01:05,  4.12s/it]

57 nodes
57 embed len
55 nodes
55 embed len
16 nodes
16 embed len
6 nodes
6 embed len
55 nodes
55 embed len


Processing Chunks:  25%|██████▎                  | 5/20 [00:25<01:38,  6.54s/it]

68 nodes
68 embed len
66 nodes
66 embed len
45 nodes
45 embed len
26 nodes
26 embed len
42 nodes
42 embed len


Processing Chunks:  30%|███████▌                 | 6/20 [00:50<03:01, 12.93s/it]

36 nodes
36 embed len
37 nodes
37 embed len
40 nodes
40 embed len
19 nodes
19 embed len
41 nodes
41 embed len


Processing Chunks:  35%|████████▊                | 7/20 [00:54<02:09,  9.93s/it]

58 nodes
58 embed len
48 nodes
48 embed len
52 nodes
52 embed len
42 nodes
42 embed len
48 nodes
48 embed len


Processing Chunks:  40%|██████████               | 8/20 [01:07<02:09, 10.83s/it]

45 nodes
45 embed len
43 nodes
43 embed len
107 nodes
107 embed len
48 nodes
48 embed len
55 nodes
55 embed len


Processing Chunks:  45%|███████████▎             | 9/20 [01:45<03:32, 19.33s/it]

64 nodes
64 embed len
30 nodes
30 embed len
25 nodes
25 embed len
28 nodes
28 embed len
22 nodes
22 embed len


Processing Chunks:  50%|████████████            | 10/20 [01:52<02:35, 15.58s/it]

39 nodes
39 embed len
34 nodes
34 embed len
43 nodes
43 embed len
26 nodes
26 embed len
51 nodes
51 embed len


Processing Chunks:  55%|█████████████▏          | 11/20 [01:57<01:51, 12.36s/it]

40 nodes
40 embed len
68 nodes
68 embed len
68 nodes
68 embed len
99 nodes
99 embed len
13 nodes
13 embed len


Processing Chunks:  60%|██████████████▍         | 12/20 [02:36<02:45, 20.64s/it]

31 nodes
31 embed len
38 nodes
38 embed len
36 nodes
36 embed len
39 nodes
39 embed len
8 nodes
9 embed len


Processing Chunks:  65%|███████████████▌        | 13/20 [02:39<01:46, 15.23s/it]

13 nodes
13 embed len
9 nodes
9 embed len
22 nodes
22 embed len
16 nodes
16 embed len
19 nodes
19 embed len


Processing Chunks:  70%|████████████████▊       | 14/20 [02:40<01:05, 10.87s/it]

24 nodes
24 embed len
26 nodes
26 embed len
18 nodes
18 embed len
38 nodes
38 embed len
29 nodes
29 embed len


Processing Chunks:  75%|██████████████████      | 15/20 [02:42<00:41,  8.33s/it]

26 nodes
26 embed len
35 nodes
35 embed len
33 nodes
33 embed len
37 nodes
37 embed len
38 nodes
38 embed len


Processing Chunks:  80%|███████████████████▏    | 16/20 [02:50<00:32,  8.01s/it]

35 nodes
35 embed len
38 nodes
38 embed len
28 nodes
28 embed len
25 nodes
25 embed len
21 nodes
21 embed len


Processing Chunks:  85%|████████████████████▍   | 17/20 [02:53<00:20,  6.69s/it]

33 nodes
33 embed len
10 nodes
10 embed len
14 nodes
14 embed len
38 nodes
38 embed len
22 nodes
22 embed len


Processing Chunks:  90%|█████████████████████▌  | 18/20 [02:55<00:10,  5.16s/it]

61 nodes
61 embed len
17 nodes
17 embed len
23 nodes
23 embed len
15 nodes
15 embed len
24 nodes
24 embed len


Processing Chunks:  95%|██████████████████████▊ | 19/20 [02:59<00:04,  4.94s/it]

27 nodes
27 embed len
36 nodes
36 embed len
16 nodes
16 embed len
40 nodes
40 embed len
38 nodes
38 embed len


Processing Chunks: 100%|████████████████████████| 20/20 [03:03<00:00,  9.19s/it]


completed! SER_monologs span: 3
107
107
performing TDA on  SER_IPSP  span:  3


Processing Chunks:   0%|                                 | 0/22 [00:00<?, ?it/s]

35 nodes
35 embed len
57 nodes
57 embed len
44 nodes
44 embed len
51 nodes
51 embed len
35 nodes
35 embed len


Processing Chunks:   5%|█▏                       | 1/22 [00:08<02:52,  8.19s/it]

38 nodes
38 embed len
51 nodes
51 embed len
52 nodes
52 embed len
60 nodes
60 embed len
57 nodes
57 embed len


Processing Chunks:   9%|██▎                      | 2/22 [00:28<05:03, 15.19s/it]

44 nodes
44 embed len
80 nodes
80 embed len
38 nodes
38 embed len
53 nodes
53 embed len
45 nodes
45 embed len


Processing Chunks:  14%|███▍                     | 3/22 [00:45<05:06, 16.11s/it]

51 nodes
51 embed len
50 nodes
50 embed len
45 nodes
45 embed len
46 nodes
46 embed len
81 nodes
81 embed len


Processing Chunks:  18%|████▌                    | 4/22 [01:06<05:23, 17.96s/it]

59 nodes
59 embed len
43 nodes
43 embed len
37 nodes
37 embed len
34 nodes
34 embed len
42 nodes
42 embed len


Processing Chunks:  23%|█████▋                   | 5/22 [01:16<04:17, 15.17s/it]

65 nodes
65 embed len
64 nodes
64 embed len
69 nodes
69 embed len
51 nodes
51 embed len
64 nodes
64 embed len


Processing Chunks:  27%|██████▊                  | 6/22 [01:42<04:59, 18.74s/it]

45 nodes
45 embed len
34 nodes
34 embed len
36 nodes
36 embed len
19 nodes
19 embed len
32 nodes
32 embed len


Processing Chunks:  32%|███████▉                 | 7/22 [01:46<03:32, 14.17s/it]

58 nodes
58 embed len
32 nodes
32 embed len
24 nodes
24 embed len
50 nodes
50 embed len
51 nodes
51 embed len


Processing Chunks:  36%|█████████                | 8/22 [01:56<02:55, 12.55s/it]

41 nodes
41 embed len
57 nodes
57 embed len
45 nodes
45 embed len
73 nodes
73 embed len
56 nodes
56 embed len


Processing Chunks:  41%|██████████▏              | 9/22 [02:20<03:32, 16.35s/it]

74 nodes
74 embed len
77 nodes
77 embed len
81 nodes
81 embed len
59 nodes
59 embed len
55 nodes
55 embed len


Processing Chunks:  45%|██████████▉             | 10/22 [03:00<04:43, 23.58s/it]

42 nodes
42 embed len
34 nodes
34 embed len
49 nodes
49 embed len
37 nodes
37 embed len
37 nodes
37 embed len


Processing Chunks:  50%|████████████            | 11/22 [03:04<03:13, 17.59s/it]

18 nodes
18 embed len
44 nodes
44 embed len
37 nodes
37 embed len
44 nodes
44 embed len
37 nodes
37 embed len


Processing Chunks:  55%|█████████████           | 12/22 [03:13<02:29, 14.96s/it]

56 nodes
56 embed len
38 nodes
38 embed len
61 nodes
61 embed len
56 nodes
56 embed len
78 nodes
78 embed len


Processing Chunks:  59%|██████████████▏         | 13/22 [03:42<02:53, 19.23s/it]

98 nodes
98 embed len
13 nodes
14 embed len
54 nodes
54 embed len
56 nodes
56 embed len
33 nodes
33 embed len


Processing Chunks:  64%|███████████████▎        | 14/22 [04:40<04:06, 30.81s/it]

48 nodes
48 embed len
18 nodes
18 embed len
23 nodes
23 embed len
26 nodes
26 embed len
17 nodes
17 embed len


Processing Chunks:  68%|████████████████▎       | 15/22 [04:42<02:35, 22.21s/it]

76 nodes
76 embed len
97 nodes
97 embed len
82 nodes
82 embed len
24 nodes
24 embed len
23 nodes
23 embed len


Processing Chunks:  73%|█████████████████▍      | 16/22 [06:11<04:13, 42.21s/it]

21 nodes
21 embed len
58 nodes
58 embed len
58 nodes
58 embed len
48 nodes
48 embed len
72 nodes
72 embed len


Processing Chunks:  77%|██████████████████▌     | 17/22 [06:28<02:54, 34.88s/it]

76 nodes
76 embed len
83 nodes
83 embed len
48 nodes
48 embed len
61 nodes
61 embed len
48 nodes
48 embed len


Processing Chunks:  82%|███████████████████▋    | 18/22 [06:58<02:13, 33.43s/it]

23 nodes
23 embed len
21 nodes
21 embed len
54 nodes
54 embed len
57 nodes
57 embed len
66 nodes
66 embed len


Processing Chunks:  86%|████████████████████▋   | 19/22 [07:12<01:22, 27.37s/it]

33 nodes
33 embed len
26 nodes
26 embed len
28 nodes
28 embed len
52 nodes
52 embed len
72 nodes
72 embed len


Processing Chunks:  91%|█████████████████████▊  | 20/22 [07:35<00:52, 26.24s/it]

67 nodes
67 embed len
28 nodes
28 embed len
42 nodes
42 embed len
51 nodes
51 embed len
44 nodes
44 embed len


Processing Chunks:  95%|██████████████████████▉ | 21/22 [07:51<00:22, 22.95s/it]

47 nodes
47 embed len
62 nodes
62 embed len


Processing Chunks: 100%|████████████████████████| 22/22 [07:57<00:00, 21.70s/it]


completed! SER_IPSP span: 3
108
108
performing TDA on  SER1  span:  3


Processing Chunks:   0%|                                 | 0/21 [00:00<?, ?it/s]

6 nodes
6 embed len
4 nodes
4 embed len
6 nodes
6 embed len
8 nodes
8 embed len
13 nodes
13 embed len


Processing Chunks:   5%|█▏                       | 1/21 [00:00<00:10,  1.85it/s]

10 nodes
10 embed len
8 nodes
9 embed len
10 nodes
10 embed len
5 nodes
5 embed len
5 nodes
5 embed len


Processing Chunks:  10%|██▍                      | 2/21 [00:00<00:09,  2.07it/s]

7 nodes
7 embed len
7 nodes
7 embed len
8 nodes
8 embed len
8 nodes
8 embed len
10 nodes
10 embed len


Processing Chunks:  14%|███▌                     | 3/21 [00:01<00:08,  2.20it/s]

7 nodes
7 embed len
9 nodes
9 embed len
5 nodes
5 embed len
3 nodes
3 embed len
5 nodes
5 embed len


Processing Chunks:  19%|████▊                    | 4/21 [00:01<00:06,  2.48it/s]

4 nodes
4 embed len
6 nodes
6 embed len
5 nodes
5 embed len
6 nodes
6 embed len
5 nodes
5 embed len


Processing Chunks:  24%|█████▉                   | 5/21 [00:02<00:06,  2.49it/s]

8 nodes
8 embed len
13 nodes
13 embed len
3 nodes
3 embed len
7 nodes
7 embed len
7 nodes
7 embed len


Processing Chunks:  29%|███████▏                 | 6/21 [00:02<00:05,  2.50it/s]

7 nodes
7 embed len
9 nodes
9 embed len
6 nodes
6 embed len
5 nodes
5 embed len
8 nodes
8 embed len


Processing Chunks:  33%|████████▎                | 7/21 [00:02<00:05,  2.52it/s]

6 nodes
6 embed len
6 nodes
6 embed len
11 nodes
11 embed len
6 nodes
6 embed len
8 nodes
8 embed len


Processing Chunks:  38%|█████████▌               | 8/21 [00:03<00:05,  2.44it/s]

7 nodes
7 embed len
4 nodes
4 embed len
13 nodes
13 embed len
18 nodes
18 embed len
10 nodes
10 embed len


Processing Chunks:  43%|██████████▋              | 9/21 [00:03<00:05,  2.29it/s]

21 nodes
21 embed len
19 nodes
19 embed len
9 nodes
9 embed len
10 nodes
10 embed len
7 nodes
7 embed len


Processing Chunks:  48%|███████████▍            | 10/21 [00:04<00:05,  1.95it/s]

9 nodes
9 embed len
15 nodes
15 embed len
6 nodes
6 embed len
4 nodes
4 embed len
10 nodes
10 embed len


Processing Chunks:  52%|████████████▌           | 11/21 [00:05<00:05,  1.92it/s]

5 nodes
5 embed len
8 nodes
8 embed len
4 nodes
4 embed len
4 nodes
4 embed len
6 nodes
6 embed len


Processing Chunks:  57%|█████████████▋          | 12/21 [00:05<00:04,  1.88it/s]

7 nodes
7 embed len
17 nodes
17 embed len
9 nodes
9 embed len
7 nodes
7 embed len
11 nodes
11 embed len


Processing Chunks:  62%|██████████████▊         | 13/21 [00:06<00:04,  1.91it/s]

8 nodes
8 embed len
7 nodes
7 embed len
5 nodes
5 embed len
6 nodes
6 embed len
9 nodes
9 embed len


Processing Chunks:  67%|████████████████        | 14/21 [00:06<00:03,  1.93it/s]

6 nodes
6 embed len
13 nodes
13 embed len
6 nodes
6 embed len
4 nodes
4 embed len
9 nodes
9 embed len


Processing Chunks:  71%|█████████████████▏      | 15/21 [00:07<00:02,  2.02it/s]

4 nodes
4 embed len
4 nodes
4 embed len
6 nodes
6 embed len
3 nodes
3 embed len
6 nodes
6 embed len


Processing Chunks:  76%|██████████████████▎     | 16/21 [00:07<00:02,  2.12it/s]

3 nodes
3 embed len
8 nodes
8 embed len
3 nodes
3 embed len
4 nodes
4 embed len
7 nodes
7 embed len


Processing Chunks:  81%|███████████████████▍    | 17/21 [00:07<00:01,  2.28it/s]

7 nodes
7 embed len
6 nodes
6 embed len
6 nodes
6 embed len
7 nodes
7 embed len
8 nodes
8 embed len


Processing Chunks:  86%|████████████████████▌   | 18/21 [00:08<00:01,  2.21it/s]

8 nodes
8 embed len
7 nodes
7 embed len
6 nodes
6 embed len
7 nodes
7 embed len
5 nodes
5 embed len


Processing Chunks:  90%|█████████████████████▋  | 19/21 [00:08<00:00,  2.12it/s]

7 nodes
7 embed len
8 nodes
8 embed len
6 nodes
6 embed len
7 nodes
7 embed len
8 nodes
8 embed len


Processing Chunks:  95%|██████████████████████▊ | 20/21 [00:09<00:00,  2.14it/s]

8 nodes
8 embed len
5 nodes
5 embed len
10 nodes
10 embed len
11 nodes
11 embed len


Processing Chunks: 100%|████████████████████████| 21/21 [00:09<00:00,  2.17it/s]


completed! SER1 span: 3
66
66
performing TDA on  MASM  span:  3


Processing Chunks:   0%|                                 | 0/14 [00:00<?, ?it/s]

51 nodes
54 embed len
54 nodes
54 embed len
70 nodes
70 embed len
52 nodes
52 embed len
36 nodes
36 embed len


Processing Chunks:   7%|█▊                       | 1/14 [00:09<01:59,  9.19s/it]

65 nodes
65 embed len
55 nodes
55 embed len
36 nodes
36 embed len
56 nodes
56 embed len
47 nodes
47 embed len


Processing Chunks:  14%|███▌                     | 2/14 [00:14<01:20,  6.69s/it]

42 nodes
42 embed len
39 nodes
39 embed len
81 nodes
81 embed len
89 nodes
89 embed len
88 nodes
88 embed len


Processing Chunks:  21%|█████▎                   | 3/14 [00:48<03:30, 19.10s/it]

40 nodes
40 embed len
54 nodes
54 embed len
60 nodes
60 embed len
74 nodes
74 embed len
52 nodes
52 embed len


Processing Chunks:  29%|███████▏                 | 4/14 [01:01<02:49, 16.91s/it]

68 nodes
68 embed len
70 nodes
70 embed len
70 nodes
70 embed len
83 nodes
83 embed len
72 nodes
72 embed len


Processing Chunks:  36%|████████▉                | 5/14 [01:24<02:50, 18.92s/it]

50 nodes
50 embed len
27 nodes
27 embed len
19 nodes
19 embed len
38 nodes
38 embed len
35 nodes
36 embed len


Processing Chunks:  43%|██████████▋              | 6/14 [01:28<01:50, 13.87s/it]

21 nodes
21 embed len
51 nodes
51 embed len
57 nodes
57 embed len
24 nodes
24 embed len
50 nodes
50 embed len


Processing Chunks:  50%|████████████▌            | 7/14 [01:32<01:14, 10.68s/it]

28 nodes
28 embed len
71 nodes
71 embed len
53 nodes
53 embed len
153 nodes
153 embed len
143 nodes
143 embed len


In [None]:
for SP in [1,2,3]:
    for name in ['PEM_df','SER_IPSP','SER_monologs','MASM', 'cleaned_DEI']:
        data_save_dir=working_dir+'TDA_output/'
        data_save_dir_name=working_dir+f'TDA_output/{name}_{SP}_mean/'
        list_files=sorted(os.listdir(data_save_dir_name))
        dfs=[]
        for f in list_files:
            dfs.append(pd.read_csv(data_save_dir_name+f))
        data=pd.concat(dfs)
        data.to_csv(data_save_dir+f'{name}_{SP}_back_utterance_TDA_results.csv')
        print(name, len(data))

In [26]:
test_mode=False
plot=False
if test_mode:
    save=False
else:
    save=True
print('TEST MODE')

threshold=244
# infile = open(f'/home/ll16598/Documents/POSTDOC/Context-DATM/sentenceBERT_cluster_dicts_{window}_{embedding_step}/cluster_dictionary_{save_thresh}','rb')
# cluster_dictionary=pickle.load(infile)
# infile.close()

user='luke'
if user=='luke':
    working_dir='/home/ll16598/Documents/POSTDOC/'
    dir_atom_dfs='/home/ll16598/Documents/POSTDOC/TDA/TDA_cluster/atom_assigned_dfs'
    dir_array='/home/ll16598/Documents/POSTDOC/TDA/TDA_cluster/vector_assigned_dfs'
elif user=='cluster':
    working_dir='/N/u/lleckie/Quartz/TDA/'
    dir_atom_dfs=working_dir+'/atom_assigned_dfs'

    dir_array=working_dir+'vector_assigned_dfs'

#df_drug=pd.read_csv(f'.{}/df_monolog_{threshold}.csv')


TEST MODE


In [27]:
# 3) Define a helper function to transform a single row's embeddings
ML=3
embeddings='sentence_embeddings'
reduce_dims=True
SPARSE=True
sparse_param=0.5
dims_simplex=3
chunk_size=5
df_monologs=pd.read_csv(f'{dir_atom_dfs}/df_monolog_{threshold}.csv')
df_SER2=pd.read_csv(f'{dir_atom_dfs}/df_SER2_{threshold}.csv')
df_PEM=pd.read_csv(f'{dir_atom_dfs}/df_PEM.csv')
df_SER_MA=pd.read_csv(f'{dir_atom_dfs}/SER1.csv')


In [5]:
import gc
import shutil
df_names=['PEM_df','SER_monologs', 'SER_IPSP', 'SER1']
data_save_dir=working_dir+'TDA_output/'
os.makedirs(data_save_dir, exist_ok=True)

completed_files=os.listdir(data_save_dir)
# for overlap in [0.1,0.2,0.4]:
#     for window in [60,80,100,120,140,160,180,200]:

# for overlap in [0.1,0.2,0.4]:
#     for window in [60,80,100,120,140,160,180,200]:
        
for overlap in [0.1]:
    for window in ['utterances']:
        layers='last'

        dfs=[df_PEM, df_monologs, df_SER2, df_SER_MA]

        for df_no, df_monolog in enumerate(dfs):

            df_name=df_names[df_no]
#             if newfilename in completed_files:
#                 print(f'Already completed {newfilename}')
#                 continue
            data_save_dir_name=working_dir+f'TDA_output/{df_name}/'
            if os.path.exists(data_save_dir_name):
                shutil.rmtree(data_save_dir_name) 
            os.makedirs(data_save_dir_name, exist_ok=True)

            if window=='utterances':
                with open(f'{dir_array}/utterance_{df_name}_sentence_embeddings_arrays.pkl', 'rb') as f:
                    embeds= pickle.load(f)   
                newfilename=f'{df_name}_utterance_distance_results.csv'

            else:
                step=int(window*overlap)#4
                with open(f'{dir_array}/{window}_{step}_{df_name}_sentence_embeddings_arrays.pkl', 'rb') as f:
                    embeds = pickle.load(f)   
                newfilename=f'{df_name}_{window}_{step}_distance_results.csv'
            
            if len(embeds)!=len(df_monolog):
                raise Exception('MISMATCH IN LENGTH')
                
            df_monolog['sentence_embeddings'] = embeds


            print(len(embeds))
            print(len(df_monolog))
            if len(embeds)!=len(df_monolog):
                raise Exception('MISMATCH IN LENGTH')

            df_monolog['sentence_embeddings'] = embeds
            df_monolog['length'] = [len(i) for i in embeds]
            if test_mode:
                df_monolog=df_monolog[0:10]
            df_monolog = df_monolog[
                df_monolog["sentence_embeddings"].apply(
                    lambda x: (
                        not isinstance(x, float)               # exclude floats
                        and isinstance(x, (list, tuple, np.ndarray))  # must be list/tuple/np.ndarray
                        and len(x) >= 3                        # length >= 3
                    )
                )]

            # 4) Apply that function to each row -> produce a new column
            if reduce_dims:
                all_vecs = []
                for row in df_monolog['sentence_embeddings']:
                    arr = np.array(row)  
                    all_vecs.append(arr)
                big_matrix = np.concatenate(all_vecs, axis=0)
                pca = PCA(n_components=50)
                pca.fit(big_matrix)
                def transform_embeddings(emb_list):
                    emb_array = np.array(emb_list)   # shape (k_i, 384)
                    emb_pca = pca.transform(emb_array)  # shape (k_i, 50)
                    return emb_pca
                df_monolog['sentence_embeddings'] = df_monolog['sentence_embeddings'].apply(transform_embeddings)
            df_monolog['token_embeddings']=None
            print('performing TDA on ',df_name, ' window: ', window)#, 'step: ', step)
            
            for fi in range(0, len(df_monolog), chunk_size):
                
                if fi+chunk_size>=len(df_monolog):
                    df_subset=df_monolog[fi:].reset_index(drop=True)
                else:
                    df_subset=df_monolog[fi:fi+chunk_size].reset_index(drop=True)
                

                drugs=list(set(df_subset['Drug']))
                #Participants=list(set(df_subset['Participant']))
                df_subset=get_rips_time(df_subset,embeddings=embeddings)
               # df_subset=get_rips_time_centroid(df_subset,embeddings=embeddings)
                #df_monolog=get_simplices_over_time(df_monolog,simplex_tree_type='simplex_tree')
                print('got RIPS')
                df_with_graph=get_rips_complex_G(df_subset)
                print('got G')
                print(df_with_graph['graph'][0].nodes())
                #df_with_graph['euler'] = df_with_graph['rt_simplex_tree'].apply(lambda st: compute_euler_characteristic(st, max_dim=4))
                # Apply the function to each graph in df_with_graph
                graph_metrics = df_with_graph['graph'].apply(compute_graph_metrics)

                print(graph_metrics)
                graph_metrics_df = pd.DataFrame(graph_metrics.tolist())
                df_with_graph = pd.concat([df_with_graph, graph_metrics_df], axis=1)
                            # Create a new DataFrame

                # We'll accumulate new rows in a list of dicts
                new_rows = []
                dimensions = [0, 1, 2]

                for idx, row in df_with_graph.iterrows():
                    embed = row[embeddings]  # Adjust as needed
                    # We’ll store births, deaths, pers LENGTHS in a dict keyed by dimension
                    dim_dict = {
                        dim: {'births': [], 'deaths': [], 'pers': []}
                        for dim in dimensions
                    }



                    # Build the Rips Complex for *this row only*
                    rips_complex =row['rips']
                    try:
                        simplex_tree = \
                        rips_complex.create_simplex_tree(max_dimension=dims_simplex)
                    except Exception as e:
                        continue
                    persistence = simplex_tree.persistence()

                    # Collect intervals by dimension
                    for dim, (b, d) in persistence:
                        if d == float('inf'):
                            continue
                        if dim in dimensions:
                            dim_dict[dim]['births'].append(b)
                            dim_dict[dim]['deaths'].append(d)
                            dim_dict[dim]['pers'].append(d - b)


                    row_dict = row.to_dict()  # Start with original row's columns

                    for dim in dimensions:
                        bdp = dim_dict[dim]
                        stats_dict = compute_distribution_stats(bdp['births'], bdp['deaths'], bdp['pers'])
                        # prefix each stat key with dim
                        for stat_key, stat_val in stats_dict.items():
                            row_dict[f"{stat_key}_dim{dim}"] = stat_val

                    # Add row_dict to new_rows
                    new_rows.append(row_dict)

                # Create a new DataFrame
                df_with_tda = pd.DataFrame(new_rows)
                if window=='utterances':
                    df_with_tda.to_csv(data_save_dir_name + f'{df_name}_{fi}_{window}_TDA_results.csv')
                else:
                    df_with_tda.to_csv(data_save_dir_name + f'{df_name}_{fi}_{window}_{step}_TDA_results.csv')
                print(f'completed! {df_name} window: {window}')
                del df_subset
                del df_with_graph
                gc.collect()

130
130
performing TDA on  PEM_df  window:  utterances
rips mem 0 285.8203125
simplex mem 346.5078125
persistence mem 402.87109375
rips max mem 402.87109375
rips mem 1 402.87109375
simplex mem 432.5625
persistence mem 432.5625
rips max mem 432.5625
rips mem 2 432.5625
simplex mem 432.5625
persistence mem 432.5625
rips max mem 432.5625
rips mem 3 432.5625
simplex mem 432.5625
persistence mem 432.5625
rips max mem 432.5625
rips mem 4 432.5625
simplex mem 432.5625
persistence mem 432.5625
rips max mem 432.5625
got RIPS
got G
[0, 5, 7, 8, 18, 23, 28, 38, 39, 76, 43, 24, 37, 67, 55, 59, 66, 40, 58, 44, 77, 1, 2, 3, 4, 13, 27, 30, 31, 53, 60, 19, 61, 62, 63, 26, 68, 75, 33, 57, 29, 36, 71, 11, 12, 20, 21, 22, 64, 14, 10, 52, 25, 16, 46, 47, 81, 65, 17, 32, 35, 9, 56, 80, 42, 15, 69, 48, 70, 45, 54, 6, 78, 34, 72, 73, 74, 49, 51, 41, 79, 50]
0    {'shortest_path_unweighted': 2, 'nodes': 82, '...
1    {'shortest_path_unweighted': 2, 'nodes': 75, '...
2    {'shortest_path_unweighted': 1, 'nodes

persistence mem 654.16015625
rips max mem 654.16015625
rips mem 2 654.16015625
simplex mem 654.16015625
persistence mem 654.14453125
rips max mem 654.14453125
rips mem 3 654.14453125
simplex mem 654.14453125
persistence mem 654.14453125
rips max mem 654.14453125
rips mem 4 654.14453125
simplex mem 654.14453125
persistence mem 654.14453125
rips max mem 654.14453125
got RIPS
got G
[0, 4, 22, 23, 26, 27, 28, 30, 33, 37, 49, 53, 58, 59, 62, 66, 69, 75, 43, 79, 73, 52, 77, 32, 78, 1, 2, 3, 7, 9, 11, 18, 20, 21, 44, 45, 50, 57, 63, 64, 65, 68, 76, 46, 17, 13, 51, 6, 12, 16, 55, 56, 61, 72, 47, 54, 71, 81, 15, 14, 74, 60, 67, 70, 82, 80, 42, 19, 29, 5, 38, 10, 34, 31, 36, 8, 35, 48, 24, 39, 41, 40]
0    {'shortest_path_unweighted': 2, 'nodes': 82, '...
1    {'shortest_path_unweighted': 2, 'nodes': 68, '...
2    {'shortest_path_unweighted': 1, 'nodes': 89, '...
3    {'shortest_path_unweighted': 2, 'nodes': 17, '...
4    {'shortest_path_unweighted': 1, 'nodes': 59, '...
Name: graph, dtype: obje

KeyboardInterrupt: 

using max pooling


130
130
performing TDA on  PEM_df  span:  1


Processing Chunks:   0%|                                 | 0/26 [00:00<?, ?it/s]


TypeError: must be real number, not NoneType

In [11]:
SP=2
for name in ['PEM_df','SER_IPSP','SER_monologs','MASM', 'cleaned_DEI']:
    data_save_dir=working_dir+'TDA_output/'
    data_save_dir_name=working_dir+f'TDA_output/{name}_{SP}/'
    list_files=sorted(os.listdir(data_save_dir_name))
    dfs=[]
    for f in list_files:
        dfs.append(pd.read_csv(data_save_dir_name+f))
    data=pd.concat(dfs)
    data.to_csv(data_save_dir+f'{name}_{SP}_back_utterance_TDA_results.csv')
    print(name, len(data))

PEM_df 130
SER_IPSP 107
SER_monologs 100
MASM 66
cleaned_DEI 73


In [29]:
df_names=['PEM_df','SER_monologs', 'SER_IPSP', 'SER1', 'MASM', 'cleaned_DEI']
    data_save_dir=working_dir+'TDA_output/'
    data_save_dir_name=working_dir+f'TDA_output/{name}_{1}/'
    list_files=sorted(os.listdir(data_save_dir_name))
    dfs=[]
    for f in list_files:
        dfs.append(pd.read_csv(data_save_dir_name+f))
    data=pd.concat(dfs)
    data.to_csv(data_save_dir+f'{name}_{1}_utterance_TDA_results.csv')

PEM_df 130
SER_IPSP 107
SER_monologs 100
MASM 66
cleaned_DEI 73


In [29]:
for name in ['PEM_df','SER_monologs', 'SER_IPSP', 'SER1']:
    data_save_dir=working_dir+'TDA_output/'
    data_save_dir_name=working_dir+f'TDA_output/{name}/'
    list_files=sorted(os.listdir(data_save_dir_name))
    dfs=[]
    for f in list_files:
        dfs.append(pd.read_csv(data_save_dir_name+f))
    data=pd.concat(dfs)
    data.to_csv(data_save_dir+f'{name}_utterances_TDA_results.csv')

In [16]:
data_save_dir

'/home/ll16598/Documents/POSTDOC/TDA_output/'

In [None]:
data_save_dir_name=working_dir+f'TDA_output/{df_name}/'
os.makedirs(data_save_dir_name, exist_ok=True)

df_monolog['token_embeddings']=None
print('performing TDA on ',df_name, ' window: ', window, 'step: ', step)

for fi in range(0, len(df_monolog), chunk_size):

    if fi+chunk_size>=len(df_monolog):
        df_subset=df_monolog[fi:]
    else:
        df_subset=df_monolog[fi:fi+chunk_size]

    df_subset=get_rips_time(df_subset,embeddings=embeddings)
    df_subset=get_rips_time_centroid(df_subset,embeddings=embeddings)
    print('completed rips')
    df_subset=get_simplices_over_time(df_subset,simplex_tree_type='simplex_tree')
    for D in [2, 3, 4]:
        # Explode the lists in the columns for the current dimension.
        try:
            df_exploded = df_subset.explode([f"simplex_time_dim{D}_filtration", f"simplex_time_dim{D}_count"])
        except Exception as e:
            continue
       # Convert the exploded columns to numeric.
        df_exploded[f"simplex_time_dim{D}_filtration"] = pd.to_numeric(df_exploded[f"simplex_time_dim{D}_filtration"])
        df_exploded[f"simplex_time_dim{D}_count"] = pd.to_numeric(df_exploded[f"simplex_time_dim{D}_count"])

        # Group by "Drug" and the filtration values, and compute the mean and standard error for the counts.
        grouped = df_exploded.groupby(["Drug", f"simplex_time_dim{D}_filtration"], as_index=False).agg(
            alive_mean=(f"simplex_time_dim{D}_count", "mean"),
            alive_se=(f"simplex_time_dim{D}_count", sem)  # standard error
        )
        if save:
            df_exploded.to_csv(data_save_dir_name + f'{fi}_{df_name}_{window}_{step}_{D}_skeleton_simplices_over_time.csv', index=False)


        if plot:
            import matplotlib.pyplot as plt
            # Create a plot for the current dimension.
            fig, ax = plt.subplots(figsize=(8, 6))

            # Iterate over each drug group and plot mean ± SE.
            for drug_level, df_sub in grouped.groupby("Drug"):
                ax.errorbar(
                    df_sub[f"simplex_time_dim{D}_filtration"],
                    df_sub["alive_mean"],
                    yerr=df_sub["alive_se"],
                    label=f"Drug={drug_level}",
                    marker='o',
                    capsize=3
                )

            ax.set_xlabel("Filtration Value (Distance Threshold)")
            ax.set_ylabel("Number of Alive Components (Mean ± SE)")
            ax.set_title(f"Dimension {D} Alive Components Over Filtration Value by Drug")
            ax.legend()
            plt.show()

    for D in [0,1,2]:
        df_exploded = df_subset.explode([f"scales_dim{D}", f'alive_dim{D}'])
        df_exploded[f"scales_dim{D}"] = pd.to_numeric(df_exploded[f"scales_dim{D}"])
        df_exploded[f'alive_dim{D}'] = pd.to_numeric(df_exploded[f'alive_dim{D}'])
        grouped = df_exploded.groupby(["Drug", f"scales_dim{D}"], as_index=False).agg(
            alive_mean=(f'alive_dim{D}', "mean"),
            alive_se=(f'alive_dim{D}', sem)  # standard error
        )
        df_exploded.to_csv(data_save_dir_name+f'{fi}_{df_name}_{window}_{step}_{D}_simplices_over_time.csv')
        if plot:

            fig, ax = plt.subplots(figsize=(8,6))

            # We'll iterate over each drug and plot mean ± SE
            for drug_level, df_sub in grouped.groupby("Drug"):
                ax.errorbar(
                    df_sub[f"scales_dim{D}"], 
                    df_sub["alive_mean"], 
                    yerr=df_sub["alive_se"], 
                    label=f"Drug={drug_level}",
                    marker='o',
                    capsize=3
                )

            ax.set_xlabel("Scale (distance threshold)")
            ax.set_ylabel("Number of Alive Components (Mean ± SE)")
            ax.set_title("Connected Components Over Scale by Drug")
            ax.legend()
            plt.show()

    df_with_graph=get_rips_complex_G(df_subset)
    #df_with_graph['euler'] = df_with_graph['rt_rips'].apply(lambda st: compute_euler_characteristic(st, max_dim=4))
    # Apply the function to each graph in df_with_graph
    graph_metrics = df_with_graph['graph'].apply(compute_graph_metrics)
    graph_metrics_df = pd.DataFrame(graph_metrics.tolist())
    df_with_graph = pd.concat([df_with_graph, graph_metrics_df], axis=1)



    dimensions = [0, 1, 2]

    # We'll accumulate new rows in a list of dicts
    new_rows = []

    for idx, row in df_subset.iterrows():
        embed = row[embeddings]  # Adjust as needed
        # We’ll store births, deaths, pers LENGTHS in a dict keyed by dimension
        dim_dict = {
            dim: {'births': [], 'deaths': [], 'pers': []}
            for dim in dimensions
        }



        # Build the Rips Complex for *this row only*
        rips_complex =row['rips']
        try:
            simplex_tree = rips_complex.create_simplex_tree(max_dimension=dims_simplex)
        except Exception as e:
            continue
        persistence = simplex_tree.persistence()

        # Collect intervals by dimension
        for dim, (b, d) in persistence:
            if d == float('inf'):
                continue
            if dim in dimensions:
                dim_dict[dim]['births'].append(b)
                dim_dict[dim]['deaths'].append(d)
                dim_dict[dim]['pers'].append(d - b)


        row_dict = row.to_dict()  # Start with original row's columns

        for dim in dimensions:
            bdp = dim_dict[dim]
            stats_dict = compute_distribution_stats(bdp['births'], bdp['deaths'], bdp['pers'])
            # prefix each stat key with dim
            for stat_key, stat_val in stats_dict.items():
                row_dict[f"{stat_key}_dim{dim}"] = stat_val

        # Add row_dict to new_rows
        new_rows.append(row_dict)

    # Create a new DataFrame
    print('completed',f'{df_name}_{window}_{step}')
    df_with_tda = pd.DataFrame(new_rows)
    if reduce_dims:
        df_with_tda.to_csv(data_save_dir_name + f'{fi}_D50_{SP}{df_name}_{window}_{step}_TDA_results.csv')
    else:
        df_with_tda.to_csv(data_save_dir_name + f'{fi}_{SP}{df_name}_{window}_{step}_TDA_results.csv')
    print(f'completed! {df_name} window: {window} step size: {step}')

In [None]:

df_names=['SER_monologs', 'PEM_df', 'SER_IPSP', 'SER1']
data_save_dir=working_dir+'TDA_output/'
os.makedirs(data_save_dir, exist_ok=True)

completed_files=os.listdir(data_save_dir)
# for overlap in [0.1,0.2,0.4]:
#     for window in [60,80,100,120,140,160,180,200]:

# for overlap in [0.1,0.2,0.4]:
#     for window in [60,80,100,120,140,160,180,200]:
        
for overlap in [0.1]:
    for window in [100]:
        step=int(window*overlap)#4
        layers='last'

        dfs=[df_monologs, df_PEM, df_SER2, df_SER_MA]

        for df_no, df_monolog in enumerate(dfs):

            df_name=df_names[df_no]
            newfilename=f'{df_name}_{window}_{step}_TDA_results.csv'
#             if newfilename in completed_files:
#                 print(f'Already completed {newfilename}')
#                 continue
        
            with open(f'{dir_array}/{window}_{step}_{df_name}_sentence_embeddings_arrays.pkl', 'rb') as f:
                embeds = pickle.load(f)
            
            df_monolog['sentence_embeddings'] = embeds


            print(len(embeds))
            print(len(df_monolog))
            if len(embeds)!=len(df_monolog):
                raise Exception('MISMATCH IN LENGTH')

            df_monolog['sentence_embeddings'] = embeds
            df_monolog['length'] = [len(i) for i in embeds]
            if test_mode:
                df_monolog=df_monolog[0:10]
            df_monolog = df_monolog[
                df_monolog["sentence_embeddings"].apply(
                    lambda x: (
                        not isinstance(x, float)               # exclude floats
                        and isinstance(x, (list, tuple, np.ndarray))  # must be list/tuple/np.ndarray
                        and len(x) >= 3                        # length >= 3
                    )
                )]

            # 4) Apply that function to each row -> produce a new column
            if reduce_dims:
                all_vecs = []
                for row in df_monolog['sentence_embeddings']:
                    arr = np.array(row)  
                    all_vecs.append(arr)
                big_matrix = np.concatenate(all_vecs, axis=0)
                pca = PCA(n_components=50)
                pca.fit(big_matrix)
                def transform_embeddings(emb_list):
                    emb_array = np.array(emb_list)   # shape (k_i, 384)
                    emb_pca = pca.transform(emb_array)  # shape (k_i, 50)
                    return emb_pca
                df_monolog['sentence_embeddings'] = df_monolog['sentence_embeddings'].apply(transform_embeddings)
            df_monolog['token_embeddings']=None
            print('performing TDA on ',df_name, ' window: ', window, 'step: ', step)

            for fi in range(0, len(df_monolog), chunk_size):
                
                if fi+chunk_size>=len(df_monolog):
                    df_subset=df_monolog[fi:]
                else:
                    df_subset=df_monolog[fi:fi+chunk_size]
                

                drugs=list(set(df_subset['Drug']))
                Participants=list(set(df_subset['Participant']))
                df_subset=get_rips_time(df_subset,embeddings=embeddings)
               # df_subset=get_rips_time_centroid(df_subset,embeddings=embeddings)
                #df_monolog=get_simplices_over_time(df_monolog,simplex_tree_type='simplex_tree')

                df_with_graph=get_rips_complex_G(df_subset)
                df_with_graph['euler'] = df_with_graph['rt_simplex_tree'].apply(lambda st: compute_euler_characteristic(st, max_dim=4))
                # Apply the function to each graph in df_with_graph
                graph_metrics = df_with_graph['graph'].apply(compute_graph_metrics)
                print(graph_metrics)
                graph_metrics_df = pd.DataFrame(graph_metrics.tolist())
                df_with_graph = pd.concat([df_with_graph, graph_metrics_df], axis=1)
                            # Create a new DataFrame
                df_with_graph.to_csv(data_save_dir + f'{df_name}_{fi}_{window}_{step}_TDA_results.csv')
                df_subset=None
                df_with_graph=None
                print(f'completed! {df_name} window: {window} step size: {step}')

In [None]:
def get_rips_time(df, embeddings='sentence_embeddings', step=0.025):
    """
    For each row in df, build a Rips complex, extract dimension-D intervals
    (e.g., D=0 => connected components, D=1 => loops, etc.),
    then compute how many such features are 'alive' at increments of 'step'.

    Creates two new columns in df:
    - f"scales_dim{D}": The scale values
    - f"alive_dim{D}": The counts of alive features at each scale
    """
    # Copy the DataFrame to avoid SettingWithCopy warnings
    df = df.copy()
    
    # Prepare two new columns (lists)
    df[f'scales_dim0'] = None
    df[f'alive_dim0'] = None
    df[f'scales_dim1'] = None
    df[f'alive_dim1'] = None
    df[f'scales_dim2'] = None
    df[f'alive_dim2'] = None
    df['rt'] = None
    df['simplex_tree']=None
    df["rt_rips"]=None
    
    for idx, row in df.iterrows():
        # Get the embeddings for this row
        embed = row[embeddings]
        if not isinstance(embed, (list, np.ndarray)) or len(embed) == 0:
            continue
        
        # Build the Rips Complex
        rips_complex = gd.RipsComplex(points=embed, max_edge_length=3)
        simplex_tree = rips_complex.create_simplex_tree(max_dimension=dims_simplex)

        # Extract dimension-D intervals from persistence
        persistence = simplex_tree.persistence()
        for D in [0,1,2]:
            births_dimD = []
            deaths_dimD = []

            for dim, (b, d) in persistence:
                if dim == D and d != float('inf'):  # ignoring infinite intervals
                    births_dimD.append(b)
                    deaths_dimD.append(d)

            # Compute how many features are alive at each scale
            scales, alive_components = get_alive_components_over_scales(births_dimD, deaths_dimD, step=step)
            if len(deaths_dimD)>0:
                df.at[idx, f"rt"] = max(deaths_dimD)

            # Store these lists in the new columns
            df.at[idx, f"scales_dim{D}"] = scales
            df.at[idx, f"alive_dim{D}"] = alive_components
        try:
            rips_complex_max = gd.RipsComplex(points=embed, max_edge_length=df["rt"].loc[idx])
        except Exception as e:
            print(idx, df["rt"].loc[idx], embed)
        simplex_tree_max = rips_complex.create_simplex_tree(max_dimension=dims_simplex)
        df.at[idx, f"simplex_tree"] = simplex_tree
        df.at[idx, f"rt_simplex_tree"] = simplex_tree_max
        df.at[idx, f"rt_rips"] = rips_complex_max

    return df

In [None]:
def visualize_rips_simplicial_complex(rt_rips, simp_tree, dataset_name, entry, max_edge_length=5.0,SAVE=False):
    """
    1) Builds a Rips complex (via GUDHI) from a set of high-dimensional points.
    2) Extracts simplices (up to dimension 2) from the simplex tree.
       - Edges (1-simplices) and triangles (2-simplices).
    3) Uses PCA to reduce the points to 3D.
    4) Plots a 3D visualization:
       - Nodes are shown as a scatter plot.
       - Edges are drawn as lines.
       - Triangles are drawn as filled polygons (using Poly3DCollection).
    
    Parameters:
    -----------
    embed : np.ndarray of shape (N, D)
        The high-dimensional point cloud.
    max_edge_length : float
        The maximum edge length used in the Rips complex.
    """
#     # 1) Build the Rips complex and create the simplex tree
#     rips_complex = gd.RipsComplex(points=embed, max_edge_length=max_edge_length)
    simplex_tree = rt_rips.create_simplex_tree(max_dimension=5)
    
#     # 2) Extract simplices:
    edges = []
    triangles = []
    
    # get_skeleton(2) returns all simplices up to dimension 2
    for simplex, fvalue in simplex_tree.get_skeleton(4):
        if len(simplex) == 2:
            # 1-simplices: edges
            edges.append(simplex)
        elif len(simplex) == 3:
            # 2-simplices: triangles
            triangles.append(simplex)
    G=nx.Graph()        
    for simplex, fvalue in simplex_tree.get_skeleton(4):
        if len(simplex) >= 2:
            for (i, j) in itertools.combinations(simplex, 2):
                G.add_edge(i, j, weight=fvalue)
    metrics=compute_graph_metrics(G)
    print(metrics)
    # 3) Use PCA to reduce the point cloud to 3D
    pca = PCA(n_components=3)
    coords_3d = pca.fit_transform(embed)  # shape (N, 3)
    n_points = coords_3d.shape[0]
    
    # Prepare colormap for nodes (using 'magma_r')
    norm = plt.Normalize(vmin=0, vmax=n_points - 1)
    cmap = plt.get_cmap('plasma_r')
    node_colors = cmap(norm(np.arange(n_points)))
    
    # 4) Create the 3D plot
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    
    # Plot nodes
    sc = ax.scatter(coords_3d[:, 0], coords_3d[:, 1], coords_3d[:, 2],
                    c=node_colors, s=30, alpha=0.9)
    
    # Plot edges as lines
    for edge in edges:
        i, j = edge
        x_vals = [coords_3d[i, 0], coords_3d[j, 0]]
        y_vals = [coords_3d[i, 1], coords_3d[j, 1]]
        z_vals = [coords_3d[i, 2], coords_3d[j, 2]]
        # Optionally, color edge based on one endpoint's index or the average.
        avg_idx = int(np.mean(edge))
        edge_color = cmap(norm(avg_idx))
        ax.plot(x_vals, y_vals, z_vals, color=edge_color, alpha=0.8, linewidth=1.5)
    
    # Plot triangles as filled faces
    face_polys = []
    face_colors = []
    for tri in triangles:
        # Get the 3 vertices for this triangle
        pts = [coords_3d[idx] for idx in tri]
        face_polys.append(pts)
        # Color can be computed from the average index of the triangle's vertices
        avg_idx = int(np.mean(tri))
        face_colors.append(cmap(norm(avg_idx)))
    
    # Create a Poly3DCollection for the triangles with a set transparency (alpha)
    poly_collection = Poly3DCollection(face_polys, alpha=0.3, edgecolor='k')
    poly_collection.set_facecolor(face_colors)
    ax.add_collection3d(poly_collection)
    
    # Set title and labels
    ax.set_title(f"", pad=20)
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.set_zlabel("PCA 3")
    
    # Add colorbar for node indices
    sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax, pad=0.1)
    cbar.set_label("Node Index")
        # Define three different viewing angles
    angles = [(15, 180), (30, 90), (45, 0)]  # (elevation, azimuth) in degrees
    dir_fig_save=working_dir+f'rips_skeletons/{dataset_name}_{window}_{step}/'
    os.makedirs(dir_fig_save, exist_ok=True)

    # Save figures from different angles
    for i, (elev, azim) in enumerate(angles):
        ax.view_init(elev=elev, azim=azim)  # Set camera angle
        filename = dir_fig_save+f"{entry}_{i}.png"
        if SAVE:
            plt.savefig(filename, dpi=300, bbox_inches='tight')  # Save figure
       # print(f"Saved: {filename}")

    #plt.show()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
idx=2
embed=df_with_graph['sentence_embeddings'][idx]
simp_tre=df_with_graph['rt_rips'][idx]
rt=df_with_graph['rt'][idx]
visualize_rips_simplicial_complex(simp_tre, embed, 'x', 1, max_edge_length=rt,SAVE=False)


In [8]:

            
            # Assuming df_monolog is your DataFrame and data_save_dir, df_name, window, and step are defined.
            # For each dimension (2, 3, 4) we explode the corresponding columns and then group by Drug and filtration values.

            for D in [2, 3, 4]:
                # Explode the lists in the columns for the current dimension.
                df_exploded = df_monolog.explode([f"simplex_time_dim{D}_filtration", f"simplex_time_dim{D}_count"])

                # Convert the exploded columns to numeric.
                df_exploded[f"simplex_time_dim{D}_filtration"] = pd.to_numeric(df_exploded[f"simplex_time_dim{D}_filtration"])
                df_exploded[f"simplex_time_dim{D}_count"] = pd.to_numeric(df_exploded[f"simplex_time_dim{D}_count"])

                # Group by "Drug" and the filtration values, and compute the mean and standard error for the counts.
                grouped = df_exploded.groupby(["Drug", f"simplex_time_dim{D}_filtration"], as_index=False).agg(
                    alive_mean=(f"simplex_time_dim{D}_count", "mean"),
                    alive_se=(f"simplex_time_dim{D}_count", sem)  # standard error
                )
                if save:
                    df_exploded.to_csv(data_save_dir + f'{df_name}_{window}_{step}_{D}_skeleton_simplices_over_time.csv', index=False)


                if plot:
                    import matplotlib.pyplot as plt
                    # Create a plot for the current dimension.
                    fig, ax = plt.subplots(figsize=(8, 6))

                    # Iterate over each drug group and plot mean ± SE.
                    for drug_level, df_sub in grouped.groupby("Drug"):
                        ax.errorbar(
                            df_sub[f"simplex_time_dim{D}_filtration"],
                            df_sub["alive_mean"],
                            yerr=df_sub["alive_se"],
                            label=f"Drug={drug_level}",
                            marker='o',
                            capsize=3
                        )

                    ax.set_xlabel("Filtration Value (Distance Threshold)")
                    ax.set_ylabel("Number of Alive Components (Mean ± SE)")
                    ax.set_title(f"Dimension {D} Alive Components Over Filtration Value by Drug")
                    ax.legend()
                    plt.show()

            for D in [0,1,2]:
                df_exploded = df_monolog.explode([f"scales_dim{D}", f'alive_dim{D}'])
                df_exploded[f"scales_dim{D}"] = pd.to_numeric(df_exploded[f"scales_dim{D}"])
                df_exploded[f'alive_dim{D}'] = pd.to_numeric(df_exploded[f'alive_dim{D}'])
                grouped = df_exploded.groupby(["Drug", f"scales_dim{D}"], as_index=False).agg(
                    alive_mean=(f'alive_dim{D}', "mean"),
                    alive_se=(f'alive_dim{D}', sem)  # standard error
                )
                df_exploded.to_csv(data_save_dir+f'{df_name}_{window}_{step}_{D}_simplices_over_time.csv')
                if plot:

                    fig, ax = plt.subplots(figsize=(8,6))

                    # We'll iterate over each drug and plot mean ± SE
                    for drug_level, df_sub in grouped.groupby("Drug"):
                        ax.errorbar(
                            df_sub[f"scales_dim{D}"], 
                            df_sub["alive_mean"], 
                            yerr=df_sub["alive_se"], 
                            label=f"Drug={drug_level}",
                            marker='o',
                            capsize=3
                        )

                    ax.set_xlabel("Scale (distance threshold)")
                    ax.set_ylabel("Number of Alive Components (Mean ± SE)")
                    ax.set_title("Connected Components Over Scale by Drug")
                    ax.legend()
                    plt.show()
            

            dimensions = [0, 1, 2]

            # We'll accumulate new rows in a list of dicts
            new_rows = []

            for idx, row in df_with_graph.iterrows():
                embed = row[embeddings]  # Adjust as needed
                # We’ll store births, deaths, pers LENGTHS in a dict keyed by dimension
                dim_dict = {
                    dim: {'births': [], 'deaths': [], 'pers': []}
                    for dim in dimensions
                }

                
                    
                # Build the Rips Complex for *this row only*
                rips_complex = gd.RipsComplex(points=embed, max_edge_length=5.0)
                simplex_tree = rips_complex.create_simplex_tree(max_dimension=3)
                persistence = simplex_tree.persistence()

                # Collect intervals by dimension
                for dim, (b, d) in persistence:
                    if d == float('inf'):
                        continue
                    if dim in dimensions:
                        dim_dict[dim]['births'].append(b)
                        dim_dict[dim]['deaths'].append(d)
                        dim_dict[dim]['pers'].append(d - b)
                        
                        
                row_dict = row.to_dict()  # Start with original row's columns

                for dim in dimensions:
                    bdp = dim_dict[dim]
                    stats_dict = compute_distribution_stats(bdp['births'], bdp['deaths'], bdp['pers'])
                    # prefix each stat key with dim
                    for stat_key, stat_val in stats_dict.items():
                        row_dict[f"{stat_key}_dim{dim}"] = stat_val

                # Add row_dict to new_rows
                new_rows.append(row_dict)

            # Create a new DataFrame
            df_with_tda = pd.DataFrame(new_rows)
            df_with_tda.to_csv(data_save_dir + f'{df_name}_{window}_{step}_TDA_results.csv')
            print(f'completed! {df_name} window: {window} step size: {step}')

performing TDA on  SER_IPSP  window:  120 step:  24
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


KeyError: 'simplex_time_dim2_filtration'