# Imports

In [4]:
import os
import glob
import math
import numpy as np
import pandas as pd
import ordpy
from scipy.io import wavfile
from tqdm import tqdm
import librosa
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import logging
import warnings
from scipy.io.wavfile import WavFileWarning
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
# Import shapely modules for geometry operations
import shapely.geometry as geometry
from shapely.ops import polygonize, unary_union



# Logger


In [5]:
logger = logging.getLogger("audio_process")
logger.setLevel(logging.DEBUG)

# Clear existing handlers to avoid duplicates
if logger.hasHandlers():
    logger.handlers.clear()

# File Handler: Write all messages (DEBUG and above) to a log file.
fh = logging.FileHandler("process.log", mode='w')
fh.setLevel(logging.DEBUG)
fh_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh.setFormatter(fh_formatter)
logger.addHandler(fh)

# Console Handler: Only show INFO and above on the console.
ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch_formatter = logging.Formatter('%(message)s')
ch.setFormatter(ch_formatter)
logger.addHandler(ch)

# Ignore metadata from scipy.wavfile
warnings.filterwarnings("ignore", category=WavFileWarning)

# Fractal dimension coutning 

In [9]:
# 1. Create overlapping windows (z-vectors)
def create_z_vectors(file_path, window_size, hop_size):
    """
    Reads an audio file (WAV or MP3), converts it to mono if needed, and creates
    overlapping windows (each of length window_size) using the given hop_size.
    For MP3 files, librosa is used; for WAV files, scipy.io.wavfile is used.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.mp3':
        # Load mp3 using librosa.
        # sr=None preserves the native sampling rate.
        data, sr = librosa.load(file_path, sr=None, mono=True)
        logger.info(f"Processing MP3 '{file_path}' with sample rate: {sr}")
    elif ext == '.wav':
        sr, data = wavfile.read(file_path)
        logger.info(f"Processing WAV '{file_path}' with sample rate: {sr}")
        # If stereo, convert to mono by averaging channels.
        if data.ndim == 2:
            data = data.mean(axis=1)
    else:
        logger.error(f"Unsupported file extension: {ext}")
        raise ValueError(f"Unsupported file extension: {ext}")

    num_samples = len(data)
    vectors = []
    if hop_size > 0:
        num_windows = (num_samples - window_size) // hop_size
        for i in range(num_windows):
            start = i * hop_size
            end = start + window_size
            vectors.append(data[start:end])
    else:
        num_windows = num_samples // window_size
        for i in range(num_windows):
            start = i * window_size
            end = start + window_size
            vectors.append(data[start:end])
    if num_windows < 1:
        logger.error("Audio too short for the given window/hop parameters.")
        raise ValueError("Audio too short for the given window/hop parameters.")
    return np.array(vectors)


# 2. Compute ordinal patterns (ranking)
def rank_vector(z):
    """
    Returns the ordinal ranking of the values in vector z.
    (Smallest value gets rank 1, next smallest 2, etc.)
    """
    return np.argsort(z).argsort() + 1

# 3. Box counting 
def box_count_6d(data, subdivisions):
    """
    Traditional box-counting in 6D.
    
    Parameters:
      - data: a NumPy array of shape (n_points, 6)
      - subdivisions: an integer; each dimension will be divided into this many equal parts.
    
    Returns:
      - N: the number of non-empty boxes for the given subdivision.
    """
    # Determine min and max along each dimension
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    
    # Compute the range (length) in each dimension
    ranges = maxs - mins
    
    # Compute the box size for each dimension (note: each dimension may have a different box size,
    # but when we plot using epsilon we will use the largest range (scale) for consistency).
    box_sizes = ranges / subdivisions
    
    # For each dimension, assign points to a box index:
    indices = []
    for d in range(6):
        # Compute the index along dimension d.
        idx_d = np.floor((data[:, d] - mins[d]) / box_sizes[d]).astype(int)
        # Handle edge cases: ensure that a point at the maximum falls into the last box.
        idx_d = np.clip(idx_d, 0, subdivisions - 1)
        indices.append(idx_d)
    
    # Combine indices from all dimensions into one array of shape (n_points, 6)
    indices = np.stack(indices, axis=-1)
    
    # Count the number of unique boxes that are occupied
    unique_boxes = np.unique(indices, axis=0)
    return len(unique_boxes)

def estimate_boxcount_dimension(data, subdivisions_list):
    """
    Estimates the fractal (box-counting) dimension using the traditional method.
    
    Parameters:
      - data: (n_points, 6) array representing your 6D dataset.
      - subdivisions_list: a list of integers, each representing how many divisions per dimension.
    
    Returns:
      - fractal_dim: The estimated box-counting dimension (D)
      - (logs_eps, logs_counts): The (x, y) arrays for plotting, where:
            logs_eps   = log2(epsilon)  with epsilon = scale/subdivisions,
            logs_counts = log2(N), with N the number of occupied boxes.
    """
    counts = []
    # Compute overall scale (we use the maximum range across dimensions)
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    ranges = maxs - mins
    scale = np.max(ranges)
    
    epsilons = []
    for s in subdivisions_list:
        N = box_count_6d(data, s)
        counts.append(N)
        # Define epsilon as the box size along the dimension with maximum spread
        eps = scale / s
        epsilons.append(eps)
    
    counts = np.array(counts)
    epsilons = np.array(epsilons)
    
    # Compute logarithms (base 2)
    logs_eps = np.log2(epsilons)
    logs_counts = np.log2(counts)
    
    # Perform linear regression using least squares:
    # log2(N) = -D * log2(epsilon) + constant
    slope, intercept = np.polyfit(logs_eps, logs_counts, 1)
    fractal_dim = -slope  # because slope = -D
    return fractal_dim, (logs_eps, logs_counts)




# Fractal dim estimation via code stolen from google

In [75]:
def fractal_dim_estim(file_path):    
# -------------------------------
    # 1. Load your time series data.
    #    Here we assume the time series is stored in a file "timeseries.txt"
    #    with one data point per row.
    #    Replace this with however you load your data.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.mp3':
        # Load mp3 using librosa.
        # sr=None preserves the native sampling rate.
        data, sr = librosa.load(file_path, sr=None, mono=True)
        logger.info(f"Processing MP3 '{file_path}' with sample rate: {sr}")
    elif ext == '.wav':
        sr, data = wavfile.read(file_path)
        logger.info(f"Processing WAV '{file_path}' with sample rate: {sr}")
        # If stereo, convert to mono by averaging channels.
        if data.ndim == 2:
            data = data.mean(axis=1)
    else:
        logger.error(f"Unsupported file extension: {ext}")
        raise ValueError(f"Unsupported file extension: {ext}")

    # -------------------------------
    # 2. Time-delay embedding to form a 6-dimensional dataset.
    #    - 'dim' is the embedding dimension (here 6)
    #    - 'gap' is the delay between successive components (here 5)
    dim = 6
    gap = 5

    # Compute the number of valid 6D vectors (we lose (dim-1)*gap points at the end)
    N = len(data) - (dim - 1) * gap

    # Create an array to hold the embedded data.
    embedded_data = np.empty((N, dim))
    for i in range(dim):
        embedded_data[:, i] = data[i * gap : i * gap + N]

    # -------------------------------
    # 3. Prepare for box-counting in 6D.
    #    We need to create a grid that covers the range of our data in each dimension.
    mins = np.min(embedded_data, axis=0)
    maxs = np.max(embedded_data, axis=0)

    # We choose a set of box sizes (scales) in a logarithmic range.
    # For a good range, we base the limits on the overall data range.
    # (Adjust these limits if necessary for your data.)
    range_per_dim = maxs - mins
    range_min = range_per_dim.min()
    range_max = range_per_dim.max()
    scales = np.logspace(np.log10(range_min/10), np.log10(range_max), num=50)

    # -------------------------------
    # 4. Compute the box count (number of non-empty boxes) for each scale.
    Ns = []
    for scale in scales:
        # For each dimension, create bins from the minimum value up to (max+scale)
        bins = [np.arange(mins[d], maxs[d] + scale, scale) for d in range(dim)]
        # Use np.histogramdd to count points in each box (hypercube)
        H, _ = np.histogramdd(embedded_data, bins=bins)
        # Count only boxes that contain at least one point
        Ns.append(np.sum(H > 0))

    # -------------------------------
    # 5. Fit a line to the log-log plot to estimate the fractal (Hausdorff) dimension.
    #    The slope (with a negative sign) gives the estimated dimension.
    coeffs = np.polyfit(np.log(scales), np.log(Ns), 1)
    estimated_dimension = -coeffs[0]

    # -------------------------------
    # 6. Plot and save the results.
    # plt.figure()
    # plt.plot(np.log(scales), np.log(Ns), 'o', mfc='none', label='Data')
    # plt.plot(np.log(scales), np.polyval(coeffs, np.log(scales)), label='Fit')
    # plt.xlabel('log(epsilon)')
    # plt.ylabel('log N')
    # plt.title('Fractal Dimension Estimation (6D)')
    # plt.legend()
    # plt.savefig('fractal_dimension_6d.pdf')

    # print("Estimated fractal (Hausdorff) dimension:", estimated_dimension)
    # Save the scaling data (scale and number of non-empty boxes)
    np.savetxt("scaling_6d.txt", np.column_stack((scales, Ns)))
    return estimated_dimension


In [79]:
mother_folder_path = "data/genres_30sec"
fractal_dim_array = list()

# Loop over each item in the mother folder
for subfolder in os.listdir(mother_folder_path):
    subfolder_path = os.path.join(mother_folder_path, subfolder)
    print(f"\nProcessing folder: {subfolder_path}")
    
    # Check if the item is a directory (subfolder)
    if os.path.isdir(subfolder_path):
        # Get all files ending with '.wav' or '.mp3' (case insensitive)
        files = [filename for filename in os.listdir(subfolder_path) 
                 if filename.lower().endswith('.wav') or filename.lower().endswith('.mp3')]
        
        # Process files with a progress bar
        for filename in tqdm(files, desc=f"Processing files in {subfolder}", unit="file"):
            file_path = os.path.join(subfolder_path, filename)
            # Calculate the fractal dimension for the file
            dim = fractal_dim_estim(file_path)
            # Append the result to the array
            fractal_dim_array.append(dim)
    
        # Convert the list to a numpy array and print statistics for this subfolder
        fractal_dim_array_np = np.array(fractal_dim_array)
        print("Folder:", subfolder_path)
        print("min: ", np.min(fractal_dim_array_np))
        print("max: ", np.max(fractal_dim_array_np))
        print("avg: ", np.mean(fractal_dim_array))
        print("std: ", np.std(fractal_dim_array))


Processing folder: data/genres_30sec/pop


Processing files in pop: 100%|██████████| 100/100 [06:54<00:00,  4.14s/file]


Folder: data/genres_30sec/pop
min:  2.4582380274790774
max:  4.049414634424502
avg:  3.3379400598742377
std:  0.30080756440995476

Processing folder: data/genres_30sec/metal


Processing files in metal: 100%|██████████| 100/100 [06:54<00:00,  4.15s/file]


Folder: data/genres_30sec/metal
min:  2.3927371136537148
max:  4.049414634424502
avg:  3.2860366126986413
std:  0.28511830116573644

Processing folder: data/genres_30sec/disco


Processing files in disco: 100%|██████████| 100/100 [06:50<00:00,  4.10s/file]


Folder: data/genres_30sec/disco
min:  2.373433533166584
max:  4.049414634424502
avg:  3.252237216543888
std:  0.2935950601342884

Processing folder: data/genres_30sec/blues


Processing files in blues: 100%|██████████| 100/100 [06:48<00:00,  4.08s/file]


Folder: data/genres_30sec/blues
min:  1.8729840718765782
max:  4.049414634424502
avg:  3.2298953865509583
std:  0.34876006784294916

Processing folder: data/genres_30sec/reggae


Processing files in reggae: 100%|██████████| 100/100 [06:46<00:00,  4.06s/file]


Folder: data/genres_30sec/reggae
min:  1.8729840718765782
max:  4.049414634424502
avg:  3.179769932451691
std:  0.3664805830759922

Processing folder: data/genres_30sec/classical


Processing files in classical: 100%|██████████| 100/100 [06:44<00:00,  4.05s/file]


Folder: data/genres_30sec/classical
min:  1.8729840718765782
max:  4.049414634424502
avg:  3.1636761060942833
std:  0.36147629926838937

Processing folder: data/genres_30sec/rock


Processing files in rock: 100%|██████████| 100/100 [06:55<00:00,  4.16s/file]


Folder: data/genres_30sec/rock
min:  1.8729840718765782
max:  4.0831228709252505
avg:  3.166760863943895
std:  0.35771608275421685

Processing folder: data/genres_30sec/hiphop


Processing files in hiphop: 100%|██████████| 100/100 [06:53<00:00,  4.14s/file]


Folder: data/genres_30sec/hiphop
min:  1.7940188357565476
max:  4.0831228709252505
avg:  3.1572033803766213
std:  0.35909546105860596

Processing folder: data/genres_30sec/country


Processing files in country: 100%|██████████| 100/100 [06:46<00:00,  4.07s/file]


Folder: data/genres_30sec/country
min:  1.7940188357565476
max:  4.0831228709252505
avg:  3.1657610769983386
std:  0.3511872431729461

Processing folder: data/genres_30sec/jazz


Processing files in jazz:  44%|████▍     | 44/100 [02:58<03:47,  4.06s/file]


ValueError: File format b'\xcb\x15\x1e\x16' not understood. Only 'RIFF', 'RIFX', and 'RF64' supported.

# Create entropy complexity with ordpy


In [120]:
def ordpy_process_file(file_path, dim_size, hop_size):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.mp3':
        # Load mp3 using librosa.
        # sr=None preserves the native sampling rate.
        data, sr = librosa.load(file_path, sr=None, mono=True)
        logger.info(f"Processing MP3 '{file_path}' with sample rate: {sr}")
    elif ext == '.wav':
        sr, data = wavfile.read(file_path)
        logger.info(f"Processing WAV '{file_path}' with sample rate: {sr}")
        # If stereo, convert to mono by averaging channels.
        if data.ndim == 2:
            data = data.mean(axis=1)
    else:
        logger.error(f"Unsupported file extension: {ext}")
        raise ValueError(f"Unsupported file extension: {ext}")
    
    
    HC = ordpy.complexity_entropy(data, dim_size, hop_size)
    return HC


def ordpy_process_folder(folder_path, dim_size=6, hop_size=1):
    wav_files = glob.glob(os.path.join(folder_path, "*.wav"))
    mp3_files = glob.glob(os.path.join(folder_path, "*.mp3"))
    all_files = wav_files + mp3_files
    
    if not all_files:
        logger.info(f"No audio files found in {folder_path}")
        return [], [], []
    
    all_H = []
    all_comp = []
    file_labels = []
    
    # Use tqdm to add a progress bar over the file list.
    for file_path in tqdm(all_files, desc=f"Processing audio files in {folder_path} with dim {dim_size} and hop {hop_size}"):
        try:
            H_norm, comp = ordpy_process_file(file_path, dim_size, hop_size)
            all_H.append(H_norm)
            all_comp.append(comp)
            file_labels.append(os.path.basename(file_path))
        except Exception as e:
            logger.exception(f"Error processing file {file_path}: {e}")
    
    return all_H, all_comp, file_labels

# Scatter with Entropy Complexity for one folder

In [121]:

def plot_graph_ordpy(folder_path, dim, hop, folder="plots"):
    logger.info(f"Processing folder {folder_path}")
    
    # Process the folder to get entropy, complexity, and file labels.
    all_H, all_comp, file_labels = ordpy_process_folder(folder_path, dim, hop)
    df = pd.DataFrame({
        "Normalized Permutation Entropy": all_H,
        "Normalized Complexity": all_comp,
        "File": file_labels
    })

    # Get the maximum and minimum complexity-entropy boundaries as numpy arrays
    max_HC = ordpy.maximum_complexity_entropy(dim, hop)
    min_HC = ordpy.minimum_complexity_entropy(dim, hop)

    # -------------------------
    # Matplotlib Plot Section
    # -------------------------
    plt.figure(figsize=(8, 6))
    # Plot the data points without text labels.
    plt.scatter(df["Normalized Permutation Entropy"],
                df["Normalized Complexity"],
                s=70,
                c='blue',
                edgecolors='black')

    # Plot the maximum and minimum complexity boundaries if available.
    if isinstance(max_HC, np.ndarray) and max_HC.ndim == 2 and max_HC.shape[1] == 2:
        plt.plot(max_HC[:, 0], max_HC[:, 1], 'r--', label='Maximum Complexity Boundary')
    else:
        logger.warning("max_HC is not in the expected numpy array format.")

    if isinstance(min_HC, np.ndarray) and min_HC.ndim == 2 and min_HC.shape[1] == 2:
        plt.plot(min_HC[:, 0], min_HC[:, 1], 'g--', label='Minimum Complexity Boundary')
    else:
        logger.warning("min_HC is not in the expected numpy array format.")

    plt.xlabel("Normalized Permutation Entropy")
    plt.ylabel("Normalized Complexity")
    plt.title(f"Entropy–Complexity for {folder_path}\nDim = '{dim}', Hop = '{hop}'")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.legend()
    plt.tight_layout()

    output_dir = os.path.join(folder, os.path.basename(folder_path))
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"entropy_complexity_{os.path.basename(folder_path)}_dim_{dim}_hop_{hop}.png")
    plt.savefig(output_file)
    plt.close()
    logger.info(f"Saved Matplotlib plot to {output_file}")

    # -------------------------
    # Plotly Plot Section
    # -------------------------
    fig = go.Figure()

    # Add the data points as markers only. The file labels will appear when hovering.
    fig.add_trace(go.Scatter(
        x=df["Normalized Permutation Entropy"],
        y=df["Normalized Complexity"],
        mode='markers',  # Only markers, no text labels on the plot.
        marker=dict(size=10, color='blue', line=dict(width=1, color='black')),
        name='Data Points',
        hovertext=df["File"],
        hovertemplate=(
            "<b>File:</b> %{hovertext}<br>" +
            "<b>Entropy:</b> %{x}<br>" +
            "<b>Complexity:</b> %{y}<extra></extra>"
        )
    ))

    # Add the maximum complexity boundary line if available.
    if isinstance(max_HC, np.ndarray) and max_HC.ndim == 2 and max_HC.shape[1] == 2:
        fig.add_trace(go.Scatter(
            x=max_HC[:, 0],
            y=max_HC[:, 1],
            mode='lines',
            line=dict(color='red', dash='dash'),
            name='Maximum Complexity Boundary'
        ))
    else:
        logger.warning("max_HC is not in the expected numpy array format.")

    # Add the minimum complexity boundary line if available.
    if isinstance(min_HC, np.ndarray) and min_HC.ndim == 2 and min_HC.shape[1] == 2:
        fig.add_trace(go.Scatter(
            x=min_HC[:, 0],
            y=min_HC[:, 1],
            mode='lines',
            line=dict(color='green', dash='dash'),
            name='Minimum Complexity Boundary'
        ))
    else:
        logger.warning("min_HC is not in the expected numpy array format.")

    # Update layout settings for the Plotly figure.
    fig.update_layout(
        title=f"Entropy–Complexity for {folder_path} (Plotly)\nDim = '{dim}', Hop = '{hop}'",
        xaxis_title="Normalized Permutation Entropy",
        yaxis_title="Normalized Complexity",
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        width=800,
        height=600,
        template="plotly_white"
    )


    fig.show()

# Centroids for multiple folders

In [122]:
def plot_graph_ordpy_centroids(main_folder_path, dim, hop):
    centroids = []

    # Iterate over all items in the main folder
    for item in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, item)
        if os.path.isdir(subfolder_path):
            logger.info(f"Processing folder: {subfolder_path}")
            try:
                # Process the folder and obtain entropy, complexity, and file labels.
                all_H, all_comp, file_labels = ordpy_process_folder(subfolder_path, dim, hop)
                
                # Skip folders with no valid data
                if len(all_H) == 0 or len(all_comp) == 0:
                    logger.warning(f"No data found in {subfolder_path}. Skipping.")
                    continue

                # Compute centroid (mean entropy and complexity)
                centroid_H = np.mean(all_H)
                centroid_comp = np.mean(all_comp)

                centroids.append({
                    "Folder": item,  # use subfolder name as identifier
                    "Normalized Permutation Entropy": centroid_H,
                    "Normalized Complexity": centroid_comp
                })
            except Exception as e:
                logger.error(f"Error processing {subfolder_path}: {e}")
                continue

    # If no centroids were computed, log an error and exit.
    if not centroids:
        logger.error("No centroids were computed. Please check your data and folder structure.")
        return

    # Create a DataFrame from the centroids list
    df_centroids = pd.DataFrame(centroids)
    logger.info(f"Centroids DataFrame:\n{df_centroids}")

    # Retrieve the maximum and minimum complexity–entropy boundaries.
    # They are expected to be NumPy arrays with shape (n_points, 2)
    max_HC = ordpy.maximum_complexity_entropy(dim, hop)
    min_HC = ordpy.minimum_complexity_entropy(dim, hop)

    # Create a Plotly figure
    fig = go.Figure()

    # Add the centroid points.
    # If you expect only one centroid per folder, grouping by "Folder" is acceptable.
    # Each folder will be assigned a unique color.
    for folder, df_group in df_centroids.groupby("Folder"):
        fig.add_trace(go.Scatter(
            x=df_group["Normalized Permutation Entropy"],
            y=df_group["Normalized Complexity"],
            mode='markers',
            marker=dict(size=12, line=dict(width=1, color='black')),
            name=folder,
            hovertemplate=(
                "<b>Folder:</b> " + folder + "<br>" +
                "<b>Entropy:</b> %{x}<br>" +
                "<b>Complexity:</b> %{y}<extra></extra>"
            )
        ))

    # Add the maximum complexity boundary as a red dashed line if available.
    if isinstance(max_HC, np.ndarray) and max_HC.ndim == 2 and max_HC.shape[1] == 2:
        fig.add_trace(go.Scatter(
            x=max_HC[:, 0],
            y=max_HC[:, 1],
            mode='lines',
            line=dict(color='red', dash='dash'),
            name='Maximum Complexity Boundary'
        ))
    else:
        logger.warning("max_HC is not in the expected numpy array format.")

    # Add the minimum complexity boundary as a green dashed line if available.
    if isinstance(min_HC, np.ndarray) and min_HC.ndim == 2 and min_HC.shape[1] == 2:
        fig.add_trace(go.Scatter(
            x=min_HC[:, 0],
            y=min_HC[:, 1],
            mode='lines',
            line=dict(color='green', dash='dash'),
            name='Minimum Complexity Boundary'
        ))
    else:
        logger.warning("min_HC is not in the expected numpy array format.")

    # Update layout settings for the figure
    fig.update_layout(
        title=f"Centroids of Entropy–Complexity for Folders in '{os.path.basename(main_folder_path)}' (Dim={dim}, Hop={hop})",
        xaxis_title="Normalized Permutation Entropy",
        yaxis_title="Normalized Complexity",
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        template="plotly_white",
        width=800,
        height=600
    )

    # Show the Plotly figure
    fig.show()


    

# Centroids with regions for multiple folders

In [None]:

def add_opacity_to_color(color_string, opacity):
    """
    Converts a color string from hex (e.g. "#1f77b4") or rgb(...) format to an RGBA string with given opacity.
    """
    # If color is in hex format, convert it to RGB first.
    if color_string.startswith("#"):
        color_string = color_string.lstrip("#")
        lv = len(color_string)
        r, g, b = tuple(int(color_string[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))
        return f"rgba({r}, {g}, {b}, {opacity})"
    # If color is in rgb(...) format, just adjust the opacity.
    if color_string.startswith("rgb("):
        rgb_vals = color_string[4:-1]
        r, g, b = [int(v.strip()) for v in rgb_vals.split(",")]
        return f"rgba({r}, {g}, {b}, {opacity})"
    return color_string

def alpha_shape(points, alpha=0.1):
    """
    Compute the alpha shape (concave hull) of a set of points.
    
    Parameters:
        points (np.ndarray): An array of shape (n_points, 2).
        alpha (float): Alpha value to influence the concavity. Smaller values result in a more detailed (less smooth)
                       shape. Adjust this parameter to suit your data.
    
    Returns:
        shapely.geometry.Polygon: The resulting concave hull as a shapely Polygon.
                                  If not enough points are provided, returns the convex hull.
    """
    if len(points) < 4:
        return geometry.MultiPoint(list(points)).convex_hull

    # Perform Delaunay triangulation on the point set.
    tri = Delaunay(points)
    edges = []
    for ia, ib, ic in tri.simplices:
        pa = points[ia]
        pb = points[ib]
        pc = points[ic]
        # Compute lengths of triangle sides.
        a = np.linalg.norm(pa - pb)
        b = np.linalg.norm(pb - pc)
        c = np.linalg.norm(pc - pa)
        s = (a + b + c) / 2.0
        # Compute triangle area via Heron’s formula.
        area = math.sqrt(s * (s - a) * (s - b) * (s - c))
        # Avoid division by zero.
        if area == 0:
            circum_r = np.inf
        else:
            circum_r = a * b * c / (4.0 * area)
        # If the circumradius is below a threshold, include the triangle’s edges.
        if circum_r < 1.0 / alpha:
            edges.append((ia, ib))
            edges.append((ib, ic))
            edges.append((ic, ia))
    
    # Build a set of line segments from the edges.
    edge_segments = [(points[i], points[j]) for i, j in edges]
    m = geometry.MultiLineString(edge_segments)
    # Polygonize the edge segments to form candidate polygons.
    triangles = list(polygonize(m))
    # Merge the triangles into a single (possibly concave) polygon.
    concave_hull = unary_union(triangles)
    return concave_hull

def plot_graph_ordpy_centroids_with_region(main_folder_path: str, dim: int, hop: int, alpha_param: float) -> None:
    """
    Process each subfolder in the main folder, compute the centroid (mean) of 
    normalized permutation entropy and complexity for each folder, and plot them 
    as a filled region (using an alpha shape / concave hull that follows the data more closely)
    along with the centroid marker. Different folders are assigned different colors.
    The maximum and minimum complexity–entropy boundaries are also plotted.
    
    Parameters:
        main_folder_path (str): Path to the folder containing subfolders.
        dim (int): Embedding dimension for ordpy.
        hop (int): Delay (or hop) parameter for ordpy.
        alpha_param (float): Alpha parameter to influence the concavity. Smaller values result in a more detailed (less smooth)
                             shape. Adjust this parameter to suit your data.
    """
    # Dictionary to store per-folder data.
    folder_data = {}

    # Iterate over all items (subfolders) in the main folder.
    for item in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, item)
        if os.path.isdir(subfolder_path):
            logger.info(f"Processing folder: {subfolder_path}")
            try:
                # Process the folder and obtain entropy, complexity, and file labels.
                all_H, all_comp, file_labels = ordpy_process_folder(subfolder_path, dim, hop)
                
                # Skip folders with no valid data.
                if len(all_H) == 0 or len(all_comp) == 0:
                    logger.warning(f"No data found in {subfolder_path}. Skipping.")
                    continue

                # Compute centroid (mean entropy and complexity).
                centroid_H = np.mean(all_H)
                centroid_comp = np.mean(all_comp)

                # Save data points and the centroid.
                folder_data[item] = {
                    "H": np.array(all_H),
                    "comp": np.array(all_comp),
                    "centroid_H": centroid_H,
                    "centroid_comp": centroid_comp
                }
            except Exception as e:
                logger.error(f"Error processing {subfolder_path}: {e}")
                continue

    # If no data was collected, exit.
    if not folder_data:
        logger.error("No centroids were computed. Please check your data and folder structure.")
        return

    # Retrieve the maximum and minimum complexity–entropy boundaries.
    # They are expected to be NumPy arrays with shape (n_points, 2)
    max_HC = ordpy.maximum_complexity_entropy(dim, hop)
    min_HC = ordpy.minimum_complexity_entropy(dim, hop)

    # Get a color palette (using Plotly's qualitative palette).
    colors = px.colors.qualitative.Plotly
    color_map = {}
    folder_list = sorted(list(folder_data.keys()))
    for i, folder in enumerate(folder_list):
        color_map[folder] = colors[i % len(colors)]

    # Create a Plotly figure.
    fig = go.Figure()

    # For each folder, add a filled region (using the alpha shape) and the centroid marker.
    for folder, data in folder_data.items():
        points = np.column_stack((data["H"], data["comp"]))
        color = color_map[folder]
        # Create a low-opacity fill color.
        fill_color = add_opacity_to_color(color, 0.3)
        
        try:
            # Compute the alpha shape (concave hull) for the points.
            concave_hull = alpha_shape(points, alpha_param)
            # If a valid polygon is returned, extract its exterior coordinates.
            if concave_hull.geom_type == 'Polygon':
                hull_coords = np.array(concave_hull.exterior.coords)
                fig.add_trace(go.Scatter(
                    x=hull_coords[:, 0],
                    y=hull_coords[:, 1],
                    mode='lines',
                    fill='toself',
                    fillcolor=fill_color,
                    line=dict(color=add_opacity_to_color(color, 0.3), width=2),
                    name=f"{folder} region",
                    showlegend=True,
                    hoverinfo='skip'
                ))
            else:
                # Fallback: if not a polygon, plot the points as low-opacity markers.
                fig.add_trace(go.Scatter(
                    x=data["H"],
                    y=data["comp"],
                    mode='markers',
                    marker=dict(color=color, opacity=0.3),
                    name=f"{folder} region",
                    showlegend=True,
                    hoverinfo='skip'
                ))
        except Exception as e:
            logger.error(f"Could not compute alpha shape for folder {folder}: {e}")
            # Fallback: simply scatter the points with low opacity.
            fig.add_trace(go.Scatter(
                x=data["H"],
                y=data["comp"],
                mode='markers',
                marker=dict(color=color, opacity=0.3),
                name=f"{folder} region",
                showlegend=True,
                hoverinfo='skip'
            ))
            
        # Add the centroid as a marker (using the same color).
        fig.add_trace(go.Scatter(
            x=[data["centroid_H"]],
            y=[data["centroid_comp"]],
            mode='markers',
            marker=dict(color=color, size=12, symbol='circle'),
            name=f"{folder} centroid",
            hovertemplate=(
                f"<b>Folder:</b> {folder}<br>" +
                "<b>Centroid Entropy:</b> %{x}<br>" +
                "<b>Centroid Complexity:</b> %{y}<extra></extra>"
            )
        ))

    # Add the maximum complexity boundary as a red dashed line if available.
    if isinstance(max_HC, np.ndarray) and max_HC.ndim == 2 and max_HC.shape[1] == 2:
        fig.add_trace(go.Scatter(
            x=max_HC[:, 0],
            y=max_HC[:, 1],
            mode='lines',
            line=dict(color='red', dash='dash'),
            name='Maximum Complexity Boundary'
        ))
    else:
        logger.warning("max_HC is not in the expected numpy array format.")

    # Add the minimum complexity boundary as a green dashed line if available.
    if isinstance(min_HC, np.ndarray) and min_HC.ndim == 2 and min_HC.shape[1] == 2:
        fig.add_trace(go.Scatter(
            x=min_HC[:, 0],
            y=min_HC[:, 1],
            mode='lines',
            line=dict(color='green', dash='dash'),
            name='Minimum Complexity Boundary'
        ))
    else:
        logger.warning("min_HC is not in the expected numpy array format.")

    # Update layout settings for the figure.
    fig.update_layout(
        title=f"Entropy–Complexity Regions for Folders in '{os.path.basename(main_folder_path)}' (Dim={dim}, Hop={hop})",
        xaxis_title="Normalized Permutation Entropy",
        yaxis_title="Normalized Complexity",
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        template="plotly_white",
        width=800,
        height=600
    )

    # Show the Plotly figure.
    fig.show()

# Plots

# Genres

In [124]:
folder_path = "data/genres_30sec/blues"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/genres_30sec/blues with dim 6 and hop 1: 100%|██████████| 100/100 [02:13<00:00,  1.34s/it]


In [125]:
folder_path = "data/genres_30sec"
dim = 6
hop = 1
alpha_val = 7
plot_graph_ordpy_centroids_with_region(folder_path, dim, hop, alpha_val)

Processing audio files in data/genres_30sec/pop with dim 6 and hop 1: 100%|██████████| 100/100 [02:11<00:00,  1.31s/it]
Processing audio files in data/genres_30sec/metal with dim 6 and hop 1: 100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
Processing audio files in data/genres_30sec/disco with dim 6 and hop 1: 100%|██████████| 100/100 [02:11<00:00,  1.32s/it]
Processing audio files in data/genres_30sec/blues with dim 6 and hop 1: 100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
Processing audio files in data/genres_30sec/reggae with dim 6 and hop 1: 100%|██████████| 100/100 [17:51<00:00, 10.71s/it]  
Processing audio files in data/genres_30sec/classical with dim 6 and hop 1: 100%|██████████| 100/100 [18:18<00:00, 10.99s/it]  
Processing audio files in data/genres_30sec/rock with dim 6 and hop 1: 100%|██████████| 100/100 [18:59<00:00, 11.40s/it]  
Processing audio files in data/genres_30sec/hiphop with dim 6 and hop 1: 100%|██████████| 100/100 [02:08<00:00,  1.28s/it]
Processing audi

# Guitar chords

In [126]:
folder_path = "data/guitar_chords"
dim = 6
hop = 1
alpha_val = 7
plot_graph_ordpy_centroids_with_region(folder_path, dim, hop, alpha_val)

Processing audio files in data/guitar_chords/Am with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.91it/s]
Processing audio files in data/guitar_chords/Bb with dim 6 and hop 1: 100%|██████████| 40/40 [00:07<00:00,  5.17it/s]
Processing audio files in data/guitar_chords/Em with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.62it/s]
Processing audio files in data/guitar_chords/G with dim 6 and hop 1: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]
Processing audio files in data/guitar_chords/F with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.81it/s]
Processing audio files in data/guitar_chords/Dm with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.76it/s]
Processing audio files in data/guitar_chords/C with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s]
Processing audio files in data/guitar_chords/Bdim with dim 6 and hop 1: 100%|██████████| 40/40 [00:07<00:00,  5.11it/s]


In [129]:
folder_path = "data/guitar_chords/Am"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/Am with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.77it/s]


In [130]:
folder_path = "data/guitar_chords/Bb"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/Bb with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.93it/s]


In [131]:
folder_path = "data/guitar_chords/Bdim"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/Bdim with dim 6 and hop 1: 100%|██████████| 40/40 [00:07<00:00,  5.05it/s]


In [132]:
folder_path = "data/guitar_chords/C"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/C with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s]


In [133]:
folder_path = "data/guitar_chords/Dm"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/Dm with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.67it/s]


In [134]:
folder_path = "data/guitar_chords/Em"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/Em with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.53it/s]


In [135]:
folder_path = "data/guitar_chords/F"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/F with dim 6 and hop 1: 100%|██████████| 40/40 [00:08<00:00,  4.77it/s]


In [136]:
folder_path = "data/guitar_chords/G"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/guitar_chords/G with dim 6 and hop 1: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


# Piano sound analysis chords

In [127]:
folder_path = "data/chords"
dim = 6
hop = 1
alpha_val = 7
plot_graph_ordpy_centroids_with_region(folder_path, dim, hop, alpha_val)

Processing audio files in data/chords/perf4 with dim 6 and hop 1: 100%|██████████| 170/170 [00:14<00:00, 11.85it/s]
Processing audio files in data/chords/perf5 with dim 6 and hop 1: 100%|██████████| 170/170 [00:14<00:00, 11.74it/s]
Processing audio files in data/chords/min6 with dim 6 and hop 1: 100%|██████████| 170/170 [00:14<00:00, 11.72it/s]
Processing audio files in data/chords/dim with dim 6 and hop 1: 100%|██████████| 255/255 [00:21<00:00, 11.68it/s]
Processing audio files in data/chords/min7 with dim 6 and hop 1: 100%|██████████| 340/340 [00:29<00:00, 11.67it/s]
Processing audio files in data/chords/maj7_2 with dim 6 and hop 1: 100%|██████████| 170/170 [00:14<00:00, 11.74it/s]
Processing audio files in data/chords/aug with dim 6 and hop 1: 100%|██████████| 255/255 [00:21<00:00, 11.70it/s]
Processing audio files in data/chords/sixth with dim 6 and hop 1: 100%|██████████| 340/340 [00:29<00:00, 11.69it/s]
Processing audio files in data/chords/maj3 with dim 6 and hop 1: 100%|███████

In [128]:
folder_path = "data/arpeggios"
dim = 6
hop = 1
alpha_val = 7
plot_graph_ordpy_centroids_with_region(folder_path, dim, hop, alpha_val)

Processing audio files in data/arpeggios/perf4 with dim 6 and hop 1: 100%|██████████| 170/170 [00:19<00:00,  8.54it/s]
Processing audio files in data/arpeggios/perf5 with dim 6 and hop 1: 100%|██████████| 170/170 [00:19<00:00,  8.66it/s]
Processing audio files in data/arpeggios/min6 with dim 6 and hop 1: 100%|██████████| 170/170 [00:19<00:00,  8.75it/s]
Processing audio files in data/arpeggios/dim with dim 6 and hop 1: 100%|██████████| 255/255 [00:36<00:00,  6.93it/s]
Processing audio files in data/arpeggios/min7 with dim 6 and hop 1: 100%|██████████| 340/340 [00:59<00:00,  5.69it/s]
Processing audio files in data/arpeggios/maj7_2 with dim 6 and hop 1: 100%|██████████| 170/170 [00:19<00:00,  8.58it/s]
Processing audio files in data/arpeggios/aug with dim 6 and hop 1: 100%|██████████| 255/255 [00:36<00:00,  6.90it/s]
Processing audio files in data/arpeggios/sixth with dim 6 and hop 1: 100%|██████████| 340/340 [00:59<00:00,  5.71it/s]
Processing audio files in data/arpeggios/maj3 with di

In [137]:
folder_path = "data/chords/maj7"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/chords/maj7 with dim 6 and hop 1: 100%|██████████| 340/340 [00:28<00:00, 11.78it/s]


In [138]:
folder_path = "data/chords/tritone"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/chords/tritone with dim 6 and hop 1: 100%|██████████| 170/170 [00:14<00:00, 11.65it/s]


In [139]:
folder_path = "data/chords/maj"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/chords/maj with dim 6 and hop 1: 100%|██████████| 255/255 [00:21<00:00, 11.61it/s]


In [140]:
folder_path = "data/chords/perf5"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/chords/perf5 with dim 6 and hop 1: 100%|██████████| 170/170 [00:14<00:00, 11.77it/s]


In [141]:
folder_path = "data/chords/sus4"
dim = 6
hop = 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/chords/sus4 with dim 6 and hop 1: 100%|██████████| 255/255 [00:21<00:00, 11.73it/s]
