# Imports

In [117]:
import os
import glob
import math
import numpy as np
import pandas as pd
import ordpy
from scipy.io import wavfile
from tqdm import tqdm
import librosa
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import logging
import warnings
from scipy.io.wavfile import WavFileWarning
from scipy.spatial import ConvexHull
from scipy.spatial import Delaunay
# Import shapely modules for geometry operations
import shapely.geometry as geometry
from shapely.ops import polygonize, unary_union



# Logger


In [118]:
# Module-wide logger: everything goes to "process.log", only warnings and
# errors are echoed to the console.
logger = logging.getLogger("audio_process")
logger.setLevel(logging.DEBUG)

# Detach AND close handlers left over from a previous run of this cell.
# Merely clearing the list would leave the old "process.log" file handle open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# File Handler: Write all messages (DEBUG and above) to a log file.
# mode='w' truncates the log every time this setup code runs.
fh = logging.FileHandler("process.log", mode='w')
fh.setLevel(logging.DEBUG)
fh_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh.setFormatter(fh_formatter)
logger.addHandler(fh)

# Console Handler: Only show WARNING and above on the console
# (INFO/DEBUG messages are available in the log file only).
ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch_formatter = logging.Formatter('%(message)s')
ch.setFormatter(ch_formatter)
logger.addHandler(ch)

# Silence scipy's WavFileWarning category when reading WAV files.
warnings.filterwarnings("ignore", category=WavFileWarning)

# Fractal dimension counting

In [119]:
def box_counting_dimension(ts, num_scales=10):
    """
    Estimate the fractal dimension of a time series using the box-counting algorithm.

    The 1D series is treated as a 2D curve with points (t, ts[t]), where t is
    the sample index. For a range of box sizes the number of occupied grid
    boxes is counted, and the dimension is the negative slope of
    log(count) against log(box size).

    Parameters:
        ts (array-like): 1D time series data with at least two finite samples.
        num_scales (int): Number of box sizes (scales) to evaluate.

    Returns:
        float: Estimated fractal (box-counting) dimension.

    Raises:
        ValueError: If ts is not 1D, has fewer than two samples, or contains
                    NaN/inf values.
    """
    values = np.asarray(ts, dtype=float)
    if values.ndim != 1:
        raise ValueError("ts must be a 1D sequence.")
    if values.size < 2:
        # With fewer than two samples the bounding box has zero extent and
        # the logarithmic scale below is undefined.
        raise ValueError("ts must contain at least two samples.")
    if not np.isfinite(values).all():
        raise ValueError("ts must contain only finite values.")

    # Create 2D points: x = time index, y = amplitude.
    points = np.column_stack((np.arange(values.size), values))

    # Offsets of every point from the lower-left corner of the bounding box.
    origin = points.min(axis=0)
    offsets = points - origin

    # Use the maximum span as the overall length scale (>= 1 because the
    # index axis spans at least one step).
    L = offsets.max()

    # Create a series of box sizes (epsilons) on a logarithmic scale.
    epsilons = np.logspace(np.log10(L / num_scales), np.log10(L), num=num_scales)

    # For each epsilon, count the distinct grid cells that contain a point.
    # floor_divide maps each point to its (column, row) cell index; unique
    # rows are the occupied boxes.
    counts = [
        np.unique(np.floor_divide(offsets, eps), axis=0).shape[0]
        for eps in epsilons
    ]

    # Linear regression of log(count) on log(eps); only the slope is needed.
    slope, _ = np.polyfit(np.log(epsilons), np.log(counts), 1)

    # The box-counting dimension is given by the negative slope.
    return -slope

# Create entropy complexity with ordpy


In [120]:
def ordpy_process_file(file_path, dim_size, hop_size):
    """
    Load one audio file and compute its permutation entropy / complexity with ordpy.

    Parameters:
        file_path (str): Path to a '.wav' or '.mp3' file (extension check is
                         case-insensitive).
        dim_size (int): Embedding dimension, passed to ordpy as ``dx``.
        hop_size (int): Embedding delay, passed to ordpy as ``taux``.

    Returns:
        The result of ``ordpy.complexity_entropy``; callers in this file
        unpack it as ``(entropy, complexity)``.

    Raises:
        ValueError: If the file extension is neither '.wav' nor '.mp3'.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.mp3':
        # Load mp3 using librosa.
        # sr=None preserves the native sampling rate.
        data, sr = librosa.load(file_path, sr=None, mono=True)
        logger.info(f"Processing MP3 '{file_path}' with sample rate: {sr}")
    elif ext == '.wav':
        sr, data = wavfile.read(file_path)
        logger.info(f"Processing WAV '{file_path}' with sample rate: {sr}")
        # If stereo, convert to mono by averaging channels.
        if data.ndim == 2:
            data = data.mean(axis=1)
    else:
        logger.error(f"Unsupported file extension: {ext}")
        raise ValueError(f"Unsupported file extension: {ext}")

    # Pass the parameters by keyword: ordpy's second positional parameter
    # after the data is ``dy`` (vertical embedding dimension), not the delay,
    # so a positional hop would be misinterpreted for any hop other than 1.
    return ordpy.complexity_entropy(data, dx=dim_size, taux=hop_size)


def ordpy_process_folder(folder_path, dim_size=6, hop_size=1):
    """
    Run ordpy_process_file on every '*.wav' and '*.mp3' file directly inside a folder.

    WAV files are processed before MP3 files. A file that raises is logged
    (with traceback) and left out of the results.

    Parameters:
        folder_path (str): Folder to scan (not recursive).
        dim_size (int): Forwarded to ordpy_process_file.
        hop_size (int): Forwarded to ordpy_process_file.

    Returns:
        tuple: Three parallel lists (entropies, complexities, file base names);
               all empty when the folder holds no matching files.
    """
    audio_paths = []
    for pattern in ("*.wav", "*.mp3"):
        audio_paths.extend(glob.glob(os.path.join(folder_path, pattern)))

    entropies, complexities, names = [], [], []

    if len(audio_paths) == 0:
        logger.info(f"No audio files found in {folder_path}")
        return entropies, complexities, names

    # tqdm wraps the file list to show a progress bar.
    progress = tqdm(
        audio_paths,
        desc=f"Processing audio files in {folder_path} with dim {dim_size} and hop {hop_size}",
    )
    for file_path in progress:
        try:
            entropy, complexity = ordpy_process_file(file_path, dim_size, hop_size)
        except Exception as e:
            logger.exception(f"Error processing file {file_path}: {e}")
            continue
        entropies.append(entropy)
        complexities.append(complexity)
        names.append(os.path.basename(file_path))

    return entropies, complexities, names

# Scatter with Entropy Complexity for one folder

In [121]:

def plot_graph_ordpy(folder_path, dim, hop, folder="plots"):
    """
    Plot the entropy–complexity plane for every audio file in one folder.

    Produces two figures: a Matplotlib scatter saved as PNG under
    ``<folder>/<folder name>/`` and an interactive Plotly scatter that is
    shown (file names appear on hover). Both include the maximum and minimum
    complexity boundary curves from ordpy when those are valid arrays.

    Parameters:
        folder_path (str): Folder containing the audio files.
        dim (int): Embedding dimension for ordpy.
        hop (int): Delay (hop) forwarded to the per-file computation.
        folder (str): Root output directory for the saved PNG.
    """
    logger.info(f"Processing folder {folder_path}")

    # Process the folder to get entropy, complexity, and file labels.
    all_H, all_comp, file_labels = ordpy_process_folder(folder_path, dim, hop)
    df = pd.DataFrame({
        "Normalized Permutation Entropy": all_H,
        "Normalized Complexity": all_comp,
        "File": file_labels
    })

    # The boundary curves depend on the embedding dimension only. The second
    # positional parameter of these ordpy functions is ``dy`` (vertical
    # embedding dimension), so the delay must NOT be passed there.
    # Validate each curve once so a malformed one is reported a single time.
    boundaries = []
    for var_name, curve, mpl_style, plotly_color, label in (
        ("max_HC", ordpy.maximum_complexity_entropy(dx=dim), 'r--', 'red',
         'Maximum Complexity Boundary'),
        ("min_HC", ordpy.minimum_complexity_entropy(dx=dim), 'g--', 'green',
         'Minimum Complexity Boundary'),
    ):
        if isinstance(curve, np.ndarray) and curve.ndim == 2 and curve.shape[1] == 2:
            boundaries.append((curve, mpl_style, plotly_color, label))
        else:
            logger.warning(f"{var_name} is not in the expected numpy array format.")

    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty string.
    folder_name = os.path.basename(os.path.normpath(folder_path))

    # -------------------------
    # Matplotlib Plot Section
    # -------------------------
    plt.figure(figsize=(8, 6))
    # Plot the data points without text labels.
    plt.scatter(df["Normalized Permutation Entropy"],
                df["Normalized Complexity"],
                s=70,
                c='blue',
                edgecolors='black')

    for curve, mpl_style, _, label in boundaries:
        plt.plot(curve[:, 0], curve[:, 1], mpl_style, label=label)

    plt.xlabel("Normalized Permutation Entropy")
    plt.ylabel("Normalized Complexity")
    plt.title(f"Entropy–Complexity for {folder_path}\nDim = '{dim}', Hop = '{hop}'")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.gca().set_aspect('equal', adjustable='box')
    if boundaries:
        # Only the boundary curves carry labels; skip the legend when none
        # could be drawn.
        plt.legend()
    plt.tight_layout()

    output_dir = os.path.join(folder, folder_name)
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"entropy_complexity_{folder_name}_dim_{dim}_hop_{hop}.png")
    plt.savefig(output_file)
    plt.close()
    logger.info(f"Saved Matplotlib plot to {output_file}")

    # -------------------------
    # Plotly Plot Section
    # -------------------------
    fig = go.Figure()

    # Add the data points as markers only. The file labels will appear when hovering.
    fig.add_trace(go.Scatter(
        x=df["Normalized Permutation Entropy"],
        y=df["Normalized Complexity"],
        mode='markers',  # Only markers, no text labels on the plot.
        marker=dict(size=10, color='blue', line=dict(width=1, color='black')),
        name='Data Points',
        hovertext=df["File"],
        hovertemplate=(
            "<b>File:</b> %{hovertext}<br>" +
            "<b>Entropy:</b> %{x}<br>" +
            "<b>Complexity:</b> %{y}<extra></extra>"
        )
    ))

    for curve, _, plotly_color, label in boundaries:
        fig.add_trace(go.Scatter(
            x=curve[:, 0],
            y=curve[:, 1],
            mode='lines',
            line=dict(color=plotly_color, dash='dash'),
            name=label
        ))

    # Update layout settings for the Plotly figure.
    # Plotly text uses <br> for line breaks; a plain "\n" is not rendered.
    fig.update_layout(
        title=f"Entropy–Complexity for {folder_path} (Plotly)<br>Dim = '{dim}', Hop = '{hop}'",
        xaxis_title="Normalized Permutation Entropy",
        yaxis_title="Normalized Complexity",
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        width=800,
        height=600,
        template="plotly_white"
    )

    fig.show()

# Centroids for multiple folders

In [122]:
def plot_graph_ordpy_centroids(main_folder_path, dim, hop):
    """
    Plot one entropy–complexity centroid per subfolder of a main folder.

    Each immediate subdirectory is processed with ordpy_process_folder; the
    mean entropy and mean complexity of its files form the centroid. The
    centroids are shown in a Plotly figure together with the maximum and
    minimum complexity boundary curves from ordpy.

    Parameters:
        main_folder_path (str): Folder whose subfolders hold audio files.
        dim (int): Embedding dimension for ordpy.
        hop (int): Delay (hop) forwarded to the per-file computation.

    Returns:
        None. If no subfolder yields data, an error is logged and nothing is
        plotted.
    """
    centroids = []

    # Iterate over all items in the main folder
    for item in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, item)
        if not os.path.isdir(subfolder_path):
            continue
        logger.info(f"Processing folder: {subfolder_path}")
        try:
            # Process the folder and obtain entropy, complexity, and file labels.
            all_H, all_comp, file_labels = ordpy_process_folder(subfolder_path, dim, hop)

            # Skip folders with no valid data
            if len(all_H) == 0 or len(all_comp) == 0:
                logger.warning(f"No data found in {subfolder_path}. Skipping.")
                continue

            centroids.append({
                "Folder": item,  # use subfolder name as identifier
                "Normalized Permutation Entropy": np.mean(all_H),
                "Normalized Complexity": np.mean(all_comp)
            })
        except Exception as e:
            # logger.exception keeps the traceback, matching the per-file
            # error handling in ordpy_process_folder.
            logger.exception(f"Error processing {subfolder_path}: {e}")
            continue

    # If no centroids were computed, log an error and exit.
    if not centroids:
        logger.error("No centroids were computed. Please check your data and folder structure.")
        return

    # Create a DataFrame from the centroids list
    df_centroids = pd.DataFrame(centroids)
    logger.info(f"Centroids DataFrame:\n{df_centroids}")

    # Create a Plotly figure
    fig = go.Figure()

    # One trace per folder so that each folder gets its own color and legend
    # entry (there is a single centroid row per folder).
    for folder, df_group in df_centroids.groupby("Folder"):
        fig.add_trace(go.Scatter(
            x=df_group["Normalized Permutation Entropy"],
            y=df_group["Normalized Complexity"],
            mode='markers',
            marker=dict(size=12, line=dict(width=1, color='black')),
            name=folder,
            hovertemplate=(
                "<b>Folder:</b> " + folder + "<br>" +
                "<b>Entropy:</b> %{x}<br>" +
                "<b>Complexity:</b> %{y}<extra></extra>"
            )
        ))

    # The boundary curves depend on the embedding dimension only. The second
    # positional parameter of these ordpy functions is ``dy`` (vertical
    # embedding dimension), so the delay must NOT be passed there.
    for var_name, curve, line_color, label in (
        ("max_HC", ordpy.maximum_complexity_entropy(dx=dim), 'red',
         'Maximum Complexity Boundary'),
        ("min_HC", ordpy.minimum_complexity_entropy(dx=dim), 'green',
         'Minimum Complexity Boundary'),
    ):
        # Curves are expected to be NumPy arrays with shape (n_points, 2).
        if isinstance(curve, np.ndarray) and curve.ndim == 2 and curve.shape[1] == 2:
            fig.add_trace(go.Scatter(
                x=curve[:, 0],
                y=curve[:, 1],
                mode='lines',
                line=dict(color=line_color, dash='dash'),
                name=label
            ))
        else:
            logger.warning(f"{var_name} is not in the expected numpy array format.")

    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty string in the title.
    main_name = os.path.basename(os.path.normpath(main_folder_path))

    # Update layout settings for the figure
    fig.update_layout(
        title=f"Centroids of Entropy–Complexity for Folders in '{main_name}' (Dim={dim}, Hop={hop})",
        xaxis_title="Normalized Permutation Entropy",
        yaxis_title="Normalized Complexity",
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        template="plotly_white",
        width=800,
        height=600
    )

    # Show the Plotly figure
    fig.show()


    

# Centroids with regions for multiple folders

In [123]:

def add_opacity_to_color(color_string, opacity):
    """
    Convert a hex or ``rgb(...)`` color string to an ``rgba(...)`` string.

    Supported inputs:
        * hex: ``#RGB``, ``#RRGGBB`` and the alpha-carrying forms ``#RGBA`` /
          ``#RRGGBBAA`` (the embedded alpha is replaced by ``opacity``);
        * ``rgb(r, g, b)`` with integer components.

    Parameters:
        color_string (str): Color to convert.
        opacity (float): Alpha value written into the result as-is.

    Returns:
        str: ``"rgba(r, g, b, opacity)"``, or ``color_string`` unchanged when
             its format is not one of the supported ones (e.g. a named color).

    Raises:
        ValueError: If a supported-looking string contains non-numeric
                    components (invalid hex digits or non-integer rgb values).
    """
    if color_string.startswith("#"):
        hex_digits = color_string.lstrip("#")
        if len(hex_digits) in (3, 4):
            # Shorthand notation: every digit stands for a doubled pair
            # ("#abc" means "#aabbcc").
            hex_digits = "".join(digit * 2 for digit in hex_digits)
        if len(hex_digits) not in (6, 8):
            return color_string
        # Only the first three channel pairs are used; a trailing alpha pair
        # is superseded by the requested opacity.
        r, g, b = (int(hex_digits[i:i + 2], 16) for i in (0, 2, 4))
        return f"rgba({r}, {g}, {b}, {opacity})"
    # If color is in rgb(...) format, just adjust the opacity.
    if color_string.startswith("rgb("):
        rgb_vals = color_string[4:-1]
        r, g, b = [int(v.strip()) for v in rgb_vals.split(",")]
        return f"rgba({r}, {g}, {b}, {opacity})"
    return color_string

def alpha_shape(points, alpha=0.1):
    """
    Compute the alpha shape (concave hull) of a set of 2D points.

    The points are Delaunay-triangulated and only triangles whose
    circumradius is smaller than ``1 / alpha`` are kept; the kept triangles'
    edges are polygonized and merged.

    Parameters:
        points (array-like): Coordinates of shape (n_points, 2).
        alpha (float): Concavity control. LARGER values lower the circumradius
                       threshold, discard more triangles and give a tighter,
                       more detailed outline; smaller values approach the
                       convex hull. ``alpha == 0`` keeps every non-degenerate
                       triangle.

    Returns:
        shapely geometry: The merged shape. With fewer than four points the
        convex hull of the points is returned instead. The result is not
        guaranteed to be a single Polygon (it can consist of several parts,
        or be empty when no triangle qualifies), so callers should check
        ``geom_type``.
    """
    # Accept lists/tuples as well: the side-length arithmetic needs arrays.
    points = np.asarray(points, dtype=float)

    if len(points) < 4:
        return geometry.MultiPoint(list(points)).convex_hull

    # alpha == 0 means "no radius limit"; avoid dividing by zero.
    max_radius = np.inf if alpha == 0 else 1.0 / alpha

    # Perform Delaunay triangulation on the point set.
    tri = Delaunay(points)
    edges = set()
    for ia, ib, ic in tri.simplices:
        pa, pb, pc = points[ia], points[ib], points[ic]
        # Compute lengths of triangle sides.
        a = np.linalg.norm(pa - pb)
        b = np.linalg.norm(pb - pc)
        c = np.linalg.norm(pc - pa)
        s = (a + b + c) / 2.0
        # Heron's formula. For near-degenerate triangles rounding can push the
        # product slightly below zero, which would make sqrt() raise.
        area_sq = s * (s - a) * (s - b) * (s - c)
        if area_sq <= 0:
            # Degenerate triangle: infinite circumradius, never included.
            continue
        circum_r = a * b * c / (4.0 * math.sqrt(area_sq))
        # If the circumradius is below the threshold, include the triangle's edges.
        if circum_r < max_radius:
            for i, j in ((ia, ib), (ib, ic), (ic, ia)):
                # Store each undirected edge once, even when two kept
                # triangles share it.
                edges.add((int(min(i, j)), int(max(i, j))))

    # Build a set of line segments from the edges (sorted for determinism).
    edge_segments = [(points[i], points[j]) for i, j in sorted(edges)]
    m = geometry.MultiLineString(edge_segments)
    # Polygonize the edge segments to form candidate polygons.
    triangles = list(polygonize(m))
    # Merge the triangles into a single (possibly concave or multi-part) geometry.
    return unary_union(triangles)

def plot_graph_ordpy_centroids_with_region(main_folder_path, dim, hop, alpha_param):
    """
    Plot, per subfolder, the region covered by its files and their centroid
    in the entropy–complexity plane.

    Each immediate subdirectory is processed with ordpy_process_folder. Its
    (entropy, complexity) points are outlined with alpha_shape and drawn as a
    translucent filled region; the mean point is drawn as a marker in the same
    color. When no polygon can be derived, the raw points are drawn as
    low-opacity markers instead. The maximum and minimum complexity boundary
    curves from ordpy are added as dashed lines.

    Parameters:
        main_folder_path (str): Path to the folder containing subfolders.
        dim (int): Embedding dimension for ordpy.
        hop (int): Delay (or hop) parameter forwarded to the per-file computation.
        alpha_param (float): Passed to alpha_shape, which keeps Delaunay
                             triangles with circumradius below
                             ``1 / alpha_param``: larger values give tighter,
                             more detailed regions, smaller values approach
                             the convex hull.

    Returns:
        None. If no subfolder yields data, an error is logged and nothing is
        plotted.
    """
    # Dictionary to store per-folder data.
    folder_data = {}

    # Iterate over all items (subfolders) in the main folder.
    for item in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, item)
        if not os.path.isdir(subfolder_path):
            continue
        logger.info(f"Processing folder: {subfolder_path}")
        try:
            # Process the folder and obtain entropy, complexity, and file labels.
            all_H, all_comp, file_labels = ordpy_process_folder(subfolder_path, dim, hop)

            # Skip folders with no valid data.
            if len(all_H) == 0 or len(all_comp) == 0:
                logger.warning(f"No data found in {subfolder_path}. Skipping.")
                continue

            # Save data points and the centroid (mean entropy and complexity).
            folder_data[item] = {
                "H": np.array(all_H),
                "comp": np.array(all_comp),
                "centroid_H": np.mean(all_H),
                "centroid_comp": np.mean(all_comp)
            }
        except Exception as e:
            # logger.exception keeps the traceback, matching the per-file
            # error handling in ordpy_process_folder.
            logger.exception(f"Error processing {subfolder_path}: {e}")
            continue

    # If no data was collected, exit.
    if not folder_data:
        logger.error("No centroids were computed. Please check your data and folder structure.")
        return

    # Assign colors from Plotly's qualitative palette in sorted folder order
    # (cycling when there are more folders than colors).
    colors = px.colors.qualitative.Plotly
    folder_list = sorted(folder_data)
    color_map = {name: colors[i % len(colors)] for i, name in enumerate(folder_list)}

    # Create a Plotly figure.
    fig = go.Figure()

    # Iterate in sorted order so the legend order is stable across runs
    # (os.listdir order is arbitrary).
    for folder in folder_list:
        data = folder_data[folder]
        points = np.column_stack((data["H"], data["comp"]))
        color = color_map[folder]
        # Create a low-opacity fill color (also used for the outline).
        fill_color = add_opacity_to_color(color, 0.3)
        region_name = f"{folder} region"

        # Collect the polygon parts of the alpha shape, if any.
        polygons = []
        try:
            concave_hull = alpha_shape(points, alpha_param)
            if concave_hull.geom_type == 'Polygon':
                polygons = [concave_hull]
            elif concave_hull.geom_type == 'MultiPolygon':
                # Disjoint clusters yield several parts; draw each of them.
                polygons = list(concave_hull.geoms)
            polygons = [part for part in polygons if not part.is_empty]
        except Exception as e:
            logger.error(f"Could not compute alpha shape for folder {folder}: {e}")
            polygons = []

        if polygons:
            for part_index, part in enumerate(polygons):
                hull_coords = np.array(part.exterior.coords)
                fig.add_trace(go.Scatter(
                    x=hull_coords[:, 0],
                    y=hull_coords[:, 1],
                    mode='lines',
                    fill='toself',
                    fillcolor=fill_color,
                    line=dict(color=fill_color, width=2),
                    name=region_name,
                    # Group the parts under one legend entry.
                    legendgroup=region_name,
                    showlegend=(part_index == 0),
                    hoverinfo='skip'
                ))
        else:
            # Fallback: no usable polygon, plot the points as low-opacity markers.
            fig.add_trace(go.Scatter(
                x=data["H"],
                y=data["comp"],
                mode='markers',
                marker=dict(color=color, opacity=0.3),
                name=region_name,
                showlegend=True,
                hoverinfo='skip'
            ))

        # Add the centroid as a marker (using the same color).
        fig.add_trace(go.Scatter(
            x=[data["centroid_H"]],
            y=[data["centroid_comp"]],
            mode='markers',
            marker=dict(color=color, size=12, symbol='circle'),
            name=f"{folder} centroid",
            hovertemplate=(
                f"<b>Folder:</b> {folder}<br>" +
                "<b>Centroid Entropy:</b> %{x}<br>" +
                "<b>Centroid Complexity:</b> %{y}<extra></extra>"
            )
        ))

    # The boundary curves depend on the embedding dimension only. The second
    # positional parameter of these ordpy functions is ``dy`` (vertical
    # embedding dimension), so the delay must NOT be passed there.
    for var_name, curve, line_color, label in (
        ("max_HC", ordpy.maximum_complexity_entropy(dx=dim), 'red',
         'Maximum Complexity Boundary'),
        ("min_HC", ordpy.minimum_complexity_entropy(dx=dim), 'green',
         'Minimum Complexity Boundary'),
    ):
        # Curves are expected to be NumPy arrays with shape (n_points, 2).
        if isinstance(curve, np.ndarray) and curve.ndim == 2 and curve.shape[1] == 2:
            fig.add_trace(go.Scatter(
                x=curve[:, 0],
                y=curve[:, 1],
                mode='lines',
                line=dict(color=line_color, dash='dash'),
                name=label
            ))
        else:
            logger.warning(f"{var_name} is not in the expected numpy array format.")

    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty string in the title.
    main_name = os.path.basename(os.path.normpath(main_folder_path))

    # Update layout settings for the figure.
    fig.update_layout(
        title=f"Entropy–Complexity Regions for Folders in '{main_name}' (Dim={dim}, Hop={hop})",
        xaxis_title="Normalized Permutation Entropy",
        yaxis_title="Normalized Complexity",
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1]),
        template="plotly_white",
        width=800,
        height=600
    )

    # Show the Plotly figure.
    fig.show()

# Plots

In [124]:
# Entropy–complexity scatter for a single genre folder.
folder_path = "data/genres_30sec/blues"
dim, hop = 6, 1
plot_graph_ordpy(folder_path, dim, hop)

Processing audio files in data/genres_30sec/blues with dim 6 and hop 1: 100%|██████████| 100/100 [02:13<00:00,  1.34s/it]


In [125]:
# Per-genre regions and centroids across every subfolder of the dataset.
folder_path = "data/genres_30sec"
dim, hop = 6, 1
alpha_val = 7
plot_graph_ordpy_centroids_with_region(folder_path, dim, hop, alpha_val)

Processing audio files in data/genres_30sec/pop with dim 6 and hop 1: 100%|██████████| 100/100 [02:11<00:00,  1.31s/it]
Processing audio files in data/genres_30sec/metal with dim 6 and hop 1: 100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
Processing audio files in data/genres_30sec/disco with dim 6 and hop 1: 100%|██████████| 100/100 [02:11<00:00,  1.32s/it]
Processing audio files in data/genres_30sec/blues with dim 6 and hop 1: 100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
Processing audio files in data/genres_30sec/reggae with dim 6 and hop 1: 100%|██████████| 100/100 [17:51<00:00, 10.71s/it]  
Processing audio files in data/genres_30sec/classical with dim 6 and hop 1: 100%|██████████| 100/100 [18:18<00:00, 10.99s/it]  
Processing audio files in data/genres_30sec/rock with dim 6 and hop 1: 100%|██████████| 100/100 [18:59<00:00, 11.40s/it]  
Processing audio files in data/genres_30sec/hiphop with dim 6 and hop 1: 100%|██████████| 100/100 [02:08<00:00,  1.28s/it]
Processing audi