In [73]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

In [74]:
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
from math import pi
from collections import defaultdict

In [None]:
def format_statistics_box(models, plot_data):
    stats_lines = []
    
    stats_lines.append("Model         Mean       Std        Count")
    stats_lines.append("─" * 43)
    
    for model, data in zip(models, plot_data):
        if len(data) > 0:
            line = f"{model:<13} {data.mean():>9.4f}  {data.std():>9.4f}  {len(data):>5d}"
            stats_lines.append(line)
    
    return '\n'.join(stats_lines)

In [76]:
current_dir = Path.cwd()
project_root = current_dir.parent.parent

sns.set(style='whitegrid')

In [77]:
# Available Datasets = ['FakeRealMusicOriginal/', 'FakeRealMusicOriginalNormalized/']
Dataset = 'FakeRealMusicOriginal/'

# Available Sources = ['mix', 'vocals0', 'drums0', 'bass0', 'other0']
separated_source = 'mix'

# Available LUFS = ['', 'minus14/', 'minus23/']
LUFS = ''

# Available Perturbations = ['', 'base/', 'mp3_192/', 'noise_snr30/', 'resample22k/', 'reverb_room/']
perturbation = ''

test_name = 'Extended_full_track_features/'

In [78]:
features_path = f'../../results/Features/{Dataset}{LUFS}{perturbation}{test_name}full_track/features_full_track.json'

In [None]:
def load_and_prepare_data_full(json_file, separated_source):
    """
    Load JSON data and preserve ALL sub-features from nested structure.
    
    Data structure example:
    {
        model_name: {
            track_id: {
                "type": "full_track" | "segment",
                "segment_id": null | value,
                "features": {
                    "mix": {
                        "duration": 120.0,
                        "rms_wave": {"min": ..., "mean": ..., "std": ..., "max": ...},
                        "jitter": {"jitter_local": ..., "jitter_rap": ..., ...},
                        ...
                    },
                    "vocals0": {...},
                    ...
                }
            },
            ...
        },
        ...
    }
    
    Output:
    - DataFrame with collumns: model, track, data_type, source, segment_id, [all_features}]
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    all_rows = []
    
    type_mapping = {
        'ElevenLabs': 'GENERATED',       
        'REAL': 'REAL',
        'SUNO': 'GENERATED',
        'SUNO_PRO': 'GENERATED',
        'UDIO': 'GENERATED',
    }
    
    for model_name, tracks_dict in data.items():
        for track_key, track_data in tracks_dict.items():
            
            if not isinstance(track_data, dict) or 'features' not in track_data:
                continue
            
            features_root = track_data.get('features', {})
            track_type = track_data.get('type', 'unknown')
            segment_id = track_data.get('segment_id', None)
            
            sources_to_process = [separated_source]
            
            for source in sources_to_process:
                if source not in features_root:
                    continue
                
                features = features_root[source]
                row = {
                    'model': model_name,
                    'track': track_key,
                    'source': source,
                    'data_type': type_mapping.get(model_name, model_name),
                    'segment_id': segment_id,
                }
                
                def flatten_feature(feat_dict, prefix=''):
                    result = {}
                    
                    for key, val in feat_dict.items():
                        col_name = f"{prefix}_{key}" if prefix else key
                        
                        if isinstance(val, dict):
                            stats_keys = {'min', 'mean', 'std', 'max'}
                            
                            if stats_keys.intersection(val.keys()):
                                for stat_name, stat_val in val.items():
                                    result[f"{col_name}_{stat_name}"] = float(stat_val) if isinstance(stat_val, (int, float)) else np.nan
                            else:
                                nested = flatten_feature(val, prefix=col_name)
                                result.update(nested)
                        
                        elif isinstance(val, list):
                            if len(val) > 0 and all(isinstance(x, (int, float)) for x in val):
                                result[f"{col_name}_mean"] = float(np.mean(val))
                                result[f"{col_name}_min"] = float(np.min(val))
                                result[f"{col_name}_max"] = float(np.max(val))
                                result[f"{col_name}_std"] = float(np.std(val)) if len(val) > 1 else 0.0
                            else:
                                pass
                        
                        elif isinstance(val, (int, float)):
                            result[col_name] = float(val)
                        elif isinstance(val, bool):
                            result[col_name] = val
                        elif isinstance(val, str):
                            try:
                                result[col_name] = float(val)
                            except (ValueError, TypeError):
                                pass
                    
                    return result
                
                flattened = flatten_feature(features)
                row.update(flattened)
                
                all_rows.append(row)
    
    features_df = pd.DataFrame(all_rows)
    
    if features_df.empty:
        print("⚠️ Warning: No data loaded from JSON file!")
        return features_df, []
    
    exclude_cols = {'model', 'track', 'source', 'data_type', 'segment_id'}
    feature_cols = [col for col in features_df.columns if col not in exclude_cols]
    
    print(f"\n{'='*80}")
    print(f"✅ Data loaded successfully!")
    print(f"   • Models: {features_df['model'].unique().tolist()}")
    print(f"   • Total records: {len(features_df)}")
    print(f"   • Total features: {len(feature_cols)}")
    print(f"   • Sample features: {feature_cols[:10]}")
    print(f"{'='*80}\n")
    
    return features_df, feature_cols


In [80]:
features_df, features_to_analyze = load_and_prepare_data_full(features_path, separated_source)
print(f"\n✓ Data loaded: {len(features_df)} samples, {len(features_to_analyze)} features")
print(f"✓ Models: {features_df['model'].value_counts().to_dict()}\n")


✅ Data loaded successfully!
   • Models: ['ElevenLabs', 'REAL', 'SUNO', 'SUNO_PRO', 'UDIO']
   • Total records: 50
   • Total features: 65
   • Sample features: ['duration', 'rms_wave_min', 'rms_wave_mean', 'rms_wave_std', 'rms_wave_max', 'rms_spec_min', 'rms_spec_mean', 'rms_spec_std', 'rms_spec_max', 'zero_crossing_rate']


✓ Data loaded: 50 samples, 65 features
✓ Models: {'ElevenLabs': 10, 'REAL': 10, 'SUNO': 10, 'SUNO_PRO': 10, 'UDIO': 10}



In [81]:
features_df.head(5)

Unnamed: 0,model,track,source,data_type,segment_id,duration,rms_wave_min,rms_wave_mean,rms_wave_std,rms_wave_max,...,gne,breath_count,intonation_pattern_pitch_variability,voice_breaks,rhythm_stats_tempo_bpm_mean,rhythm_stats_tempo_bpm_min,rhythm_stats_tempo_bpm_max,rhythm_stats_tempo_bpm_std,rhythm_stats_avg_onset_strength,rhythm_stats_max_onset_strength
0,ElevenLabs,1__Ed_Sheeran_-_Perfect_Echoes_of_You_Wariant_...,mix,GENERATED,,120.0,2.089001e-05,0.139664,0.070806,0.333599,...,10.047748,24.0,67.918416,97.0,120.18532,120.18532,120.18532,0.0,1.000118,13.041919
1,ElevenLabs,10__Adele_Rolling_in_the_Deep_-_Edge_of_the_He...,mix,GENERATED,,99.892245,0.0,0.178018,0.0838,0.411453,...,8.568966,31.0,97.10983,63.0,78.302557,78.302557,78.302557,0.0,1.160766,23.53215
2,ElevenLabs,2__Travis_Scott_Sico_Mode_-_Run_This_Town_Wari...,mix,GENERATED,,103.88898,0.0,0.159084,0.090357,0.442858,...,6.943455,44.0,205.753161,83.0,132.512019,132.512019,132.512019,0.0,1.611085,26.747967
3,ElevenLabs,3__Imagine_Dragons_Believer_-_Together_We_Rise...,mix,GENERATED,,89.887347,0.0,0.146642,0.071364,0.353091,...,7.243923,13.0,235.435288,51.0,147.65625,147.65625,147.65625,0.0,1.080609,17.320227
4,ElevenLabs,4__Offset_Bodies_-_Ruckus_Resonance_Wariant_2_...,mix,GENERATED,,73.926531,2.330183e-07,0.166111,0.066529,0.385434,...,5.300772,3.0,468.677755,20.0,143.554688,143.554688,143.554688,0.0,1.483804,16.723547


In [None]:
def setup_professional_style():
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
    plt.rcParams['font.size'] = 10
    plt.rcParams['axes.labelsize'] = 12
    plt.rcParams['axes.titlesize'] = 13
    plt.rcParams['xtick.labelsize'] = 11
    plt.rcParams['ytick.labelsize'] = 11
    plt.rcParams['legend.fontsize'] = 10
    plt.rcParams['figure.titlesize'] = 16
    
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.3
    plt.rcParams['grid.linestyle'] = '--'
    plt.rcParams['grid.linewidth'] = 0.5
    
    plt.rcParams['axes.linewidth'] = 1.5
    plt.rcParams['xtick.major.width'] = 1.5
    plt.rcParams['ytick.major.width'] = 1.5
    
    sns.set_palette("husl")

PROFESSIONAL_COLORS = {
    'REAL': '#1f77b4',
    'ElevenLabs': '#ff7f0e',
    'SUNO': '#2ca02c',
    'SUNO_PRO': '#d62728',
    'UDIO': '#9467bd'
}

In [83]:
base_output_folder = Path(f'{Dataset}{test_name}{LUFS}{perturbation}{separated_source}2/')
base_output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
def viz2_real_vs_generated_by_feature_violin(features_df, features_to_analyze=None):
    
    setup_professional_style()
    
    base_folder = Path(f'{base_output_folder}/visualizations_violin')
    base_folder.mkdir(exist_ok=True)
    
    print(f"\n{'='*80}")
    print("Creating PROFESSIONAL visualizations for academic thesis...")
    print(f"{'='*80}\n")
    
    exclude_cols = {'model', 'track', 'source', 'data_type', 'segment_id'}
    all_cols = [col for col in features_df.columns if col not in exclude_cols]
    
    feature_groups = defaultdict(list)
    
    for col in all_cols:
        parts = col.split('_')
        
        if len(parts) > 1 and parts[-1] in ['min', 'mean', 'std', 'max']:
            base_name = '_'.join(parts[:-1])
            stat = parts[-1]
        else:
            base_name = col
            stat = 'single'
        
        feature_groups[base_name].append((col, stat))
    
    print(f"Found {len(feature_groups)} feature groups\n")
    
    for feature_base, columns_list in sorted(feature_groups.items()):
        print(f"Processing feature: {feature_base}")
        
        if len(columns_list) == 1 and columns_list[0][1] == 'single':
            col = columns_list[0][0]
            
            feature_folder = base_folder / feature_base
            feature_folder.mkdir(exist_ok=True)
            
            models = sorted(features_df['model'].unique())
            
            plot_data = []
            positions = []
            valid_models = []
            pos = 0
            
            for model in models:
                data = features_df[features_df['model'] == model][col].dropna()
                if len(data) > 0:
                    plot_data.append(data.values)
                    positions.append(pos)
                    valid_models.append(model)
                    pos += 1.5
            
            if len(plot_data) == 0:
                print(f"  ⚠️ SKIPPED: No valid data for feature '{col}'")
                continue
            
            fig, axes = plt.subplots(2, 2, figsize=(18, 14))
            axes = axes.flatten()
            
            for ax in axes[1:]:
                ax.set_visible(False)
            
            ax = axes[0]
            
            parts = ax.violinplot(plot_data, positions=positions, widths=0.7,
                                 showmeans=True, showmedians=True)
            
            for pc in parts['bodies']:
                pc.set_facecolor(PROFESSIONAL_COLORS.get('REAL', '#1f77b4'))
                pc.set_alpha(0.6)
                pc.set_edgecolor('black')
                pc.set_linewidth(1.5)
            
            parts['cmeans'].set_color('red')
            parts['cmeans'].set_linewidth(2)
            parts['cmedians'].set_color('black')
            parts['cmedians'].set_linewidth(2)
            
            for partname in ('cbars', 'cmaxes', 'cmins'):
                if partname in parts:
                    parts[partname].set_edgecolor('black')
                    parts[partname].set_linewidth(1.5)
            
            for data, pos in zip(plot_data, positions):
                y = data
                x = np.random.normal(pos, 0.04, size=len(y))
                ax.scatter(x, y, alpha=0.4, s=30, color='black', edgecolors='none')
            
            ax.set_xticks(positions)
            ax.set_xticklabels(valid_models, fontsize=13, fontweight='bold')
            ax.set_ylabel('Value', fontsize=14, fontweight='bold')
            ax.set_title(f'{col}', fontsize=14, fontweight='bold', pad=20)
            
            ax.grid(axis='y', alpha=0.4, linestyle='--', linewidth=1)
            ax.set_axisbelow(True)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

            if len(plot_data) > 0:
                textstr = format_statistics_box(valid_models, plot_data)
                ax.text(
                    0.98, 0.97, textstr,
                    transform=ax.transAxes,
                    fontsize=9,
                    verticalalignment='top',
                    horizontalalignment='right',
                    bbox=dict(
                        boxstyle='round,pad=0.8',
                        facecolor='white',
                        alpha=0.9,
                        edgecolor='black',
                        linewidth=1.2
                    ),
                    family='monospace'
                )

            plt.tight_layout()
            
            output_file = feature_folder / f'{feature_base}_violin_analysis.png'
            plt.savefig(output_file, dpi=300, bbox_inches='tight')
            plt.close()
            
            print(f"  ✓ Saved: {output_file}")
        
        else:
            feature_folder = base_folder / feature_base
            feature_folder.mkdir(exist_ok=True)
            
            stat_order = ['min', 'mean', 'std', 'max']
            columns_sorted = sorted(columns_list,
                                   key=lambda x: next((i for i, stat in enumerate(stat_order) 
                                                      if stat == x[1]), 999))
            
            fig, axes = plt.subplots(2, 2, figsize=(18, 14))
            axes = axes.flatten()
            
            models = sorted(features_df['model'].unique())
            
            for plot_idx, (col, stat) in enumerate(columns_sorted[:4]):
                ax = axes[plot_idx]
                
                plot_data = []
                positions = []
                valid_models = []
                pos = 0
                
                for model in models:
                    data = features_df[features_df['model'] == model][col].dropna()
                    if len(data) > 0:
                        plot_data.append(data.values)
                        positions.append(pos)
                        valid_models.append(model)
                        pos += 1.5
                
                if len(plot_data) == 0:
                    ax.text(0.5, 0.5, f'No data available\nfor {stat.upper()}',
                           ha='center', va='center', fontsize=12, color='red',
                           transform=ax.transAxes)
                    ax.set_title(f'{stat.upper()} - NO DATA', fontsize=13, 
                               fontweight='bold', color='red')
                    continue
                
                parts = ax.violinplot(plot_data, positions=positions, widths=0.7,
                                     showmeans=True, showmedians=True)
                
                for idx, pc in enumerate(parts['bodies']):
                    if idx < len(valid_models):
                        color = PROFESSIONAL_COLORS.get(valid_models[idx], '#888888')
                    else:
                        color = '#888888'
                    pc.set_facecolor(color)
                    pc.set_alpha(0.7)
                    pc.set_edgecolor('black')
                    pc.set_linewidth(1.5)
                
                parts['cmeans'].set_color('red')
                parts['cmeans'].set_linewidth(2.5)
                parts['cmeans'].set_label('Mean')
                parts['cmedians'].set_color('darkblue')
                parts['cmedians'].set_linewidth(2.5)
                parts['cmedians'].set_label('Median')
                
                for partname in ('cbars', 'cmaxes', 'cmins'):
                    if partname in parts:
                        parts[partname].set_edgecolor('black')
                        parts[partname].set_linewidth(1.5)
                
                for data, pos, model in zip(plot_data, positions, valid_models):
                    y = data
                    x = np.random.normal(pos, 0.05, size=len(y))
                    ax.scatter(x, y, alpha=0.35, s=25, color='black', edgecolors='none')
                
                ax.set_xticks(positions)
                ax.set_xticklabels(valid_models, fontsize=12, fontweight='bold')
                ax.set_ylabel('Value', fontsize=12, fontweight='bold')
                
                stat_display = stat.upper() if stat != 'single' else feature_base
                ax.set_title(f'{stat_display}', fontsize=13, fontweight='bold', pad=15,
                            bbox=dict(boxstyle='round,pad=0.7', facecolor='lightgray', 
                                     alpha=0.8, edgecolor='black', linewidth=1.5))
                
                ax.grid(axis='y', alpha=0.4, linestyle='--', linewidth=1)
                ax.set_axisbelow(True)
                
                ax.spines['top'].set_visible(False)
                ax.spines['right'].set_visible(False)
                ax.spines['left'].set_linewidth(1.5)
                ax.spines['bottom'].set_linewidth(1.5)
                
                if len(plot_data) > 0:
                    textstr = format_statistics_box(valid_models, plot_data)
                    ax.text(
                            1.02, 0.98, textstr, transform=ax.transAxes,
                            ha='left', va='top',
                            fontsize=9, verticalalignment='top', horizontalalignment='right',
                            bbox=dict(boxstyle='round,pad=0.8', facecolor='white', 
                                    alpha=0.9, edgecolor='black', linewidth=1.2),
                            family='monospace', clip_on=False)
            
            fig.suptitle(f'Feature Analysis: {feature_base.replace("_", " ").title()} '
                        f'(REAL vs GENERATED Models)',
                        fontsize=18, fontweight='bold', y=0.98)
            
            plt.tight_layout()
            
            output_file = feature_folder / f'{feature_base}_analysis.png'
            plt.savefig(output_file, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()
            
            print(f"  ✓ Saved: {output_file}")
    
    print(f"\n{'='*80}")
    print(f"✅ Professional visualizations saved to: visualizations/")
    print(f"✅ Ready for academic thesis presentation!")
    print(f"{'='*80}\n")

In [85]:
viz2_real_vs_generated_by_feature_violin(
    features_df,
    features_to_analyze
)


Creating PROFESSIONAL visualizations for academic thesis...

Found 32 feature groups

Processing feature: breath_count
  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_violin\breath_count\breath_count_violin_analysis.png
Processing feature: duration
  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_violin\duration\duration_violin_analysis.png
Processing feature: f0
  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_violin\f0\f0_analysis.png
Processing feature: gne
  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_violin\gne\gne_violin_analysis.png
Processing feature: hnr
  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_violin\hnr\hnr_violin_analysis.png
Processing feature: intonation_pattern_pitch_variability
  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_violin\intonation_pattern_pitch_varia

In [None]:
BOX_FILL_COLORS = {
    'REAL': '#aec7e8',
    'ElevenLabs': '#ffbb78',
    'SUNO': '#98df8a',
    'SUNO_PRO': '#ff9896',
    'UDIO': '#c5b0d5'
}

def viz2_real_vs_generated_boxplots(features_df, features_to_analyze=None):
    
    setup_professional_style()
    
    base_folder = Path(f'{base_output_folder}/visualizations_boxplot')
    base_folder.mkdir(exist_ok=True)
    
    print(f"\n{'='*80}")
    print("Creating PROFESSIONAL boxplot visualizations for academic thesis...")
    print(f"{'='*80}\n")
    
    exclude_cols = {'model', 'track', 'source', 'data_type', 'segment_id'}
    all_cols = [col for col in features_df.columns if col not in exclude_cols]
    
    feature_groups = defaultdict(list)
    
    for col in all_cols:
        parts = col.split('_')
        
        if len(parts) > 1 and parts[-1] in ['min', 'mean', 'std', 'max']:
            base_name = '_'.join(parts[:-1])
            stat = parts[-1]
        else:
            base_name = col
            stat = 'single'
        
        feature_groups[base_name].append((col, stat))
    
    print(f"Found {len(feature_groups)} feature groups\n")
    
    for feature_base, columns_list in sorted(feature_groups.items()):
        print(f"Processing feature: {feature_base}")
        
        if len(columns_list) == 1 and columns_list[0][1] == 'single':
            col = columns_list[0][0]
            
            feature_folder = base_folder / feature_base
            feature_folder.mkdir(exist_ok=True)
            
            fig, axes = plt.subplots(2, 2, figsize=(18, 14))
            axes = axes.flatten()
            
            for ax in axes[1:]:
                ax.set_visible(False)
            
            ax = axes[0]
            models = sorted(features_df['model'].unique())
            
            plot_data = []
            valid_models = []
            
            for model in models:
                data = features_df[features_df['model'] == model][col].dropna()
                if len(data) > 0:
                    plot_data.append(data)
                    valid_models.append(model)
            
            if len(plot_data) == 0:
                print(f"  ⚠️ SKIPPED: No valid data for feature '{col}'")
                continue
            
            bp = ax.boxplot(plot_data, 
                           labels=valid_models,
                           patch_artist=True,
                           widths=0.6,
                           showmeans=True,
                           meanline=False,
                           notch=False,
                           vert=True,
                           whis=1.5,
                           meanprops=dict(marker='D', markerfacecolor='red', 
                                        markersize=7, markeredgecolor='darkred',
                                        markeredgewidth=1.5),
                           medianprops=dict(color='darkblue', linewidth=2),
                           whiskerprops=dict(linewidth=1.5, color='black'),
                           capprops=dict(linewidth=1.5, color='black'),
                           boxprops=dict(linewidth=1.5, color='black'))
            
            for patch, model in zip(bp['boxes'], valid_models):
                patch.set_facecolor(BOX_FILL_COLORS.get(model, '#cccccc'))
                patch.set_alpha(0.8)
                patch.set_edgecolor(PROFESSIONAL_COLORS.get(model, '#000000'))
                patch.set_linewidth(2)
            
            for i, (data, model) in enumerate(zip(plot_data, valid_models)):
                y = data.values
                x = np.random.normal(i+1, 0.04, size=len(y))
                ax.scatter(x, y, alpha=0.4, s=40, color='black', 
                          edgecolors='gray', linewidth=0.5)
            
            ax.set_xticklabels(valid_models, fontsize=12, fontweight='bold')
            ax.set_ylabel('Value', fontsize=13, fontweight='bold')
            ax.set_title(f'{col}', fontsize=13, fontweight='bold', pad=15)
            
            ax.grid(axis='y', alpha=0.3, linestyle='--', linewidth=0.8)
            ax.set_axisbelow(True)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['left'].set_linewidth(1.8)
            ax.spines['bottom'].set_linewidth(1.8)
            
            if len(plot_data) > 0:
                textstr = format_statistics_box(valid_models, plot_data)
                ax.text(
                        1.02, 0.98, textstr, transform=ax.transAxes,
                        ha='left', va='top',
                        fontsize=9, verticalalignment='top', horizontalalignment='right',
                        bbox=dict(boxstyle='round,pad=0.8', facecolor='white', 
                                alpha=0.9, edgecolor='black', linewidth=1.2),
                        family='monospace', clip_on=False)
                
            plt.tight_layout()
            
            output_file = feature_folder / f'{feature_base}_analysis.png'
            plt.savefig(output_file, dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
            plt.close()
            
            print(f"  ✓ Saved: {output_file}")
        
        else:
            feature_folder = base_folder / feature_base
            feature_folder.mkdir(exist_ok=True)
            
            stat_order = ['min', 'mean', 'std', 'max']
            columns_sorted = sorted(columns_list,
                                   key=lambda x: next((i for i, stat in enumerate(stat_order) 
                                                      if stat == x[1]), 999))
            
            fig, axes = plt.subplots(2, 2, figsize=(18, 14))
            fig.patch.set_facecolor('white')
            axes = axes.flatten()
            
            models = sorted(features_df['model'].unique())
            
            for plot_idx, (col, stat) in enumerate(columns_sorted[:4]):
                ax = axes[plot_idx]
                
                plot_data = []
                valid_models = []
                
                for model in models:
                    data = features_df[features_df['model'] == model][col].dropna()
                    if len(data) > 0:
                        plot_data.append(data)
                        valid_models.append(model)
                
                if len(plot_data) == 0:
                    ax.text(0.5, 0.5, f'No data available\nfor {stat.upper()}',
                           ha='center', va='center', fontsize=12, color='red',
                           transform=ax.transAxes)
                    ax.set_title(f'{stat.upper()} - NO DATA', fontsize=13, 
                               fontweight='bold', color='red')
                    continue
                
                bp = ax.boxplot(plot_data,
                               labels=valid_models,
                               patch_artist=True,
                               widths=0.6,
                               showmeans=True,
                               meanline=False,
                               notch=False,
                               vert=True,
                               whis=1.5,
                               meanprops=dict(marker='D', markerfacecolor='red',
                                            markersize=7, markeredgecolor='darkred',
                                            markeredgewidth=1.5),
                               medianprops=dict(color='darkblue', linewidth=2),
                               whiskerprops=dict(linewidth=1.5, color='black'),
                               capprops=dict(linewidth=1.5, color='black'),
                               boxprops=dict(linewidth=1.5, color='black'))
                
                for patch, model in zip(bp['boxes'], valid_models):
                    patch.set_facecolor(BOX_FILL_COLORS.get(model, '#cccccc'))
                    patch.set_alpha(0.8)
                    patch.set_edgecolor(PROFESSIONAL_COLORS.get(model, '#000000'))
                    patch.set_linewidth(2)
                
                for i, (data, model) in enumerate(zip(plot_data, valid_models)):
                    y = data.values
                    x = np.random.normal(i+1, 0.04, size=len(y))
                    ax.scatter(x, y, alpha=0.35, s=35, color='black',
                              edgecolors='gray', linewidth=0.5)
                
                ax.set_xticklabels(valid_models, fontsize=11, fontweight='bold')
                ax.set_ylabel('Value', fontsize=12, fontweight='bold')
                
                stat_display = stat.upper() if stat != 'single' else feature_base
                ax.set_title(f'{stat_display}', fontsize=12, fontweight='bold', pad=12,
                            bbox=dict(boxstyle='round,pad=0.6', facecolor='#f0f0f0',
                                     alpha=0.9, edgecolor='#333333', linewidth=1.5))
                
                ax.grid(axis='y', alpha=0.3, linestyle='--', linewidth=0.8)
                ax.set_axisbelow(True)
                ax.spines['top'].set_visible(False)
                ax.spines['right'].set_visible(False)
                ax.spines['left'].set_linewidth(1.8)
                ax.spines['bottom'].set_linewidth(1.8)
                
                if len(plot_data) > 0:
                    textstr = format_statistics_box(valid_models, plot_data)
                    ax.text(0.98, 0.97, textstr, transform=ax.transAxes,
                           fontsize=9, verticalalignment='top', horizontalalignment='right',
                           bbox=dict(boxstyle='round,pad=0.8', facecolor='white', 
                                    alpha=0.9, edgecolor='black', linewidth=1.2),
                           family='monospace')
            
            fig.suptitle(f'Feature Analysis: {feature_base.replace("_", " ").title()} '
                        f'(REAL vs GENERATED Models)',
                        fontsize=18, fontweight='bold', y=0.98)
            
            plt.tight_layout()
            
            output_file = feature_folder / f'{feature_base}_boxplots.png'
            plt.savefig(output_file, dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
            plt.close()
            
            print(f"  ✓ Saved: {output_file}")
    
    print(f"\n{'='*80}")
    print(f"✅ Professional boxplot visualizations saved to: visualizations/")
    print(f"✅ Ready for academic thesis presentation!")
    print(f"{'='*80}\n")

In [87]:
viz2_real_vs_generated_boxplots(
    features_df, features_to_analyze
)


Creating PROFESSIONAL boxplot visualizations for academic thesis...

Found 32 feature groups

Processing feature: breath_count


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\breath_count\breath_count_analysis.png
Processing feature: duration


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\duration\duration_analysis.png
Processing feature: f0


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\f0\f0_boxplots.png
Processing feature: gne


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\gne\gne_analysis.png
Processing feature: hnr


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\hnr\hnr_analysis.png
Processing feature: intonation_pattern_pitch_variability


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\intonation_pattern_pitch_variability\intonation_pattern_pitch_variability_analysis.png
Processing feature: jitter_jitter


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\jitter_jitter\jitter_jitter_boxplots.png
Processing feature: jitter_jitter_local


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\jitter_jitter_local\jitter_jitter_local_analysis.png
Processing feature: jitter_jitter_mean_absolute_ms


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\jitter_jitter_mean_absolute_ms\jitter_jitter_mean_absolute_ms_analysis.png
Processing feature: jitter_jitter_ppq5


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\jitter_jitter_ppq5\jitter_jitter_ppq5_analysis.png
Processing feature: jitter_jitter_range


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\jitter_jitter_range\jitter_jitter_range_analysis.png
Processing feature: jitter_jitter_rap


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\jitter_jitter_rap\jitter_jitter_rap_analysis.png
Processing feature: rhythm_stats_avg_onset_strength


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\rhythm_stats_avg_onset_strength\rhythm_stats_avg_onset_strength_analysis.png
Processing feature: rhythm_stats_max_onset_strength


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\rhythm_stats_max_onset_strength\rhythm_stats_max_onset_strength_analysis.png
Processing feature: rhythm_stats_tempo_bpm


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\rhythm_stats_tempo_bpm\rhythm_stats_tempo_bpm_boxplots.png
Processing feature: rms_spec


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\rms_spec\rms_spec_boxplots.png
Processing feature: rms_wave


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\rms_wave\rms_wave_boxplots.png
Processing feature: shimmer_shimmer


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\shimmer_shimmer\shimmer_shimmer_boxplots.png
Processing feature: shimmer_shimmer_apq3


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\shimmer_shimmer_apq3\shimmer_shimmer_apq3_analysis.png
Processing feature: shimmer_shimmer_apq5


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\shimmer_shimmer_apq5\shimmer_shimmer_apq5_analysis.png
Processing feature: shimmer_shimmer_dB


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\shimmer_shimmer_dB\shimmer_shimmer_dB_analysis.png
Processing feature: shimmer_shimmer_local


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\shimmer_shimmer_local\shimmer_shimmer_local_analysis.png
Processing feature: shimmer_shimmer_range


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\shimmer_shimmer_range\shimmer_shimmer_range_analysis.png
Processing feature: spectral_bandwidth


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_bandwidth\spectral_bandwidth_boxplots.png
Processing feature: spectral_centroid


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_centroid\spectral_centroid_boxplots.png
Processing feature: spectral_contrast


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_contrast\spectral_contrast_boxplots.png
Processing feature: spectral_flatness


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_flatness\spectral_flatness_boxplots.png
Processing feature: spectral_rolloff_1


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_rolloff_1\spectral_rolloff_1_boxplots.png
Processing feature: spectral_rolloff_85


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_rolloff_85\spectral_rolloff_85_boxplots.png
Processing feature: spectral_rolloff_99


  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,
  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\spectral_rolloff_99\spectral_rolloff_99_boxplots.png
Processing feature: voice_breaks


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\voice_breaks\voice_breaks_analysis.png
Processing feature: zero_crossing_rate


  bp = ax.boxplot(plot_data,


  ✓ Saved: FakeRealMusicOriginal\Extended_full_track_features\mix2\visualizations_boxplot\zero_crossing_rate\zero_crossing_rate_analysis.png

✅ Professional boxplot visualizations saved to: visualizations/
✅ Ready for academic thesis presentation!



In [None]:
def viz_feature_ranges(features_df):
    import matplotlib.pyplot as plt
    from pathlib import Path
    from collections import defaultdict
    import numpy as np

    setup_professional_style()
    base_folder = Path(f'{base_output_folder}/visualizations_range')
    base_folder.mkdir(exist_ok=True)
    
    exclude_cols = {'model', 'track', 'source', 'data_type', 'segment_id'}
    all_cols = [col for col in features_df.columns if col not in exclude_cols]
    
    feature_groups = defaultdict(list)
    for col in all_cols:
        parts = col.split('_')
        if len(parts) > 1 and parts[-1] in ['min', 'mean', 'std', 'max']:
            base_name = '_'.join(parts[:-1])
            stat = parts[-1]
        else:
            base_name = col
            stat = 'single'
        feature_groups[base_name].append((col, stat))
    
    all_models = sorted(features_df['model'].unique())
    stat_order = ['min', 'mean', 'std', 'max']

    for feat_base, columns in sorted(feature_groups.items()):
        found_stats = {st: col for col, st in columns if st in stat_order}
        has_all_stats = set(['min', 'mean', 'std', 'max']).issubset(found_stats.keys())
        singles = [col for col, st in columns if st == 'single']

        feature_folder = base_folder / feat_base
        feature_folder.mkdir(exist_ok=True)
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.flatten()
        
        ylabels = []
        for m in all_models:
            model_data = features_df[features_df['model'] == m]
            if len(model_data) > 0:
                has_data = False
                for col, _ in columns:
                    if col in model_data.columns and model_data[col].notna().any():
                        has_data = True
                        break
                if has_data:
                    ylabels.append(m)
        
        if len(ylabels) == 0:
            print(f"  ⚠️ SKIPPED: Feature '{feat_base}' has no valid data for any model")
            continue
        
        colors = [PROFESSIONAL_COLORS.get(m, 'gray') for m in ylabels]

        if has_all_stats:
            ax = axes[0]
            for j, m in enumerate(ylabels):
                row = features_df[features_df['model'] == m]
                min_vals = row[found_stats['min']].dropna()
                max_vals = row[found_stats['max']].dropna()
                
                if len(min_vals) > 0 and len(max_vals) > 0:
                    mn = min_vals.mean()
                    mx = max_vals.mean()
                    ax.plot([mn, mx], [j, j], '-', color=colors[j], lw=6, solid_capstyle='round')
                    ax.scatter([mn, mx], [j, j], color=colors[j], s=80, edgecolor='black', zorder=3)
            
            ax.set_title('MIN - MAX', fontsize=13, fontweight='bold')
            ax.set_yticks(range(len(ylabels)))
            ax.set_yticklabels(ylabels, fontweight='bold')
            ax.set_xlabel('Value', fontsize=12)
            ax.grid(axis='x')
            
            ax = axes[1]
            for j, m in enumerate(ylabels):
                row = features_df[features_df['model'] == m]
                mean_vals = row[found_stats['mean']].dropna()
                std_vals = row[found_stats['std']].dropna()
                
                if len(mean_vals) > 0 and len(std_vals) > 0:
                    mean = mean_vals.mean()
                    std = std_vals.mean()
                    ax.errorbar(mean, j, xerr=std, fmt='o', color=colors[j], ecolor=colors[j], 
                               elinewidth=5, capsize=8, markersize=12)
            
            ax.set_title('MEAN ± STD', fontsize=13, fontweight='bold')
            ax.set_yticks(range(len(ylabels)))
            ax.set_yticklabels(ylabels, fontweight='bold')
            ax.set_xlabel('Value', fontsize=12)
            ax.grid(axis='x')
            
            ax = axes[2]
            for j, m in enumerate(ylabels):
                row = features_df[features_df['model'] == m]
                vals = row[found_stats['mean']].dropna()
                
                if len(vals) > 0:
                    q1 = np.percentile(vals, 25)
                    q3 = np.percentile(vals, 75)
                    ax.plot([q1, q3], [j, j], '-', color=colors[j], lw=8, solid_capstyle='round', alpha=0.85)
                    ax.scatter([q1, q3], [j, j], color=colors[j], s=80, edgecolor='black')
            
            ax.set_title('Q1 - Q3 (interquartile)', fontsize=13, fontweight='bold')
            ax.set_yticks(range(len(ylabels)))
            ax.set_yticklabels(ylabels, fontweight='bold')
            ax.set_xlabel('Value', fontsize=12)
            ax.grid(axis='x')
            
            ax = axes[3]
            for j, m in enumerate(ylabels):
                row = features_df[features_df['model'] == m]
                min_vals = row[found_stats['min']].dropna()
                max_vals = row[found_stats['max']].dropna()
                mean_vals = row[found_stats['mean']].dropna()
                
                if len(min_vals) > 0 and len(max_vals) > 0:
                    mn = min_vals.mean()
                    mx = max_vals.mean()
                    ax.plot([mn, mx], [j, j], '-', color=colors[j], lw=6, solid_capstyle='round')
                    
                    if len(mean_vals) > 0:
                        median = mean_vals.median()
                        ax.scatter([median], [j], color='black', s=70, zorder=5, 
                                  label='median' if j == 0 else None)
            
            ax.set_title('Range (min, median, max)', fontsize=13, fontweight='bold')
            ax.set_yticks(range(len(ylabels)))
            ax.set_yticklabels(ylabels, fontweight='bold')
            ax.set_xlabel('Value', fontsize=12)
            ax.grid(axis='x')
        
        elif singles and len(singles) == 1:
            ax = axes[0]
            for j, m in enumerate(ylabels):
                vals = features_df[features_df['model'] == m][singles[0]].dropna()
                if len(vals) == 0:
                    continue
                mn, mx = vals.min(), vals.max()
                ax.plot([mn, mx], [j, j], color=colors[j], lw=8)
                ax.scatter([mn, mx], [j, j], color=colors[j], s=90, edgecolor='black', zorder=3)
            
            ax.set_title('RANGE', fontsize=14, fontweight='bold')
            ax.set_yticks(range(len(ylabels)))
            ax.set_yticklabels(ylabels, fontweight='bold')
            ax.set_xlabel('Value', fontsize=13)
            ax.grid(axis='x')
            
            for pi in [1, 2, 3]:
                axes[pi].text(0.5, 0.5, 'Brak innych statystyk', va='center', ha='center', 
                             fontsize=13, fontweight='bold', alpha=0.7)
                axes[pi].spines['top'].set_visible(False)
                axes[pi].spines['right'].set_visible(False)
        
        elif ('mean' in found_stats) and (set(found_stats.keys()) <= {'mean'}):
            ax = axes[0]
            for j, m in enumerate(ylabels):
                vals = features_df[features_df['model'] == m][found_stats['mean']].dropna()
                if len(vals) == 0:
                    continue
                mn, mx = vals.min(), vals.max()
                ax.plot([mn, mx], [j, j], color=colors[j], lw=8)
                ax.scatter([mn, mx], [j, j], color=colors[j], s=90, edgecolor='black', zorder=3)
            
            ax.set_title('RANGE (mean values)', fontsize=14, fontweight='bold')
            ax.set_yticks(range(len(ylabels)))
            ax.set_yticklabels(ylabels, fontweight='bold')
            ax.set_xlabel('Value', fontsize=13)
            ax.grid(axis='x')
            
            for pi in [1, 2, 3]:
                axes[pi].text(0.5, 0.5, 'Brak innych statystyk', va='center', ha='center', 
                             fontsize=13, fontweight='bold', alpha=0.7)
                axes[pi].spines['top'].set_visible(False)
                axes[pi].spines['right'].set_visible(False)
        
        else:
            for pi in range(4):
                axes[pi].text(0.5, 0.5, 'Brak wystarczających statystyk', va='center', ha='center', 
                             fontsize=13, fontweight='bold', alpha=0.7)
                axes[pi].spines['top'].set_visible(False)
                axes[pi].spines['right'].set_visible(False)
        
        for ax in axes:
            ax.set_axisbelow(True)
        
        fig.suptitle(f"{feat_base.replace('_', ' ').title()} – Distribution Ranges", 
                    fontsize=16, fontweight='bold')
        plt.tight_layout(rect=[0, 0.03, 1, 0.97])
        plt.savefig(feature_folder / f"{feat_base}_range.png", dpi=300, bbox_inches='tight', facecolor='white')
        plt.close()
        print(f"  ✓ {feat_base}: range visualization OK")

    print(f"\n✅ Wszystkie wykresy adaptive grid wygenerowane w visualizations_range/\n")

In [89]:
viz_feature_ranges(
    features_df,
)

  ✓ breath_count: range visualization OK
  ✓ duration: range visualization OK
  ✓ f0: range visualization OK
  ✓ gne: range visualization OK
  ✓ hnr: range visualization OK
  ✓ intonation_pattern_pitch_variability: range visualization OK
  ✓ jitter_jitter: range visualization OK
  ✓ jitter_jitter_local: range visualization OK
  ✓ jitter_jitter_mean_absolute_ms: range visualization OK
  ✓ jitter_jitter_ppq5: range visualization OK
  ✓ jitter_jitter_range: range visualization OK
  ✓ jitter_jitter_rap: range visualization OK
  ✓ rhythm_stats_avg_onset_strength: range visualization OK
  ✓ rhythm_stats_max_onset_strength: range visualization OK
  ✓ rhythm_stats_tempo_bpm: range visualization OK
  ✓ rms_spec: range visualization OK
  ✓ rms_wave: range visualization OK
  ✓ shimmer_shimmer: range visualization OK
  ✓ shimmer_shimmer_apq3: range visualization OK
  ✓ shimmer_shimmer_apq5: range visualization OK
  ✓ shimmer_shimmer_dB: range visualization OK
  ✓ shimmer_shimmer_local: range visua

----