# Misclassification Analysis: Genre Boundary Ambiguity Quantification

This notebook analyzes model errors to understand genre boundary ambiguity and identify which genres are most frequently confused. This provides insights into the fundamental challenges of music genre classification.

In [None]:
import os
import sys
import warnings

sys.path.insert(0, '../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Silence library warnings so the notebook output stays readable.
# NOTE: this is a blanket filter and also hides deprecation warnings;
# comment it out while debugging.
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

# Every figure in this notebook is written to ../outputs; create the directory
# up front so plt.savefig() cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs('../outputs', exist_ok=True)

print("Misclassification Analysis Framework")
print("="*70)

## 1. Load and Prepare Data

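Generate a synthetic dataset, train a random-forest classifier on it, and produce predictions for analysis. Because both the features and the labels are drawn at random, the numbers below illustrate the analysis workflow rather than real genre behaviour.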

In [None]:
# Create synthetic dataset for demonstration
np.random.seed(42)
n_samples = 2000
genres = ['rock', 'electronic', 'hip-hop', 'classical', 'jazz', 'folk', 'pop', 'experimental']
# Derive the class count from the label list so the two can never drift apart
n_genres = len(genres)

# Generate features (pure noise) and uniformly random integer labels
X = np.random.randn(n_samples, 20)
y = np.random.randint(0, n_genres, n_samples)

# Train-test split. This happens BEFORE scaling so that no statistics from the
# held-out samples leak into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Standardize: fit on the training split only, then apply the same transform
# to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full dataset expressed in training-set scale; kept so the name `X_scaled`
# that this cell has always defined remains available.
X_scaled = scaler.transform(X)

# Train model
model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Generate predictions (hard labels and per-class probabilities)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print(f"Test set size: {len(y_test)}")
print(f"Accuracy: {(y_pred == y_test).mean():.4f}")
print(f"Misclassified samples: {(y_pred != y_test).sum()}")

## 2. Confusion Matrix Analysis

Identify which genres are most frequently confused.

In [None]:
# Generate confusion matrix. Passing the full label set keeps the matrix
# n_genres x n_genres (and therefore aligned with `genres`) even if a class is
# absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(n_genres))

# Normalize by true label (recall perspective). Rows without any true samples
# stay at 0 instead of turning into NaN through a 0/0 division.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
)

# Visualize: cell colour encodes the row-normalized proportion, the annotation
# shows the raw count.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm_normalized, annot=cm, fmt='d', cmap='Blues',
            xticklabels=genres, yticklabels=genres,
            cbar_kws={'label': 'Proportion'}, linewidths=0.5, ax=ax)
ax.set_title('Confusion Matrix: Genre Misclassification Patterns', fontweight='bold', fontsize=14)
ax.set_ylabel('True Genre')
ax.set_xlabel('Predicted Genre')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
fig.savefig('../outputs/misclassification_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion Matrix (normalized by true label):")
print(cm_normalized.round(3))

## 3. Identify Common Confusions

Find the most frequent genre confusions.

In [None]:
# Extract off-diagonal elements (confusions only). np.nonzero walks the mask in
# row-major order, i.e. the same (true, predicted) order as a nested i/j loop.
is_confusion = ~np.eye(cm.shape[0], dtype=bool) & (cm_normalized > 0)
confusions = [
    {
        'True Genre': genres[i],
        'Predicted Genre': genres[j],
        'Count': cm[i, j],
        'Rate': cm_normalized[i, j],
        'Direction': f"{genres[i]} → {genres[j]}",
    }
    for i, j in zip(*np.nonzero(is_confusion))
]

# Explicit columns keep the frame well-formed (and sortable) even when the
# model made no mistakes and `confusions` is empty.
confusion_columns = ['True Genre', 'Predicted Genre', 'Count', 'Rate', 'Direction']
confusions_df = (
    pd.DataFrame(confusions, columns=confusion_columns)
    .sort_values('Count', ascending=False)
)

print("\n" + "="*70)
print("TOP 10 GENRE CONFUSIONS")
print("="*70)
print(confusions_df.head(10)[['True Genre', 'Predicted Genre', 'Count', 'Rate']].to_string(index=False))

# Visualize top confusions
top_confusions = confusions_df.head(10)
if top_confusions.empty:
    # Nothing to draw: a bar chart of zero bars would only produce an empty file.
    print("No misclassifications to plot.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    y_pos = np.arange(len(top_confusions))

    bars = ax.barh(y_pos, top_confusions['Count'], color='steelblue', alpha=0.8)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(top_confusions['Direction'])
    ax.set_xlabel('Number of Misclassifications')
    ax.set_title('Top 10 Genre Confusions', fontweight='bold', fontsize=14)
    # Rows are sorted descending; invert so the largest confusion is on top
    ax.invert_yaxis()

    # Add value labels on bars
    for bar in bars:
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height()/2,
                f'{int(width)}', ha='left', va='center', fontweight='bold')

    plt.tight_layout()
    plt.savefig('../outputs/top_genre_confusions.png', dpi=300, bbox_inches='tight')
    plt.show()

## 4. Confidence vs. Correctness Analysis

Analyze the relationship between prediction confidence and accuracy.

In [None]:
# Confidence of a prediction = highest class probability assigned by the model
max_probs = np.max(y_pred_proba, axis=1)
correct = (y_pred == y_test).astype(int)

# Create DataFrame for analysis
confidence_df = pd.DataFrame({
    'confidence': max_probs,
    'correct': correct,
    'true_label': y_test,
    'pred_label': y_pred
})

# Split the confidences once instead of re-filtering the frame for every statistic
correct_conf = confidence_df.loc[confidence_df['correct'] == 1, 'confidence']
incorrect_conf = confidence_df.loc[confidence_df['correct'] == 0, 'confidence']

# Analyze confidence for correct vs. incorrect predictions
print("\n" + "="*70)
print("CONFIDENCE ANALYSIS")
print("="*70)
print("\nCorrect predictions:")
print(f"  Mean confidence: {correct_conf.mean():.4f}")
print(f"  Std confidence:  {correct_conf.std():.4f}")

print("\nIncorrect predictions:")
print(f"  Mean confidence: {incorrect_conf.mean():.4f}")
print(f"  Std confidence:  {incorrect_conf.std():.4f}")

# Binned analysis: accuracy within each 0.1-wide confidence bin
confidence_bins = np.linspace(0, 1, 11)
binned_accuracy = []
bin_centers = []

n_bins = len(confidence_bins) - 1
for i in range(n_bins):
    lower, upper = confidence_bins[i], confidence_bins[i + 1]
    if i == n_bins - 1:
        # The last bin is closed on the right; otherwise predictions with a
        # confidence of exactly 1.0 would fall outside every bin.
        bin_mask = (max_probs >= lower) & (max_probs <= upper)
    else:
        bin_mask = (max_probs >= lower) & (max_probs < upper)
    if bin_mask.sum() > 0:
        binned_accuracy.append(correct[bin_mask].mean())
        bin_centers.append((lower + upper) / 2)

# Plot confidence vs. accuracy
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot. Tick labels are set on the axis rather than through boxplot's
# `labels=` keyword, which newer matplotlib releases deprecate.
axes[0].boxplot([correct_conf, incorrect_conf])
axes[0].set_xticklabels(['Correct', 'Incorrect'])
axes[0].set_ylabel('Prediction Confidence')
axes[0].set_title('Confidence Distribution by Correctness')
axes[0].grid(True, alpha=0.3, axis='y')

# Binned accuracy
axes[1].plot(bin_centers, binned_accuracy, 'o-', linewidth=2, markersize=8, color='steelblue')
axes[1].axhline(y=1.0, color='green', linestyle='--', alpha=0.5, label='Perfect Accuracy')
axes[1].set_xlabel('Prediction Confidence')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Accuracy vs. Model Confidence')
axes[1].set_ylim([0, 1.05])
axes[1].grid(True, alpha=0.3)
axes[1].legend()

plt.tight_layout()
plt.savefig('../outputs/confidence_vs_accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Per-Genre Error Analysis

Analyze error patterns for each genre.

In [None]:
# Build one summary record per genre from the test-set predictions
per_genre_stats = []

for genre_idx in range(n_genres):
    genre_mask = y_test == genre_idx
    preds_for_genre = y_pred[genre_mask]

    # Every true label inside the mask equals genre_idx, so comparing against
    # the scalar is the same as comparing against y_test[genre_mask].
    accuracy = (preds_for_genre == genre_idx).mean()
    misclassification_rate = 1 - accuracy

    # Most frequent wrong prediction for this genre, if it has any errors
    wrong_preds = preds_for_genre[preds_for_genre != genre_idx]
    top_misclass_name = 'N/A'
    if len(wrong_preds) > 0:
        top_misclass = pd.Series(wrong_preds).value_counts().idxmax()
        top_misclass_name = genres[top_misclass]

    # Per-class accuracy over true members of a class is that class's recall,
    # hence the same value appears under both keys.
    record = {
        'Genre': genres[genre_idx],
        'Samples': genre_mask.sum(),
        'Accuracy': accuracy,
        'Misclassification Rate': misclassification_rate,
        'Most Confused With': top_misclass_name,
        'Recall': accuracy
    }
    per_genre_stats.append(record)

per_genre_df = pd.DataFrame(per_genre_stats).sort_values('Accuracy')

separator = "="*70
print("\n" + separator)
print("PER-GENRE ERROR ANALYSIS")
print(separator)
print(per_genre_df.to_string(index=False))

# Visualize per-genre accuracy with traffic-light colours:
# above 0.8 green, above 0.7 orange, everything else red
colors = []
for acc in per_genre_df['Accuracy']:
    if acc > 0.8:
        colors.append('green')
    elif acc > 0.7:
        colors.append('orange')
    else:
        colors.append('red')

fig, ax = plt.subplots(figsize=(12, 6))
y_pos = np.arange(len(per_genre_df))
bars = ax.barh(y_pos, per_genre_df['Accuracy'], color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)

ax.set_yticks(y_pos)
ax.set_yticklabels(per_genre_df['Genre'])
ax.set_xlabel('Recall / Accuracy')
ax.set_title('Per-Genre Classification Accuracy', fontweight='bold', fontsize=14)
ax.set_xlim([0, 1.0])
ax.axvline(x=0.8, color='green', linestyle='--', alpha=0.5, linewidth=2, label='80% threshold')
ax.legend()

# Annotate each bar with its value as a percentage
for bar in bars:
    bar_length = bar.get_width()
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(bar_length + 0.02, label_y,
            f'{bar_length:.1%}', ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/per_genre_accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Genre Similarity Analysis

Quantify similarity between genres based on confusion patterns.

In [None]:
# Row-normalize the confusion counts: entry (i, j) becomes the share of true
# genre i that was predicted as genre j.
cm_prob = cm / cm.sum(axis=1, keepdims=True)

# Genre similarity = mean of the two directed confusion rates, which makes the
# matrix symmetric. The diagonal (correct predictions) is zeroed out because a
# genre is not "confused" with itself.
genre_similarity = (cm_prob + cm_prob.T) / 2
np.fill_diagonal(genre_similarity, 0.0)

# Visualize genre similarity
plt.figure(figsize=(10, 8))
sns.heatmap(
    genre_similarity,
    annot=np.round(genre_similarity, 3),
    fmt='g',
    cmap='YlOrRd',
    xticklabels=genres,
    yticklabels=genres,
    cbar_kws={'label': 'Confusion Rate'},
)
plt.title('Genre Similarity Matrix: Misclassification Co-occurrence', fontweight='bold', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('../outputs/genre_similarity_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# One record per unordered genre pair, taken from the strict upper triangle
# (row-major order, so pairs appear as (0,1), (0,2), ..., (1,2), ...)
pair_rows, pair_cols = np.triu_indices(n_genres, k=1)
similarities = [
    {
        'Genre 1': genres[i],
        'Genre 2': genres[j],
        'Similarity': genre_similarity[i, j]
    }
    for i, j in zip(pair_rows, pair_cols)
]

similarities_df = pd.DataFrame(similarities).sort_values('Similarity', ascending=False)

rule = "="*70
print("\n" + rule)
print("MOST SIMILAR GENRE PAIRS (Likely to be confused)")
print(rule)
print(similarities_df.head(10).to_string(index=False))

## 7. Feature Importance for Error Patterns

Analyze which features contribute to misclassifications.

In [None]:
# Get feature importances from the fitted model
feature_importances = model.feature_importances_
feature_names = [f'Feature_{i}' for i in range(len(feature_importances))]

# Feature indices ordered from most to least important
importance_order = np.argsort(feature_importances)[::-1]

# Separate misclassified vs. correctly classified
misclassified_mask = y_pred != y_test

# Analyze feature distributions for misclassified samples
print("\n" + "="*70)
print("FEATURE ANALYSIS: MISCLASSIFIED vs. CORRECT")
print("="*70)

# Compare the five MOST IMPORTANT features (not simply the first five columns)
comparison_data = []
for i in importance_order[:5]:
    correct_mean = X_test[~misclassified_mask, i].mean()
    misclass_mean = X_test[misclassified_mask, i].mean()

    comparison_data.append({
        'Feature': feature_names[i],
        'Importance': feature_importances[i],
        'Correct Mean': correct_mean,
        'Misclass Mean': misclass_mean,
        'Difference': abs(correct_mean - misclass_mean)
    })

comparison_columns = ['Feature', 'Importance', 'Correct Mean', 'Misclass Mean', 'Difference']
comparison_df = (
    pd.DataFrame(comparison_data, columns=comparison_columns)
    .sort_values('Importance', ascending=False)
)
print(comparison_df.to_string(index=False))

# Visualize feature importance
fig, ax = plt.subplots(figsize=(10, 6))
top_n = 10
# Descending order combined with invert_yaxis() below puts the most important
# feature at the top of the chart.
top_features_idx = importance_order[:top_n]
top_importances = feature_importances[top_features_idx]
# Reuse feature_names so the chart and the table label features identically
top_feature_names = [feature_names[i] for i in top_features_idx]

bars = ax.barh(range(len(top_importances)), top_importances, color='steelblue', alpha=0.8)
ax.set_yticks(range(len(top_importances)))
ax.set_yticklabels(top_feature_names)
ax.set_xlabel('Feature Importance')
ax.set_title(f'Top {top_n} Important Features', fontweight='bold', fontsize=14)
ax.invert_yaxis()

plt.tight_layout()
plt.savefig('../outputs/feature_importance_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Summary and Insights

Document key findings about genre boundary ambiguity.

In [None]:
# Final report: restates the headline numbers computed in the cells above
print("\n" + "="*70)
print("MISCLASSIFICATION ANALYSIS SUMMARY")
print("="*70)

misclass_rate = (y_pred != y_test).mean()
print(f"\nOverall misclassification rate: {misclass_rate:.2%}")

print("\nGenre pairs most likely to be confused:")
for _, row in similarities_df.head(5).iterrows():
    # The similarity score is symmetric, hence the two-way arrow between the names
    print(f"  • {row['Genre 1']} ↔ {row['Genre 2']}: {row['Similarity']:.2%} confusion rate")

print("\nGenres with lowest recall (most error-prone):")
worst_genres = per_genre_df.nsmallest(3, 'Accuracy')
for _, row in worst_genres.iterrows():
    print(f"  • {row['Genre']:12s}: {row['Accuracy']:.1%} accuracy, often confused with {row['Most Confused With']}")

print("\nGenres with highest recall (most distinctive):")
best_genres = per_genre_df.nlargest(3, 'Accuracy')
for _, row in best_genres.iterrows():
    print(f"  • {row['Genre']:12s}: {row['Accuracy']:.1%} accuracy")

# NOTE(review): the insights below are fixed text, not values derived from the
# run; re-check them against the numbers above before reporting results.
print("\nKey Insights:")
print("  Genre boundary ambiguity is quantifiable through confusion matrix analysis")
print("  High model confidence doesn't always guarantee correctness")
print("  Certain genre pairs show systematic confusion patterns")
print("  Feature overlap explains observable misclassification patterns")
print("  Error analysis provides actionable insights for model improvement")