## 1. Import Required Libraries

In [None]:
# Core libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Audio processing
import librosa
import librosa.display
import soundfile as sf
from IPython.display import Audio, display

# Configuration
import yaml

# Apply one consistent look to every figure produced in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirm the imports worked and report the versions of the key dependencies
print("‚úÖ All libraries imported successfully!")
for lib_name, lib_module in (("Librosa", librosa), ("NumPy", np), ("Pandas", pd)):
    print(f"{lib_name} version: {lib_module.__version__}")

## 2. Set Up Paths and Configuration

In [None]:
# Every project location is expressed relative to the notebook's parent folder
PROJECT_ROOT = Path('..')
CONFIG_DIR = PROJECT_ROOT / 'configs'
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'

# Make sure the raw (one folder per class) and processed folders are present
for required_dir in (
    RAW_DATA_DIR,
    RAW_DATA_DIR / 'real',
    RAW_DATA_DIR / 'synthetic',
    PROCESSED_DATA_DIR,
):
    required_dir.mkdir(parents=True, exist_ok=True)

# Report where everything lives
for caption, location in (
    ("Project Root", PROJECT_ROOT),
    ("Raw Data Directory", RAW_DATA_DIR),
    ("Processed Data Directory", PROCESSED_DATA_DIR),
):
    print(f"üìÅ {caption}: {location.absolute()}")
print("\n‚úÖ Directory structure verified!")

## 3. Load Configuration

In [None]:
# Load configuration file
config_path = CONFIG_DIR / 'config.yaml'

# Fallback audio parameters. They are used when the file is absent and also
# fill in any key the file omits, so later cells can always index
# config['data'][...] without a KeyError.
DEFAULT_DATA_CONFIG = {
    'sample_rate': 22050,
    'duration': 5,
    'n_fft': 2048,
    'n_mels': 128,
    'n_mfcc': 13,
}

if config_path.exists():
    with open(config_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as "no settings"
        config = yaml.safe_load(f) or {}
    # Values from the file win; defaults cover whatever is missing
    config['data'] = {**DEFAULT_DATA_CONFIG, **(config.get('data') or {})}
    print("‚úÖ Configuration loaded successfully!\n")
    print("üìã Key Configuration Parameters:")
    print(f"  - Sample Rate: {config['data']['sample_rate']} Hz")
    print(f"  - Audio Duration: {config['data']['duration']} seconds")
    print(f"  - N_FFT: {config['data']['n_fft']}")
    print(f"  - N_Mels: {config['data']['n_mels']}")
    print(f"  - N_MFCC: {config['data']['n_mfcc']}")
else:
    print("‚ö†Ô∏è  Configuration file not found. Using default values.")
    config = {'data': dict(DEFAULT_DATA_CONFIG)}

## 4. Check Dataset Availability

Let's check if we have audio files in our data directories.

In [None]:
# Supported audio formats
AUDIO_EXTENSIONS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']

def count_audio_files(directory):
    """Count the audio files found anywhere below a directory.

    Args:
        directory: ``pathlib.Path`` of the folder to scan.

    Returns:
        Number of regular files under ``directory`` (including nested
        folders) whose suffix, compared case-insensitively, is listed in
        ``AUDIO_EXTENSIONS``. Returns 0 when the directory does not exist.
    """
    if not directory.exists():
        return 0
    supported = set(AUDIO_EXTENSIONS)
    # A single recursive walk visits each file exactly once. Combining a
    # '*ext' glob with a '**/*ext' glob would count every top-level file twice,
    # because '**' also matches the directory itself.
    return sum(
        1
        for path in directory.rglob('*')
        if path.is_file() and path.suffix.lower() in supported
    )

# Locate the per-class folders and tally the audio files in each
real_audio_dir = RAW_DATA_DIR / 'real'
synthetic_audio_dir = RAW_DATA_DIR / 'synthetic'

num_real = count_audio_files(real_audio_dir)
num_synthetic = count_audio_files(synthetic_audio_dir)

for report_line in (
    "üéµ Dataset Statistics:",
    f"  - Real Music Samples: {num_real}",
    f"  - Synthetic Music Samples: {num_synthetic}",
    f"  - Total Samples: {num_real + num_synthetic}\n",
):
    print(report_line)

if num_real != 0 or num_synthetic != 0:
    print("‚úÖ Dataset found! Ready for exploration.")
else:
    # Nothing on disk yet: explain where the data is expected to go
    for help_line in (
        "‚ö†Ô∏è  No audio files found!",
        "\nüìù Instructions to Add Data:",
        "1. Place real music files in: data/raw/real/",
        "2. Place synthetic music files in: data/raw/synthetic/",
        "3. Supported formats: WAV, MP3, FLAC, OGG, M4A",
        "\nüí° For testing purposes, we'll generate sample data below.",
    ):
        print(help_line)

## 5. Generate Sample Data (For Testing)

If you don't have real data yet, let's generate some synthetic audio samples for testing the pipeline.

In [None]:
def generate_sample_audio(duration=5, sr=22050, frequency=440, noise_level=0.1):
    """Generate a peak-normalized test tone with harmonics and optional noise.

    Args:
        duration: Length of the signal in seconds.
        sr: Sample rate in Hz.
        frequency: Fundamental frequency in Hz.
        noise_level: Scale of the Gaussian noise added to the tone; pass 0
            for a clean, fully deterministic signal.

    Returns:
        1-D NumPy array of ``int(sr * duration)`` samples. Non-silent signals
        are scaled so the largest absolute value is 1; an empty or all-zero
        signal is returned unscaled.
    """
    n_samples = int(sr * duration)
    # Sample instants spaced exactly 1/sr apart. A linspace that includes the
    # endpoint stretches the spacing to duration/(n-1) and detunes the tone.
    t = np.arange(n_samples) / sr

    # Fundamental plus two weaker harmonics for a slightly richer sound
    audio = np.zeros(n_samples)
    for harmonic, weight in ((1, 1.0), (2, 0.5), (3, 0.3)):
        audio += weight * np.sin(2 * np.pi * frequency * harmonic * t)

    if noise_level:
        audio += noise_level * np.random.randn(len(audio))

    # Normalize, guarding against empty and silent signals
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak
    return audio

# Generate sample files if no data exists
if num_real == 0 and num_synthetic == 0:
    print("üéº Generating sample audio files for testing...\n")

    sr = config['data']['sample_rate']
    duration = config['data']['duration']

    # Generate "real" samples (different frequencies)
    for i, freq in enumerate([440, 523, 659, 784, 880], 1):
        audio = generate_sample_audio(duration, sr, freq)
        filepath = real_audio_dir / f'real_sample_{i}.wav'
        sf.write(filepath, audio, sr)
        print(f"  ‚úì Created: {filepath.name}")

    # Generate "synthetic" samples (slightly different characteristics)
    for i, freq in enumerate([450, 533, 669, 794, 890], 1):
        audio = generate_sample_audio(duration, sr, freq)
        # Slow 2 Hz amplitude modulation to make these samples distinguishable
        audio = audio * (1 + 0.1 * np.sin(2 * np.pi * 2 * np.arange(len(audio)) / sr))
        # The modulation can lift peaks above 1.0; rescale so the samples are
        # not clipped when written to an integer PCM WAV file.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak
        filepath = synthetic_audio_dir / f'synthetic_sample_{i}.wav'
        sf.write(filepath, audio, sr)
        print(f"  ‚úì Created: {filepath.name}")

    print("\n‚úÖ Sample data generated successfully!")
    print("   Note: These are simple test samples. Replace with real datasets for actual training.")

    # Update counts
    num_real = count_audio_files(real_audio_dir)
    num_synthetic = count_audio_files(synthetic_audio_dir)
else:
    print("‚úÖ Using existing dataset.")

## 6. Load and Analyze Audio Files

In [None]:
def get_audio_files(directory):
    """List every audio file found anywhere below a directory.

    Args:
        directory: ``pathlib.Path`` of the folder to scan.

    Returns:
        Sorted list of paths to regular files under ``directory`` (including
        nested folders) whose suffix, compared case-insensitively, is listed
        in ``AUDIO_EXTENSIONS``. Each file appears exactly once.
    """
    supported = set(AUDIO_EXTENSIONS)
    # One recursive walk; combining a '*ext' glob with a '**/*ext' glob would
    # return every top-level file twice because '**' also matches the
    # directory itself.
    return sorted(
        path
        for path in directory.rglob('*')
        if path.is_file() and path.suffix.lower() in supported
    )

# Gather the sorted file listing for each class
real_files = get_audio_files(real_audio_dir)
synthetic_files = get_audio_files(synthetic_audio_dir)

print(f"üìä Found {len(real_files)} real audio files")
print(f"üìä Found {len(synthetic_files)} synthetic audio files\n")

# Preview up to five file names per class, skipping classes with no files
for heading, listing in (
    ("Real audio files (first 5):", real_files),
    ("\nSynthetic audio files (first 5):", synthetic_files),
):
    if not listing:
        continue
    print(heading)
    for audio_path in listing[:5]:
        print(f"  - {audio_path.name}")

## 7. Extract Audio Metadata

In [None]:
def get_audio_metadata(filepath):
    """Extract basic metadata from one audio file.

    Args:
        filepath: ``pathlib.Path`` of the audio file.

    Returns:
        Dict with keys ``filename``, ``duration`` (seconds), ``sample_rate``,
        ``samples``, ``channels`` and ``label``, or ``None`` if the file could
        not be loaded (the error is printed).
    """
    try:
        # Load audio at its native sample rate
        y, sr = librosa.load(filepath, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)

        # Take the label from the nearest enclosing folder named exactly
        # 'real' or 'synthetic'. A substring test on the whole path would
        # mislabel files under e.g. 'synthetic/surreal/' or below any
        # ancestor directory whose name merely contains "real".
        label = next(
            (
                part
                for part in reversed(filepath.parent.parts)
                if part in ('real', 'synthetic')
            ),
            'synthetic',
        )

        return {
            'filename': filepath.name,
            'duration': duration,
            'sample_rate': sr,
            'samples': len(y),
            'channels': 1,  # librosa loads as mono by default
            'label': label
        }
    except Exception as e:
        # Best effort: report the unreadable file and let the caller skip it
        print(f"Error loading {filepath.name}: {e}")
        return None

# Collect metadata for all files
print("üìä Extracting metadata from audio files...\n")

# Column layout of the metadata table. Passing it explicitly keeps the columns
# present even when no file could be read, so later cells that access
# df['label'] etc. do not fail with a KeyError on an empty table.
METADATA_COLUMNS = ['filename', 'duration', 'sample_rate', 'samples', 'channels', 'label']

# Files that failed to load come back as None and are skipped
metadata_list = [
    metadata
    for metadata in map(get_audio_metadata, real_files + synthetic_files)
    if metadata
]

# Create DataFrame
df = pd.DataFrame(metadata_list, columns=METADATA_COLUMNS)

print(f"‚úÖ Extracted metadata from {len(df)} audio files\n")
print("First few entries:")
display(df.head())

## 8. Dataset Statistics

In [None]:
print("üìà Dataset Statistics:\n")

# Per-class summary: file count, duration spread, most common sample rate,
# and sample-count range
per_label_summary = df.groupby('label').agg({
    'filename': 'count',
    'duration': ['mean', 'min', 'max', 'std'],
    'sample_rate': lambda rates: rates.mode()[0] if len(rates) > 0 else None,
    'samples': ['mean', 'min', 'max'],
})
print(per_label_summary.round(2))

print("\nüìä Label Distribution:")
label_counts = df['label'].value_counts()
print(label_counts)

## 9. Visualize Dataset Distribution

In [None]:
# Create visualizations: a 2x2 grid summarising the dataset
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Label distribution
# Fix the bar order to (real, synthetic) so each class keeps the same colour
# regardless of which class has more files (value_counts sorts by count).
class_order = ['real', 'synthetic']
label_counts = df['label'].value_counts().reindex(class_order, fill_value=0)
label_counts.plot(kind='bar', ax=axes[0, 0], color=['#3498db', '#e74c3c'])
axes[0, 0].set_title('Distribution of Real vs Synthetic Music', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Label')
axes[0, 0].set_ylabel('Count')
axes[0, 0].tick_params(axis='x', rotation=0)

# 2. Duration distribution
df.boxplot(column='duration', by='label', ax=axes[0, 1])
# boxplot(by=...) adds a figure-level "Boxplot grouped by label" title that
# would sit above all four panels; clear it.
fig.suptitle('')
axes[0, 1].set_title('Audio Duration Distribution by Label', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Label')
axes[0, 1].set_ylabel('Duration (seconds)')
axes[0, 1].tick_params(axis='x', rotation=0)

# 3. Sample rate distribution
df.groupby('label')['sample_rate'].value_counts().unstack().plot(kind='bar', ax=axes[1, 0])
axes[1, 0].set_title('Sample Rate Distribution', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Label')
axes[1, 0].set_ylabel('Count')
axes[1, 0].legend(title='Sample Rate (Hz)')
axes[1, 0].tick_params(axis='x', rotation=0)

# 4. Duration histogram (one semi-transparent histogram per class)
for label in df['label'].unique():
    data = df[df['label'] == label]['duration']
    axes[1, 1].hist(data, alpha=0.6, label=label, bins=15)
axes[1, 1].set_title('Duration Distribution Histogram', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Duration (seconds)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

print("‚úÖ Dataset visualizations created!")

## 10. Load and Visualize Sample Audio

Let's load a sample from each class and visualize their waveforms.

In [None]:
# Pick the first file of each class and load both at the configured sample rate
if not (real_files and synthetic_files):
    print("‚ö†Ô∏è  No audio files available for visualization.")
else:
    sample_real, sample_synthetic = real_files[0], synthetic_files[0]
    target_sr = config['data']['sample_rate']

    # librosa resamples to target_sr while loading
    y_real, sr_real = librosa.load(sample_real, sr=target_sr)
    y_synthetic, sr_synthetic = librosa.load(sample_synthetic, sr=target_sr)

    real_seconds = len(y_real) / sr_real
    synthetic_seconds = len(y_synthetic) / sr_synthetic

    print(f"üìª Loaded Samples:")
    print(f"  Real: {sample_real.name} (Duration: {real_seconds:.2f}s)")
    print(f"  Synthetic: {sample_synthetic.name} (Duration: {synthetic_seconds:.2f}s)")

## 11. Waveform Visualization

In [None]:
if real_files and synthetic_files:
    fig, axes = plt.subplots(2, 1, figsize=(15, 8))

    # One stacked panel per class: (signal, sample rate, line colour, title)
    waveform_panels = (
        (y_real, sr_real, '#3498db', 'Real Music - Waveform'),
        (y_synthetic, sr_synthetic, '#e74c3c', 'Synthetic Music - Waveform'),
    )
    for panel, (signal, rate, colour, title) in zip(axes, waveform_panels):
        librosa.display.waveshow(signal, sr=rate, ax=panel, color=colour)
        panel.set_title(title, fontsize=14, fontweight='bold')
        panel.set_xlabel('Time (s)')
        panel.set_ylabel('Amplitude')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("‚úÖ Waveform visualizations created!")

## 12. Play Audio Samples

Listen to the audio samples directly in the notebook.

In [None]:
if real_files and synthetic_files:
    # Render an inline audio player for each loaded sample
    for heading, signal, rate in (
        ("üéµ Real Music Sample:", y_real, sr_real),
        ("\nüéµ Synthetic Music Sample:", y_synthetic, sr_synthetic),
    ):
        print(heading)
        display(Audio(signal, rate=rate))

## 13. Spectral Analysis

Let's analyze the frequency content of the audio samples.

In [None]:
if real_files and synthetic_files:
    # Magnitude spectrograms in dB, each relative to its own peak
    stft_mag_real = np.abs(librosa.stft(y_real))
    stft_mag_synthetic = np.abs(librosa.stft(y_synthetic))
    D_real = librosa.amplitude_to_db(stft_mag_real, ref=np.max)
    D_synthetic = librosa.amplitude_to_db(stft_mag_synthetic, ref=np.max)

    fig, axes = plt.subplots(2, 1, figsize=(15, 10))
    top_panel, bottom_panel = axes[0], axes[1]

    # Options shared by both panels
    spec_options = dict(x_axis='time', y_axis='hz', cmap='viridis')
    title_options = dict(fontsize=14, fontweight='bold')
    colorbar_format = '%+2.0f dB'

    # Top panel: real audio
    img1 = librosa.display.specshow(D_real, sr=sr_real, ax=top_panel, **spec_options)
    top_panel.set_title('Real Music - Spectrogram', **title_options)
    top_panel.set_ylabel('Frequency (Hz)')
    fig.colorbar(img1, ax=top_panel, format=colorbar_format)

    # Bottom panel: synthetic audio
    img2 = librosa.display.specshow(D_synthetic, sr=sr_synthetic, ax=bottom_panel, **spec_options)
    bottom_panel.set_title('Synthetic Music - Spectrogram', **title_options)
    bottom_panel.set_xlabel('Time (s)')
    bottom_panel.set_ylabel('Frequency (Hz)')
    fig.colorbar(img2, ax=bottom_panel, format=colorbar_format)

    plt.tight_layout()
    plt.show()

    print("‚úÖ Spectrogram visualizations created!")

## 14. Summary and Next Steps

In [None]:
# Values reused throughout the summary
rule = "=" * 70
n_real = len(df[df['label'] == 'real'])
n_synthetic = len(df[df['label'] == 'synthetic'])
# Classes are treated as balanced when their sizes differ by fewer than 5 files
balance_verdict = 'balanced' if abs(n_real - n_synthetic) < 5 else 'imbalanced'

print(rule)
print("üìã DATA EXPLORATION SUMMARY")
print(rule)
print(f"\n‚úÖ Total Audio Files: {len(df)}")
print(f"   - Real Music: {n_real}")
print(f"   - Synthetic Music: {n_synthetic}")
print(f"\nüìä Average Duration: {df['duration'].mean():.2f} seconds")
print(f"üìä Sample Rate: {df['sample_rate'].mode()[0]} Hz")
print(f"\n‚úÖ Dataset is {balance_verdict}")

print("\n" + rule)
print("üìù NEXT STEPS")
print(rule)

# Each follow-up notebook with the tasks it covers
next_steps = (
    ("\n1Ô∏è‚É£  Feature Extraction (Notebook 02):", (
        "Extract mel-spectrograms",
        "Compute MFCCs",
        "Extract chroma features",
        "Save preprocessed features",
    )),
    ("\n2Ô∏è‚É£  Model Training (Notebook 03):", (
        "Build Hybrid Transformer-Autoencoder model",
        "Train on preprocessed features",
        "Monitor training metrics",
    )),
    ("\n3Ô∏è‚É£  Model Evaluation (Notebook 04):", (
        "Evaluate on test set",
        "Generate confusion matrix",
        "Analyze model performance",
    )),
)
for step_heading, step_tasks in next_steps:
    print(step_heading)
    for task in step_tasks:
        print(f"   - {task}")

print("\n" + rule)
print("\nüéâ Data Exploration Complete! Ready for feature extraction.")

---

## 📌 Key Takeaways

1. **Dataset Structure**: We have successfully set up the data directory structure with separate folders for real and synthetic music.

2. **Audio Characteristics**: We analyzed the duration, sample rate, and frequency content of our audio samples.

3. **Visualization**: We created waveforms and spectrograms to understand the differences between real and synthetic music.

4. **Next Steps**: We're ready to move to feature extraction in the next notebook.

---

### 💡 Tips for Real Data

- **Real Music Sources**: Use royalty-free music from sources like Free Music Archive, YouTube Audio Library, or your own music collection
- **Synthetic Music Sources**: Generate AI music using tools like MusicGen, Jukebox, MuseNet, or Stable Audio
- **Data Diversity**: Include various genres, instruments, and styles for better model generalization
- **Data Quality**: Ensure consistent audio quality and format across all samples

---