## üìã Section 1: Environment Setup

In [1]:
import os
import sys
from pathlib import Path

# Get project root directory
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir
print(f"üìÅ Project Root: {project_root}")

# Add to Python path
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Create directory structure
dirs = ['data/raw/real', 'data/raw/synthetic', 'data/processed', 'outputs/results']
for dir_path in dirs:
    (project_root / dir_path).mkdir(parents=True, exist_ok=True)

print("‚úÖ Project structure ready!")

üìÅ Project Root: d:\Projects\AI-Music-DeepFake-Detector
‚úÖ Project structure ready!


In [2]:
# Import all required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import warnings
from tqdm.auto import tqdm
import requests
import zipfile
import shutil
from datetime import datetime

# Audio processing
import librosa
import librosa.display
import soundfile as sf

# Deep Learning
import torch
import torchaudio

# Visualization
from IPython.display import Audio, display
import plotly.graph_objects as go
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')
np.random.seed(42)
torch.manual_seed(42)

plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

print("‚úÖ All libraries imported!")

‚úÖ All libraries imported!


In [3]:
# Check GPU availability
print(f"üî• PyTorch: {torch.__version__}")
print(f"üéµ TorchAudio: {torchaudio.__version__}")
print(f"üéº Librosa: {librosa.__version__}")

print(f"\n{'='*60}")
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"‚úÖ CUDA Available!")
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    device = torch.device('cpu')
    print("‚ö†Ô∏è  CUDA not available - using CPU")

print(f"üñ•Ô∏è  Device: {device}")
print(f"{'='*60}")

üî• PyTorch: 2.7.1+cu118
üéµ TorchAudio: 2.7.1+cpu
üéº Librosa: 0.11.0

‚úÖ CUDA Available!
   GPU: NVIDIA GeForce MX450
   Memory: 2.15 GB
üñ•Ô∏è  Device: cuda


In [4]:
# Load configuration
config_path = project_root / 'config.yaml'
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Set data paths
data_dir = project_root / 'data'
raw_real_dir = project_root / 'data' / 'raw' / 'real'
raw_synthetic_dir = project_root / 'data' / 'raw' / 'synthetic'

print("‚úÖ Configuration loaded!")
print(f"   Sample Rate: {config['audio']['sample_rate']} Hz")
print(f"   Duration: {config['audio']['duration']} seconds")
print(f"   Mel Bins: {config['audio']['n_mels']}")

‚úÖ Configuration loaded!
   Sample Rate: 22050 Hz
   Duration: 10 seconds
   Mel Bins: 128


---
## üì• Section 2: Download Real Music Dataset

### Option 1: GTZAN Genre Dataset (Recommended)
**100 tracks, 10 genres, ~1.2GB**

### Option 2: FMA (Free Music Archive)
**Small version: 8,000 tracks, ~7GB**

### Option 3: Use Your Own Music
Place your music files in `data/raw/real/`

In [9]:
# Check if real music files already exist
existing_real_files = list(raw_real_dir.glob('**/*.wav')) + list(raw_real_dir.glob('**/*.mp3'))

if len(existing_real_files) >= 50:
    print(f"‚úÖ Found {len(existing_real_files)} real music files already!")
    print(f"üìÅ Location: {raw_real_dir}")
    print("\n‚ú® Skipping download - using existing files")
    real_count = len(existing_real_files)
else:
    print("üì• Downloading GTZAN Dataset from Kaggle...")
    print("‚è≥ This may take a few minutes...\n")
    
    try:
        import subprocess
        
        # Download using Kaggle API
        dataset_name = "andradaolteanu/gtzan-dataset-music-genre-classification"
        download_dir = project_root / 'data' / 'raw'
        
        print(f"üîΩ Downloading {dataset_name}...")
        result = subprocess.run(
            ['kaggle', 'datasets', 'download', '-d', dataset_name, '-p', str(download_dir), '--unzip'],
            capture_output=True,
            text=True
        )
        
        if result.returncode == 0:
            print("‚úÖ Download completed!")
            
            # Find the extracted folder with audio files
            print("\nüìÇ Organizing files...")
            
            # GTZAN dataset structure: genres/genre_name/*.wav
            genres_dir = download_dir / 'Data' / 'genres_original'
            
            if not genres_dir.exists():
                # Try alternative structure
                genres_dir = download_dir / 'genres'
            
            if genres_dir.exists():
                # Copy all wav files to raw/real directory
                wav_files = list(genres_dir.glob('**/*.wav'))
                print(f"üìÅ Found {len(wav_files)} .wav files")
                
                for wav_file in tqdm(wav_files, desc="Copying files"):
                    dest_file = raw_real_dir / wav_file.name
                    shutil.copy2(wav_file, dest_file)
                
                # Clean up extracted folders
                if (download_dir / 'Data').exists():
                    shutil.rmtree(download_dir / 'Data')
                
                real_count = len(list(raw_real_dir.glob('*.wav')))
                print(f"\n‚úÖ {real_count} files copied to {raw_real_dir}")
            else:
                print("‚ö†Ô∏è  Unexpected folder structure. Checking for files...")
                # Check if files are directly in download_dir
                all_wav = list(download_dir.glob('**/*.wav'))
                if len(all_wav) > 0:
                    print(f"üìÅ Found {len(all_wav)} .wav files")
                    for wav_file in tqdm(all_wav, desc="Copying files"):
                        if 'real' not in str(wav_file):
                            dest_file = raw_real_dir / wav_file.name
                            shutil.copy2(wav_file, dest_file)
                    real_count = len(list(raw_real_dir.glob('*.wav')))
                    print(f"\n‚úÖ {real_count} files copied to {raw_real_dir}")
                else:
                    print("‚ùå Could not find .wav files in downloaded data")
                    real_count = 0
        else:
            print("‚ùå Download failed!")
            print(f"Error: {result.stderr}")
            print("\nüí° Make sure you have:")
            print("   1. Kaggle API installed: pip install kaggle")
            print("   2. API credentials at: ~/.kaggle/kaggle.json")
            print("   3. Accepted dataset rules on Kaggle website")
            real_count = 0
            
    except FileNotFoundError:
        print("‚ùå Kaggle CLI not found!")
        print("\nüìù Install with: pip install kaggle")
        print("üìù Setup instructions: https://www.kaggle.com/docs/api")
        real_count = 0
    except Exception as e:
        print(f"‚ùå Error: {e}")
        real_count = 0

print(f"\n{'='*70}")
print(f"üìä Total real music files: {real_count}")
print(f"{'='*70}")


üì• Downloading GTZAN Dataset from Kaggle...
‚è≥ This may take a few minutes...

üîΩ Downloading andradaolteanu/gtzan-dataset-music-genre-classification...


Exception in thread Thread-7 (_readerthread):
Traceback (most recent call last):
  File [35m"d:\Apps\Python\Lib\threading.py"[0m, line [35m1043[0m, in [35m_bootstrap_inner[0m
    [31mself.run[0m[1;31m()[0m
    [31m~~~~~~~~[0m[1;31m^^[0m
  File [35m"C:\Users\nasir\AppData\Roaming\Python\Python313\site-packages\ipykernel\ipkernel.py"[0m, line [35m772[0m, in [35mrun_closure[0m
    [31m_threading_Thread_run[0m[1;31m(self)[0m
    [31m~~~~~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^[0m
  File [35m"d:\Apps\Python\Lib\threading.py"[0m, line [35m994[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"d:\Apps\Python\Lib\subprocess.py"[0m, line [35m1615[0m, in [35m_readerthread[0m
    buffer.append([31mfh.read[0m[1;31m()[0m)
                  [31m~~~~~~~[0m[1;31m^^[0m
  File [35m"d:\Apps\Python\Lib\encodings\cp1252.py"[0m, line [35m23[0m, in [35mde

‚úÖ Download completed!

üìÇ Organizing files...
üìÅ Found 1000 .wav files


Copying files:   0%|          | 0/1000 [00:00<?, ?it/s]


‚úÖ 1000 files copied to d:\Projects\AI-Music-DeepFake-Detector\data\raw\real

üìä Total real music files: 1000


In [10]:
# Count real music files
real_files = list(raw_real_dir.glob('**/*.wav')) + list(raw_real_dir.glob('**/*.mp3'))
real_count = len(real_files)

if real_count > 0:
    print(f"‚úÖ {real_count} real music files found!")
    print(f"üìÅ Location: {raw_real_dir}")
    
    # Show file types
    wav_count = len(list(raw_real_dir.glob('**/*.wav')))
    mp3_count = len(list(raw_real_dir.glob('**/*.mp3')))
    print(f"\nüìä Format breakdown:")
    print(f"   WAV files: {wav_count}")
    print(f"   MP3 files: {mp3_count}")
    
    if real_count < 50:
        print(f"\n‚ö†Ô∏è  Only {real_count} files - recommend at least 50-100 for training")
else:
    print("‚ùå No real music files found yet.")
    print("üìù Please follow the manual download instructions above.")

‚úÖ 1000 real music files found!
üìÅ Location: d:\Projects\AI-Music-DeepFake-Detector\data\raw\real

üìä Format breakdown:
   WAV files: 1000
   MP3 files: 0


---
## ü§ñ Section 3: Generate Synthetic Music

Now let's generate 100 synthetic music samples using **MusicGen**.

In [11]:
# Install audiocraft (MusicGen)
print("üì¶ Installing audiocraft (MusicGen)...")
print("‚è≥ This may take a few minutes...\n")
!pip install -U audiocraft -q
print("\n‚úÖ audiocraft installed!")

üì¶ Installing audiocraft (MusicGen)...
‚è≥ This may take a few minutes...


‚úÖ audiocraft installed!


  error: subprocess-exited-with-error
  
  √ó installing build dependencies for spacy did not run successfully.
  ‚îÇ exit code: 1
  ‚ï∞‚îÄ> [100 lines of output]
      Collecting setuptools
        Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.13-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.12-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
      Collecting murmurhash<1.1.0,>=0.28.0
        Using cached murmurhash-1.0.15-cp313-cp313-win_amd64.whl.metadata (2.3 kB)
      Collecting thinc<8.2.0,>=8.1.8
        Downloading thinc-8.1.12.tar.gz (190 kB)
        Installing build dependencies: started
        Installing build dependencies: still running...
        Installing build dependencies: finished with status 'error'
        

In [None]:
# Test import first
try:
    from audiocraft.models import MusicGen
    import scipy.io.wavfile
    print("‚úÖ audiocraft imports successful!")
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print("\nüí° Trying to install audiocraft again...")
    import subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "-U", "audiocraft", "--no-cache-dir"], check=True)
    from audiocraft.models import MusicGen
    import scipy.io.wavfile
    print("‚úÖ audiocraft installed and imported!")

def generate_synthetic_music(num_samples=100, duration=10, use_gpu=True):
    """
    Generate synthetic music using MusicGen
    """
    print("üéµ Initializing MusicGen model...")
    print("‚è≥ First run will download model weights (~1.5GB)...\n")
    
    # Load model
    model = MusicGen.get_pretrained('facebook/musicgen-small')
    model.set_generation_params(duration=duration)
    
    # Use GPU if available
    if use_gpu and torch.cuda.is_available():
        model = model.to('cuda')
        print(f"‚úÖ Using GPU: {torch.cuda.get_device_name(0)}\n")
    else:
        print("‚ÑπÔ∏è  Using CPU (slower but works)\n")
    
    # Diverse prompts matching GTZAN genres
    prompts = [
        # Blues
        "slow blues guitar with soulful bending notes",
        "electric blues with harmonica and drums",
        # Classical
        "classical piano solo, romantic period",
        "string quartet playing baroque music",
        "orchestral symphony with full ensemble",
        # Country
        "country music with acoustic guitar and fiddle",
        "upbeat country song with banjo",
        # Disco
        "disco funk with strong bass line and drums",
        "70s disco with synthesizers and rhythm",
        # Hip-hop
        "hip hop beat with bass and hi-hats",
        "lo-fi hip hop with jazzy samples",
        # Jazz
        "smooth jazz with saxophone and piano",
        "bebop jazz with fast tempo and improvisation",
        # Metal
        "heavy metal guitar riff with drums",
        "thrash metal with aggressive guitar",
        # Pop
        "catchy pop music with melodic hooks",
        "upbeat pop song with synthesizers",
        # Reggae
        "reggae rhythm with offbeat guitar",
        "dub reggae with heavy bass",
        # Rock
        "rock music with electric guitar solo",
        "classic rock with drums and bass"
    ]
    
    print(f"üéº Generating {num_samples} synthetic tracks...")
    print(f"   Duration: {duration}s each")
    print(f"   Est. time: {num_samples * 0.3:.0f}-{num_samples * 1:.0f} minutes\n")
    
    raw_synthetic_dir.mkdir(parents=True, exist_ok=True)
    
    generated = 0
    for i in tqdm(range(num_samples), desc="Generating"):
        try:
            # Select prompt
            prompt = prompts[i % len(prompts)]
            
            # Generate
            with torch.no_grad():
                wav = model.generate([prompt])
            
            # Save
            output_file = raw_synthetic_dir / f"synthetic_{i:04d}.wav"
            audio_np = wav[0].cpu().numpy()
            
            scipy.io.wavfile.write(
                str(output_file),
                rate=model.sample_rate,
                data=audio_np.T
            )
            
            generated += 1
            
            # Clear cache every 10 samples
            if torch.cuda.is_available() and i % 10 == 0:
                torch.cuda.empty_cache()
                
        except Exception as e:
            print(f"\n‚ö†Ô∏è  Error on sample {i}: {e}")
            continue
    
    print(f"\n‚úÖ Generated {generated}/{num_samples} synthetic files!")
    print(f"üìÅ Location: {raw_synthetic_dir}")
    return generated

# Generate synthetic music
synthetic_count = generate_synthetic_music(num_samples=100, duration=10)

‚ùå audiocraft not installed!


---
## üìä Section 4: Dataset Verification

In [None]:
# Verify dataset
real_files = list(raw_real_dir.glob('**/*.wav')) + list(raw_real_dir.glob('**/*.mp3'))
synthetic_files = list(raw_synthetic_dir.glob('**/*.wav')) + list(raw_synthetic_dir.glob('**/*.mp3'))

print("\n" + "="*70)
print("üìä FINAL DATASET STATUS")
print("="*70)
print(f"\n‚úÖ Real Music Files:      {len(real_files):>6}")
print(f"‚úÖ Synthetic Music Files: {len(synthetic_files):>6}")
print(f"{'‚îÄ'*70}")
print(f"üìÅ Total Files:           {len(real_files) + len(synthetic_files):>6}")

if len(real_files) > 0 and len(synthetic_files) > 0:
    balance_ratio = len(real_files) / len(synthetic_files)
    print(f"\n‚öñÔ∏è  Balance Ratio: {balance_ratio:.2f}:1")
    
    if 0.8 <= balance_ratio <= 1.2:
        print("‚úÖ Dataset is well-balanced!")
    
    # Calculate size
    total_size = sum(f.stat().st_size for f in real_files + synthetic_files) / (1024**3)
    print(f"üíæ Total Size: {total_size:.2f} GB")
    print(f"\nüìù Status: Perfect dataset size for development and training!")

print("="*70)

---
## üéß Section 5: Audio Exploration

In [None]:
# Load sample files
if len(real_files) > 0 and len(synthetic_files) > 0:
    # Load one of each
    real_sample = real_files[0]
    synthetic_sample = synthetic_files[0]
    
    real_audio, sr_real = librosa.load(real_sample, sr=config['audio']['sample_rate'])
    synthetic_audio, sr_synth = librosa.load(synthetic_sample, sr=config['audio']['sample_rate'])
    
    print(f"‚úÖ Loaded samples:")
    print(f"   Real: {real_sample.name}")
    print(f"   Synthetic: {synthetic_sample.name}")
    print(f"\nüéß Listen to samples below:")
    
    print("\nüéº Real Music:")
    display(Audio(real_audio, rate=sr_real))
    
    print("\nü§ñ Synthetic Music:")
    display(Audio(synthetic_audio, rate=sr_synth))
else:
    print("‚ö†Ô∏è  No audio files found. Please run the download/generation cells above.")

In [None]:
# Visualize waveforms
if len(real_files) > 0 and len(synthetic_files) > 0:
    fig, axes = plt.subplots(2, 1, figsize=(15, 8))
    
    # Real
    librosa.display.waveshow(real_audio, sr=sr_real, ax=axes[0], color='blue', alpha=0.7)
    axes[0].set_title('üéº Real Music Waveform', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Time (s)')
    axes[0].set_ylabel('Amplitude')
    axes[0].grid(True, alpha=0.3)
    
    # Synthetic
    librosa.display.waveshow(synthetic_audio, sr=sr_synth, ax=axes[1], color='red', alpha=0.7)
    axes[1].set_title('ü§ñ Synthetic Music Waveform', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Time (s)')
    axes[1].set_ylabel('Amplitude')
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

In [None]:
# Visualize spectrograms
if len(real_files) > 0 and len(synthetic_files) > 0:
    # Extract mel-spectrograms
    mel_real = librosa.feature.melspectrogram(
        y=real_audio, sr=sr_real,
        n_fft=config['audio']['n_fft'],
        hop_length=config['audio']['hop_length'],
        n_mels=config['audio']['n_mels']
    )
    mel_real_db = librosa.power_to_db(mel_real, ref=np.max)
    
    mel_synth = librosa.feature.melspectrogram(
        y=synthetic_audio, sr=sr_synth,
        n_fft=config['audio']['n_fft'],
        hop_length=config['audio']['hop_length'],
        n_mels=config['audio']['n_mels']
    )
    mel_synth_db = librosa.power_to_db(mel_synth, ref=np.max)
    
    # Plot
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    
    img1 = librosa.display.specshow(mel_real_db, sr=sr_real,
                                     hop_length=config['audio']['hop_length'],
                                     x_axis='time', y_axis='mel',
                                     ax=axes[0], cmap='viridis')
    axes[0].set_title('üéº Real Music - Mel-Spectrogram', fontsize=14, fontweight='bold')
    fig.colorbar(img1, ax=axes[0], format='%+2.0f dB')
    
    img2 = librosa.display.specshow(mel_synth_db, sr=sr_synth,
                                     hop_length=config['audio']['hop_length'],
                                     x_axis='time', y_axis='mel',
                                     ax=axes[1], cmap='viridis')
    axes[1].set_title('ü§ñ Synthetic Music - Mel-Spectrogram', fontsize=14, fontweight='bold')
    fig.colorbar(img2, ax=axes[1], format='%+2.0f dB')
    
    plt.tight_layout()
    plt.show()

In [None]:
# Compare features
if len(real_files) > 0 and len(synthetic_files) > 0:
    features_real = {
        'Spectral Centroid': librosa.feature.spectral_centroid(y=real_audio, sr=sr_real)[0].mean(),
        'Spectral Rolloff': librosa.feature.spectral_rolloff(y=real_audio, sr=sr_real)[0].mean(),
        'Spectral Bandwidth': librosa.feature.spectral_bandwidth(y=real_audio, sr=sr_real)[0].mean(),
        'Zero Crossing Rate': librosa.feature.zero_crossing_rate(real_audio)[0].mean(),
        'RMS Energy': librosa.feature.rms(y=real_audio)[0].mean()
    }
    
    features_synth = {
        'Spectral Centroid': librosa.feature.spectral_centroid(y=synthetic_audio, sr=sr_synth)[0].mean(),
        'Spectral Rolloff': librosa.feature.spectral_rolloff(y=synthetic_audio, sr=sr_synth)[0].mean(),
        'Spectral Bandwidth': librosa.feature.spectral_bandwidth(y=synthetic_audio, sr=sr_synth)[0].mean(),
        'Zero Crossing Rate': librosa.feature.zero_crossing_rate(synthetic_audio)[0].mean(),
        'RMS Energy': librosa.feature.rms(y=synthetic_audio)[0].mean()
    }
    
    # Create comparison
    comparison_df = pd.DataFrame({
        'Real': features_real,
        'Synthetic': features_synth
    })
    comparison_df['Difference (%)'] = ((comparison_df['Synthetic'] - comparison_df['Real']) / 
                                        comparison_df['Real'] * 100).round(2)
    
    print("\nüìä FEATURE COMPARISON")
    print("="*70)
    print(comparison_df.to_string())
    print("="*70)

---
## ‚úÖ Section 6: Summary & Next Steps

In [None]:
# Save dataset statistics
stats = {
    'timestamp': datetime.now().isoformat(),
    'real_files_count': len(real_files),
    'synthetic_files_count': len(synthetic_files),
    'total_files': len(real_files) + len(synthetic_files),
    'sample_rate': config['audio']['sample_rate'],
    'duration': config['audio']['duration'],
    'dataset_ready': len(real_files) > 0 and len(synthetic_files) > 0
}

results_dir = project_root / 'outputs' / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
stats_file = results_dir / 'dataset_statistics.yaml'

with open(stats_file, 'w') as f:
    yaml.dump(stats, f, default_flow_style=False)

print(f"‚úÖ Statistics saved to: {stats_file}")

In [None]:
print("\n" + "="*70)
print("üéâ NOTEBOOK 01 COMPLETED SUCCESSFULLY!")
print("="*70)
print("\nüìù Summary:")
print(f"   ‚úÖ Environment verified")
print(f"   ‚úÖ Downloaded {len(real_files)} real audio files")
print(f"   ‚úÖ Generated {len(synthetic_files)} synthetic audio files")
print(f"   ‚úÖ Total dataset: {len(real_files) + len(synthetic_files)} samples")
print(f"   ‚úÖ Visualizations created")
print(f"   ‚úÖ Features extracted and analyzed")
print("\nüöÄ Next: Notebook 02 - Audio Preprocessing & Feature Extraction")
print("="*70)