# 📘 Arabic Dialect Dataset: Sample Creation Pipeline


This notebook builds a speech dataset from YouTube videos using the following steps:

YouTube Audio Download
→ Using yt-dlp to extract .mp3 audio from videos.

Audio Preparation
→ Converted to mono, downsampled to 16kHz, trimmed to 30 minutes max.

Voice Detection
→ WebRTC VAD detects 7-second segments with human voice + 500ms padding.

Segment Filtering
→ Segments must be at least 7 seconds long.

Normalization + Bandpass Filtering
→ Audio is normalized to -20 dBFS and filtered between 300–3400 Hz.

Metadata Creation
→ Each .wav sample includes:

sample_id

filename

dialect

duration

start_time_ms & end_time_ms

Multi-link Support
→ Pass 2–3 YouTube links per dialect to get diverse speakers.

In [7]:
# Sample Creation Pipeline (No Whisper)
import os
import random
import uuid
import csv
import subprocess
from pytube import YouTube
from pydub import AudioSegment
import webrtcvad

# ---------------------------
# CONFIGURATION
# ---------------------------
CHUNK_DURATION_MS = 7000                   # Desired chunk length
VAD_MODE = 2                               # WebRTC VAD aggressiveness (0 = loose, 3 = strict)
TARGET_DBFS = -20.0                        # Normalize loudness
SAMPLE_RATE = 16000                        # Audio sample rate (Hz)
CHANNELS = 1                               # Mono channel
PADDING_MS = 500                           # Padding before/after speech (ms)
MAX_AUDIO_DURATION_MIN = 30                # Trim long audio
OUTPUT_BASE = "./Dataset"                  # Base output directory

# ---------------------------
# Download YouTube Audio
# ---------------------------
def download_youtube_audio(youtube_url, dialect_dir, filename="full_audio.mp3"):
    full_audio_dir = os.path.join(dialect_dir, "full_audio")
    os.makedirs(full_audio_dir, exist_ok=True)
    
    output_path = os.path.join(full_audio_dir, filename)

    print(f"🔽 Downloading audio from YouTube: {youtube_url}")
    command = [
        "yt-dlp", "--force-ipv4",
        "--add-header", "User-Agent: Mozilla/5.0",
        "-x", "--audio-format", "mp3",
        "--output", output_path, youtube_url
    ]

    subprocess.run(command, check=True)
    print(f"🎧 Audio downloaded and saved to {output_path}")
    return output_path


# ---------------------------
# Prepare and Trim Audio
# ---------------------------
def prepare_audio(input_path):
    print("🎚️  Preparing audio (mono + 16kHz)...")
    audio = AudioSegment.from_file(input_path)
    audio = audio[60_000:]  # Skip first 60 seconds to avoid intros


    max_duration_ms = MAX_AUDIO_DURATION_MIN * 60 * 1000
    if len(audio) > max_duration_ms:
        print("⏱️ Trimming audio to 30 minutes max")
        return audio[:max_duration_ms]
    return audio

# --------------------------
# Normalize Volume
# ---------------------------
def normalize_audio(audio_segment, target_dBFS=TARGET_DBFS):
    change = target_dBFS - audio_segment.dBFS
    return audio_segment.apply_gain(change)

# ---------------------------
# Frame Slicer for VAD
# ---------------------------
def make_frames(audio_segment, sample_rate, frame_duration_ms):
    frame_len = int(sample_rate * frame_duration_ms / 1000.0) * 2
    audio_bytes = audio_segment.raw_data
    frames = []
    for i in range(0, len(audio_bytes), frame_len):
        frame = audio_bytes[i:i + frame_len]
        if len(frame) == frame_len:
            timestamp = int(i / (sample_rate * 2) * 1000)
            frames.append((timestamp, frame))
    return frames

# ---------------------------
# VAD Collector
# ---------------------------
def vad_collector(audio_segment, sample_rate=SAMPLE_RATE, chunk_ms=30, vad_mode=VAD_MODE):
    print("🗣️  Detecting voiced segments using WebRTC VAD...")
    vad = webrtcvad.Vad(vad_mode)
    frames = make_frames(audio_segment, sample_rate, chunk_ms)
    segments = []
    voiced = []

    for i, (timestamp, frame) in enumerate(frames):
        is_speech = vad.is_speech(frame, sample_rate)
        if is_speech:
            voiced.append((timestamp, frame))
        elif voiced:
            start_ms = frames[i - len(voiced)][0]
            end_ms = timestamp

            chunk_start = max(0, start_ms - PADDING_MS)
            chunk_end = min(len(audio_segment), end_ms + PADDING_MS)
            chunk = audio_segment[chunk_start:chunk_end]

            if len(chunk) >= CHUNK_DURATION_MS:
                segments.append((chunk[:CHUNK_DURATION_MS], chunk_start, chunk_end))
                print(f"🎙️  Segment extracted: {chunk_start}ms → {chunk_end}ms")

            voiced = []

    print(f"✅ VAD found {len(segments)} voiced segments")
    return segments

# ---------------------------
# Save audio + metadata
# ---------------------------
def ms_to_timestamp(ms):
    seconds = int(ms / 1000)
    minutes = seconds // 60
    seconds = seconds % 60
    return f"{minutes:02}:{seconds:02}"

def save_segments_sequential(segments, output_dir, dialect, source_url, quota=100):
    print(f"💾 Saving segments for dialect: {dialect}")
    os.makedirs(output_dir, exist_ok=True)
    metadata_path = os.path.join(output_dir, f"{dialect}_metadata.csv")

    with open(metadata_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            "sample_id", "filename", "dialect", "duration_sec",
            "start_time_ms", "end_time_ms",
            "start_time_str", "end_time_str",
            "source_url"
        ])

        count = 0
        random.shuffle(segments)

        for i, (seg, start, end) in enumerate(segments[:quota]):
            seg = seg.low_pass_filter(3000).high_pass_filter(400)
            seg = normalize_audio(seg)

            index_str = f"{i+1:04}"  # e.g., "0001", "0023"
            sample_id = f"sample_{index_str}"
            filename = f"{dialect}_chunk_{index_str}.wav"
            final_path = os.path.join(output_dir, filename)
            seg.export(final_path, format="wav")

            duration_sec = round(seg.duration_seconds, 2)
            start_str = ms_to_timestamp(start)
            end_str = ms_to_timestamp(end)

            writer.writerow([
                sample_id, filename, dialect, duration_sec,
                start, end, start_str, end_str,
                source_url
            ])
            print(f"✅ [{count+1}] Saved: {filename}")
            count += 1

    print(f"📁 Metadata written to: {metadata_path}")
    print(f"🎉 Total usable segments saved: {count}")




# ---------------------------
# Main Pipeline for a Dialect
# ---------------------------

def process_multiple_youtube_links(dialect, links, quota=100):
    print(f"\n🌍 Starting dataset build for: {dialect}")
    dialect_dir = os.path.join(OUTPUT_BASE, dialect)
    os.makedirs(dialect_dir, exist_ok=True)

    all_segments = []
    quota_per_link = int(quota * 1.5)

    for i, url in enumerate(links):
        print(f"\n🔗 Processing video {i+1}/{len(links)}: {url}")
        try:
            audio_path = download_youtube_audio(url, dialect_dir, f"audio_{i+1}.mp3")
            audio = prepare_audio(audio_path)
            segments = vad_collector(audio)
            for seg in segments:
                all_segments.append((*seg, url))  # Add url to segment tuple
        except Exception as e:
            print(f"⚠️ Error with link {url} — {e}")
            continue

    if not all_segments:
        print(f"❌ No valid segments found for {dialect}")
        return

    print(f"🧮 Total segments collected: {len(all_segments)}")

    # Fair sampling across videos — avoid duplicates from same time ranges
    random.shuffle(all_segments)
    selected_segments = []
    seen_ids = set()
    for seg in all_segments:
        start = seg[1]
        end = seg[2]
        seg_id = f"{start}_{end}"
        if seg_id not in seen_ids:
            selected_segments.append(seg)
            seen_ids.add(seg_id)
        if len(selected_segments) >= quota * 2:
            break

    # Remove source_url from each tuple for saving
    final_segments = [(seg, start, end) for seg, start, end, _ in selected_segments]

    # ✅ Only this function should save/export audio
    save_segments_sequential(final_segments, dialect_dir, dialect, source_url="multiple", quota=quota)

    print(f"🏁 Finished dialect: {dialect}")


# Links



This notebook contains YouTube audio collection and cleaning blocks for Arabic dialects:
- Lebanese
- Egyptian
- Syrian
- Palestinian
- Jordanian
- Iraqi
- Saudi
- Emirati

Select the block for the dialect you want to process and then run the final pipeline cell above.

### Lebanese

In [10]:
dialect = "Lebanese"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=QALZXfprao4",
    "https://www.youtube.com/watch?v=bFagwgZSppc"
]

process_multiple_youtube_links(dialect, links, quota=100)




🌍 Starting dataset build for: Lebanese

🔗 Processing video 1/3: https://www.youtube.com/watch?v=wLZ5TkkJyzI
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=wLZ5TkkJyzI
[youtube] Extracting URL: https://www.youtube.com/watch?v=wLZ5TkkJyzI
[youtube] wLZ5TkkJyzI: Downloading webpage
[youtube] wLZ5TkkJyzI: Downloading tv client config
[youtube] wLZ5TkkJyzI: Downloading tv player API JSON
[youtube] wLZ5TkkJyzI: Downloading ios player API JSON
[youtube] wLZ5TkkJyzI: Downloading m3u8 information
[info] wLZ5TkkJyzI: Downloading 1 format(s): 251
[download] ./Dataset/Lebanese/full_audio/audio_1.mp3 has already been downloaded
[ExtractAudio] Not converting audio ./Dataset/Lebanese/full_audio/audio_1.mp3; file is already in target format mp3
🎧 Audio downloaded and saved to ./Dataset/Lebanese/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 34840ms → 42560ms
🎙️

### Jordanian

In [11]:

dialect = "Jordanian"
links = [
    "https://www.youtube.com/watch?v=MewR-nXgWcw",
    "https://www.youtube.com/watch?v=M0rD780_w-I",
    "https://www.youtube.com/watch?v=pxCXrJxP7RI"
]
process_multiple_youtube_links(dialect, links, quota=100)


🌍 Starting dataset build for: Jordanian

🔗 Processing video 1/3: https://www.youtube.com/watch?v=MewR-nXgWcw
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=MewR-nXgWcw
[youtube] Extracting URL: https://www.youtube.com/watch?v=MewR-nXgWcw
[youtube] MewR-nXgWcw: Downloading webpage
[youtube] MewR-nXgWcw: Downloading tv client config
[youtube] MewR-nXgWcw: Downloading tv player API JSON
[youtube] MewR-nXgWcw: Downloading ios player API JSON
[youtube] MewR-nXgWcw: Downloading m3u8 information
[info] MewR-nXgWcw: Downloading 1 format(s): 251
[download] ./Dataset/Jordanian/full_audio/audio_1.mp3 has already been downloaded
[ExtractAudio] Not converting audio ./Dataset/Jordanian/full_audio/audio_1.mp3; file is already in target format mp3
🎧 Audio downloaded and saved to ./Dataset/Jordanian/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 4750ms → 11780ms

### Palestinian

In [9]:

dialect = "Palestinian"
links = [
    "https://www.youtube.com/watch?v=kdlKIFtCG3M",
    "https://www.youtube.com/watch?v=h65q8keASFI",
    "https://www.youtube.com/watch?v=29a5EkgYJP8"
]
process_multiple_youtube_links(dialect, links, quota=100) 


🌍 Starting dataset build for: Palestinian

🔗 Processing video 1/3: https://www.youtube.com/watch?v=kdlKIFtCG3M
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=kdlKIFtCG3M
[youtube] Extracting URL: https://www.youtube.com/watch?v=kdlKIFtCG3M
[youtube] kdlKIFtCG3M: Downloading webpage
[youtube] kdlKIFtCG3M: Downloading tv client config
[youtube] kdlKIFtCG3M: Downloading tv player API JSON
[youtube] kdlKIFtCG3M: Downloading ios player API JSON
[youtube] kdlKIFtCG3M: Downloading m3u8 information
[info] kdlKIFtCG3M: Downloading 1 format(s): 251
[download] ./Dataset/Palestinian/full_audio/audio_1.mp3 has already been downloaded
[ExtractAudio] Not converting audio ./Dataset/Palestinian/full_audio/audio_1.mp3; file is already in target format mp3
🎧 Audio downloaded and saved to ./Dataset/Palestinian/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 15190ms 

### Syrian

In [16]:
dialect = "Syrian"
links = [
    "https://www.youtube.com/watch?v=5xI_961l_Xw",
    "https://www.youtube.com/watch?v=rBhmjk7ruPk",
    "https://www.youtube.com/watch?v=1LI5RPW_Sto"
]
process_multiple_youtube_links(dialect, links, quota=100) 


🌍 Starting dataset build for: Syrian

🔗 Processing video 1/3: https://www.youtube.com/watch?v=5xI_961l_Xw
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=5xI_961l_Xw
[youtube] Extracting URL: https://www.youtube.com/watch?v=5xI_961l_Xw
[youtube] 5xI_961l_Xw: Downloading webpage
[youtube] 5xI_961l_Xw: Downloading tv client config
[youtube] 5xI_961l_Xw: Downloading tv player API JSON
[youtube] 5xI_961l_Xw: Downloading ios player API JSON
[youtube] 5xI_961l_Xw: Downloading m3u8 information
[info] 5xI_961l_Xw: Downloading 1 format(s): 251
[download] Destination: ./Dataset/Syrian/full_audio/audio_1.webm
[download] 100% of   64.81MiB in 00:02:57 at 374.03KiB/s    
[ExtractAudio] Destination: ./Dataset/Syrian/full_audio/audio_1.mp3
Deleting original file ./Dataset/Syrian/full_audio/audio_1.webm (pass -k to keep)
🎧 Audio downloaded and saved to ./Dataset/Syrian/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voic



[info] rBhmjk7ruPk: Downloading 1 format(s): 251
[download] Destination: ./Dataset/Syrian/full_audio/audio_2.webm
[download] 100% of  124.78MiB in 00:08:01 at 265.63KiB/s    
[ExtractAudio] Destination: ./Dataset/Syrian/full_audio/audio_2.mp3
Deleting original file ./Dataset/Syrian/full_audio/audio_2.webm (pass -k to keep)
🎧 Audio downloaded and saved to ./Dataset/Syrian/full_audio/audio_2.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 63370ms → 70430ms
🎙️  Segment extracted: 69700ms → 78920ms
🎙️  Segment extracted: 86890ms → 95480ms
🎙️  Segment extracted: 118480ms → 126800ms
🎙️  Segment extracted: 126580ms → 134600ms
🎙️  Segment extracted: 135460ms → 144950ms
🎙️  Segment extracted: 145030ms → 152240ms
🎙️  Segment extracted: 234940ms → 245660ms
🎙️  Segment extracted: 249850ms → 261200ms
🎙️  Segment extracted: 276400ms → 288920ms
🎙️  Segment extracted: 300790ms → 308180ms
🎙️  Segment 

### Saudi

In [8]:

dialect = "Saudi"
links = [
    "https://www.youtube.com/watch?v=UjXvq_YyeZ0",
    "https://www.youtube.com/watch?v=Rw_bgXykSbQ",
    "https://www.youtube.com/watch?v=gtUoblGRmmc"
]


process_multiple_youtube_links(dialect, links, quota=100)



🌍 Starting dataset build for: Saudi

🔗 Processing video 1/3: https://www.youtube.com/watch?v=UjXvq_YyeZ0
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=UjXvq_YyeZ0
[youtube] Extracting URL: https://www.youtube.com/watch?v=UjXvq_YyeZ0
[youtube] UjXvq_YyeZ0: Downloading webpage
[youtube] UjXvq_YyeZ0: Downloading tv client config
[youtube] UjXvq_YyeZ0: Downloading tv player API JSON
[youtube] UjXvq_YyeZ0: Downloading ios player API JSON
[youtube] UjXvq_YyeZ0: Downloading m3u8 information
[info] UjXvq_YyeZ0: Downloading 1 format(s): 251
[download] ./Dataset/Saudi/full_audio/audio_1.mp3 has already been downloaded
[ExtractAudio] Not converting audio ./Dataset/Saudi/full_audio/audio_1.mp3; file is already in target format mp3
🎧 Audio downloaded and saved to ./Dataset/Saudi/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 1180ms → 9770ms
🎙️  Segment extr

### Egyptian

In [12]:

dialect = "Egyptian"
links = [
    "https://www.youtube.com/watch?v=znNR10W1CbA",
    "https://www.youtube.com/watch?v=GVdMd9yJq8Q",
    "https://www.youtube.com/watch?v=0UTfIbe8nZg"
]

process_multiple_youtube_links(dialect, links, quota=100) 


🌍 Starting dataset build for: Egyptian

🔗 Processing video 1/3: https://www.youtube.com/watch?v=znNR10W1CbA
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=znNR10W1CbA
[youtube] Extracting URL: https://www.youtube.com/watch?v=znNR10W1CbA
[youtube] znNR10W1CbA: Downloading webpage
[youtube] znNR10W1CbA: Downloading tv client config
[youtube] znNR10W1CbA: Downloading tv player API JSON
[youtube] znNR10W1CbA: Downloading ios player API JSON
[youtube] znNR10W1CbA: Downloading m3u8 information
[info] znNR10W1CbA: Downloading 1 format(s): 251
[download] Destination: ./Dataset/Egyptian/full_audio/audio_1.webm
[download] 100% of   40.86MiB in 00:02:16 at 305.49KiB/s   
[ExtractAudio] Destination: ./Dataset/Egyptian/full_audio/audio_1.mp3
Deleting original file ./Dataset/Egyptian/full_audio/audio_1.webm (pass -k to keep)
🎧 Audio downloaded and saved to ./Dataset/Egyptian/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detec



[youtube] 0UTfIbe8nZg: Downloading m3u8 information
[info] 0UTfIbe8nZg: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 466
[download] Destination: ./Dataset/Egyptian/full_audio/audio_3.mp4
[download] 100% of   37.06MiB in 00:02:43 at 231.44KiB/s                 
[ExtractAudio] Destination: ./Dataset/Egyptian/full_audio/audio_3.mp3
Deleting original file ./Dataset/Egyptian/full_audio/audio_3.mp4 (pass -k to keep)
🎧 Audio downloaded and saved to ./Dataset/Egyptian/full_audio/audio_3.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 29680ms → 38420ms
🎙️  Segment extracted: 81400ms → 97010ms
🎙️  Segment extracted: 96250ms → 104870ms
🎙️  Segment extracted: 115780ms → 124850ms
🎙️  Segment extracted: 149410ms → 157730ms
🎙️  Segment extracted: 158800ms → 169640ms
🎙️  Segment extracted: 204370ms → 213020ms
🎙️  Segment extracted: 245020ms → 253670m

### Emirati


In [13]:

dialect = "Emirati"
links = [
    "https://www.youtube.com/watch?v=paT8c6QXOhA",
    "https://www.youtube.com/watch?v=foiP4rCvkm4",
    "https://www.youtube.com/watch?v=gc6gd8vN9ys"
]

process_multiple_youtube_links(dialect, links, quota=100)


🌍 Starting dataset build for: Emirati

🔗 Processing video 1/3: https://www.youtube.com/watch?v=paT8c6QXOhA
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=paT8c6QXOhA
[youtube] Extracting URL: https://www.youtube.com/watch?v=paT8c6QXOhA
[youtube] paT8c6QXOhA: Downloading webpage
[youtube] paT8c6QXOhA: Downloading tv client config
[youtube] paT8c6QXOhA: Downloading tv player API JSON
[youtube] paT8c6QXOhA: Downloading ios player API JSON
[youtube] paT8c6QXOhA: Downloading m3u8 information
[info] paT8c6QXOhA: Downloading 1 format(s): 251
[download] ./Dataset/Emirati/full_audio/audio_1.mp3 has already been downloaded
[ExtractAudio] Not converting audio ./Dataset/Emirati/full_audio/audio_1.mp3; file is already in target format mp3
🎧 Audio downloaded and saved to ./Dataset/Emirati/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 940ms → 9230ms
🎙️  Segme

### Iraqi

In [15]:

dialect = "Iraqi"
links = [
    "https://www.youtube.com/watch?v=QQWbQDojvqc",
    "https://www.youtube.com/watch?v=Za8S1EpAKtM",
    "https://www.youtube.com/watch?v=8Kb-CUA15No"
]
process_multiple_youtube_links(dialect, links, quota=100) 


🌍 Starting dataset build for: Iraqi

🔗 Processing video 1/3: https://www.youtube.com/watch?v=QQWbQDojvqc
🔽 Downloading audio from YouTube: https://www.youtube.com/watch?v=QQWbQDojvqc
[youtube] Extracting URL: https://www.youtube.com/watch?v=QQWbQDojvqc
[youtube] QQWbQDojvqc: Downloading webpage
[youtube] QQWbQDojvqc: Downloading tv client config
[youtube] QQWbQDojvqc: Downloading tv player API JSON
[youtube] QQWbQDojvqc: Downloading ios player API JSON
[youtube] QQWbQDojvqc: Downloading m3u8 information
[info] QQWbQDojvqc: Downloading 1 format(s): 251
[download] ./Dataset/Iraqi/full_audio/audio_1.mp3 has already been downloaded
[ExtractAudio] Not converting audio ./Dataset/Iraqi/full_audio/audio_1.mp3; file is already in target format mp3
🎧 Audio downloaded and saved to ./Dataset/Iraqi/full_audio/audio_1.mp3
🎚️  Preparing audio (mono + 16kHz)...
⏱️ Trimming audio to 30 minutes max
🗣️  Detecting voiced segments using WebRTC VAD...
🎙️  Segment extracted: 140500ms → 152000ms
🎙️  Segment 

# Dataset Splitter


##### 🔀 Dataset Splitting: Train / Validation / Test

After collecting and cleaning audio samples for each dialect, we split the data into:
- **Training set** (`train/`): used to train the model
- **Validation set** (`val/`): used during training to check performance
- **Test set** (`test/`): used after training to evaluate final accuracy

Each split gets its own folder with:
- Clean `.wav` audio files
- A corresponding `mel/` folder for Mel spectrograms

This process also updates the metadata file to include a new column: `"split"`.

In [17]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

def split_all_dialects(base_dir="./Dataset", dialects=None, test_size=0.15, val_size=0.15):
    if dialects is None:
        dialects = ['Lebanese', 'Egyptian', 'Jordanian', 'Syrian', 'Iraqi', 'Palestinian', 'Saudi', 'Emirati']

    for dialect in dialects:
        dialect_dir = os.path.join(base_dir, dialect)
        metadata_path = os.path.join(dialect_dir, f"{dialect}_metadata.csv")

        if not os.path.exists(metadata_path):
            print(f"⚠️ Skipping {dialect} — metadata not found at {metadata_path}")
            continue

        print(f"\n📂 Splitting dataset for: {dialect}")
        df = pd.read_csv(metadata_path)

        # Split the data
        train_val, test = train_test_split(df, test_size=test_size, random_state=42)
        train, val = train_test_split(train_val, test_size=val_size / (1 - test_size), random_state=42)

        # Assign split labels
        df['split'] = 'train'
        df.loc[val.index, 'split'] = 'val'
        df.loc[test.index, 'split'] = 'test'

        # Create folders
        for split in ['train', 'val', 'test']:
            for sub in ['wav', 'mel']:
                os.makedirs(os.path.join(dialect_dir, split, sub), exist_ok=True)

        # Move files
        moved_count = 0
        for _, row in df.iterrows():
            src = os.path.join(dialect_dir, row['filename'])
            dst = os.path.join(dialect_dir, row['split'], 'wav', row['filename'])
            try:
                shutil.move(src, dst)
                moved_count += 1
            except FileNotFoundError:
                print(f"⚠️ File not found: {src}")

        # Save new metadata
        df.to_csv(metadata_path, index=False)
        print(f"✅ {dialect}: {len(df)} files split → Train: {len(train)}, Val: {len(val)}, Test: {len(test)}, Moved: {moved_count}")

# Run the function
split_all_dialects()


📂 Splitting dataset for: Lebanese
✅ Lebanese: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100

📂 Splitting dataset for: Egyptian
✅ Egyptian: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100

📂 Splitting dataset for: Jordanian
✅ Jordanian: 98 files split → Train: 68, Val: 15, Test: 15, Moved: 98

📂 Splitting dataset for: Syrian
✅ Syrian: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100

📂 Splitting dataset for: Iraqi
✅ Iraqi: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100

📂 Splitting dataset for: Palestinian
✅ Palestinian: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100

📂 Splitting dataset for: Saudi
✅ Saudi: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100

📂 Splitting dataset for: Emirati
✅ Emirati: 100 files split → Train: 69, Val: 16, Test: 15, Moved: 100


# 🎼 Generate Mel Spectrograms



For each audio file, we generate a Mel spectrogram and save it as a `.png` image.

This helps transform raw `.wav` files into a visual format for training CNN models.

After generation:
- Spectrograms are stored in `mel/` folders inside each split (`train`, `val`, `test`)
- Their paths are recorded in the metadata under the `"mel_path"` column


In [8]:
import os
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

def generate_mel_spectrograms(dialect_dir, dialect):
    print(f"\n🎼 Generating Mel spectrograms for: {dialect}")
    
    df_path = os.path.join(dialect_dir, f"{dialect}_metadata.csv")
    if not os.path.exists(df_path):
        print(f"❌ Metadata not found at: {df_path}")
        return

    df = pd.read_csv(df_path)
    print(f"🧾 Loaded {len(df)} rows")

    if 'split' not in df.columns:
        print("❌ Missing 'split' column. Run split_dataset() first.")
        return

    df['mel_path'] = ""

    for i, row in df.iterrows():
        split = row['split']
        filename = row['filename']
        wav_path = os.path.join(dialect_dir, split, 'wav', filename)
        mel_filename = filename.replace('.wav', '.png')
        mel_path = os.path.join(dialect_dir, split, 'mel', mel_filename)

        if not os.path.exists(wav_path):
            print(f"⚠️ Skipping: {wav_path} not found")
            continue

        try:
            y, sr = librosa.load(wav_path, sr=16000)
            mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
            mel_db = librosa.power_to_db(mel, ref=np.max)

            plt.figure(figsize=(3, 3))
            librosa.display.specshow(mel_db, sr=sr, cmap='viridis')
            plt.axis('off')
            plt.tight_layout()
            plt.savefig(mel_path, bbox_inches='tight', pad_inches=0)
            plt.close()

            df.at[i, 'mel_path'] = mel_path
            print(f"✅ Saved: {mel_path}")
        except Exception as e:
            print(f"❌ Error on {filename} → {e}")

    df.to_csv(df_path, index=False)
    print(f"✅ Metadata updated for: {dialect}")

# 🌀 Run this for all dialects
dialects = ['Lebanese', 'Egyptian', 'Jordanian', 'Syrian', 'Iraqi', 'Palestinian', 'Saudi', 'Emirati']
for dialect in dialects:
    generate_mel_spectrograms(os.path.join('./Dataset', dialect), dialect)



🎼 Generating Mel spectrograms for: Lebanese
🧾 Loaded 100 rows
✅ Saved: ./Dataset/Lebanese/test/mel/Lebanese_chunk_0001.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0002.png
✅ Saved: ./Dataset/Lebanese/val/mel/Lebanese_chunk_0003.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0004.png
✅ Saved: ./Dataset/Lebanese/val/mel/Lebanese_chunk_0005.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0006.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0007.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0008.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0009.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0010.png
✅ Saved: ./Dataset/Lebanese/test/mel/Lebanese_chunk_0011.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0012.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0013.png
✅ Saved: ./Dataset/Lebanese/train/mel/Lebanese_chunk_0014.png
✅ Saved: ./Dataset/Lebanese/val/mel/Lebanese_chunk_0015.png
✅ Saved: ./Data