In [None]:
# Sample Creation Pipeline
import os
import random
import uuid
import csv
import subprocess
from pytube import YouTube
from pydub import AudioSegment
import webrtcvad
import whisper

# ---------------------------
# CONFIGURATION
# Module-level constants read by the pipeline functions defined below.
# ---------------------------
CHUNK_DURATION_MS = 7000                   # Length (ms) each saved clip is cut to; shorter padded voiced runs are dropped by vad_collector
SILENCE_THRESHOLD_DB = -40                 # Unused, but reserved for silence logic
VAD_MODE = 2                               # WebRTC VAD aggressiveness (0 = loose, 3 = strict); default vad_mode of vad_collector
TARGET_DBFS = -20.0                        # Target loudness (dBFS); default target of normalize_audio
SAMPLE_RATE = 16000                        # Audio sample rate (Hz) applied in prepare_audio and passed to the VAD
CHANNELS = 1                               # Mono channel; applied in prepare_audio
PADDING_MS = 500                           # Padding before/after detected speech (ms)
MAX_AUDIO_DURATION_MIN = 30                # prepare_audio keeps only the first N minutes of each download
WHISPER_MODEL_SIZE = "base"                # Whisper model size ("base", "small", "medium"); also written to the metadata CSV
OUTPUT_BASE = "./Dataset"                  # Base output directory; one sub-directory per dialect is created under it

# ---------------------------
# LOAD WHISPER MODEL ONCE
# The loaded model is bound to the module-level name `model`, which
# transcribe_and_save() reads as a global when it calls model.transcribe().
# ---------------------------
print(f"🧠 Loading Whisper model: {WHISPER_MODEL_SIZE}...")
model = whisper.load_model(WHISPER_MODEL_SIZE)

# ---------------------------
# Download YouTube Audio
# ---------------------------
def download_youtube_audio(youtube_url, output_dir, filename="full_audio.mp3"):
    """Download the audio track of a YouTube video as MP3 via the ``yt-dlp`` CLI.

    Args:
        youtube_url: URL handed to yt-dlp unchanged.
        output_dir: Directory to write into; created if it does not exist.
        filename: File name used for the yt-dlp ``--output`` path.

    Returns:
        The joined ``output_dir``/``filename`` path that was passed to yt-dlp.

    Raises:
        subprocess.CalledProcessError: if yt-dlp exits with a non-zero status.
    """
    os.makedirs(output_dir, exist_ok=True)
    target = os.path.join(output_dir, filename)
    print(f"🔽 Downloading audio from YouTube: {youtube_url}")

    # -x extracts audio only; --audio-format asks yt-dlp to convert it to mp3.
    subprocess.run(
        ["yt-dlp", "-x", "--audio-format", "mp3", "--output", target, youtube_url],
        check=True,
    )

    print(f"🎧 Audio downloaded and saved to {target}")
    return target

# ---------------------------
# Prepare and Trim Audio
# ---------------------------
def prepare_audio(input_path):
    """Load an audio file and bring it to the pipeline's channel/rate settings.

    The audio is converted to ``CHANNELS`` channel(s) at ``SAMPLE_RATE`` Hz and,
    when longer than ``MAX_AUDIO_DURATION_MIN`` minutes, cut to that length
    (only the beginning is kept).

    Args:
        input_path: Path of the audio file to load with pydub.

    Returns:
        The converted (and possibly trimmed) ``AudioSegment``.
    """
    # Log the values actually in use so the message stays true when the
    # configuration constants are changed.
    print(f"🎚️  Preparing audio ({CHANNELS} channel(s) + {SAMPLE_RATE} Hz)...")
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_channels(CHANNELS).set_frame_rate(SAMPLE_RATE)

    # pydub segments are indexed in milliseconds; int() keeps the slice bound
    # valid even if the configured limit is a fractional number of minutes.
    max_duration_ms = int(MAX_AUDIO_DURATION_MIN * 60 * 1000)
    if len(audio) > max_duration_ms:
        print(f"⏱️ Trimming audio to {MAX_AUDIO_DURATION_MIN} minutes max")
        audio = audio[:max_duration_ms]
    return audio

# ---------------------------
# Normalize audio volume
# ---------------------------
def normalize_audio(audio_segment, target_dBFS=TARGET_DBFS):
    """Apply a constant gain so the segment's loudness matches ``target_dBFS``.

    Args:
        audio_segment: pydub ``AudioSegment`` to adjust.
        target_dBFS: Desired loudness in dBFS.

    Returns:
        A gain-adjusted segment, or the input unchanged when it is pure
        silence.
    """
    current_dBFS = audio_segment.dBFS
    # A completely silent segment reports -inf dBFS; the required gain would
    # then be +inf, which cannot be applied. There is nothing to amplify, so
    # hand the segment back as-is.
    if current_dBFS == float("-inf"):
        return audio_segment
    return audio_segment.apply_gain(target_dBFS - current_dBFS)

# ---------------------------
# Slice audio into raw frames for VAD
# ---------------------------
def make_frames(audio_segment, sample_rate, frame_duration_ms):
    """Cut a segment's raw bytes into fixed-size frames for the VAD.

    The raw data is treated as 2 bytes per sample. A trailing frame that is
    shorter than the full frame size is discarded.

    Args:
        audio_segment: Object exposing the PCM bytes as ``raw_data``.
        sample_rate: Samples per second of the raw data.
        frame_duration_ms: Length of each frame in milliseconds.

    Returns:
        A list of ``(timestamp_ms, frame_bytes)`` tuples in stream order.
    """
    bytes_per_frame = int(sample_rate * frame_duration_ms / 1000.0) * 2
    pcm = audio_segment.raw_data
    total = len(pcm)

    return [
        # Byte offset -> milliseconds (2 bytes per sample).
        (int(offset / (sample_rate * 2) * 1000), pcm[offset:offset + bytes_per_frame])
        for offset in range(0, total, bytes_per_frame)
        if total - offset >= bytes_per_frame
    ]

# ---------------------------
# Detect voiced segments with WebRTC VAD
# ---------------------------
def vad_collector(audio_segment, sample_rate=SAMPLE_RATE, chunk_ms=30, vad_mode=VAD_MODE):
    """Find runs of consecutive speech frames and cut fixed-length clips from them.

    The audio is split into ``chunk_ms`` frames and each frame is classified
    with WebRTC VAD. Every uninterrupted run of speech frames is widened by
    ``PADDING_MS`` on both sides (clamped to the audio bounds); runs whose
    padded length reaches ``CHUNK_DURATION_MS`` yield one clip made of the
    first ``CHUNK_DURATION_MS`` milliseconds.

    Args:
        audio_segment: pydub ``AudioSegment`` to scan.
        sample_rate: Sample rate passed to the VAD and used for framing.
        chunk_ms: Frame length in milliseconds.
        vad_mode: WebRTC VAD aggressiveness.

    Returns:
        A list of ``(clip, start_ms, end_ms)`` tuples, where ``start_ms`` and
        ``end_ms`` delimit the returned clip inside ``audio_segment``.
    """
    print("🗣️  Detecting voiced segments using WebRTC VAD...")
    vad = webrtcvad.Vad(vad_mode)
    frames = make_frames(audio_segment, sample_rate, chunk_ms)
    segments = []

    def _emit(run_start_ms, run_end_ms):
        # Apply padding around detected speech
        chunk_start = max(0, run_start_ms - PADDING_MS)
        chunk_end = min(len(audio_segment), run_end_ms + PADDING_MS)
        chunk = audio_segment[chunk_start:chunk_end]

        if len(chunk) >= CHUNK_DURATION_MS:
            # The clip is truncated, so report the end of what is actually
            # kept rather than the end of the whole padded run.
            clip_end = chunk_start + CHUNK_DURATION_MS
            segments.append((chunk[:CHUNK_DURATION_MS], chunk_start, clip_end))
            print(f"🎙️  Segment extracted: {chunk_start}ms → {clip_end}ms")

    run_start_ms = None  # timestamp of the first frame of the current speech run
    for timestamp, frame in frames:
        if vad.is_speech(frame, sample_rate):
            if run_start_ms is None:
                run_start_ms = timestamp
        elif run_start_ms is not None:
            _emit(run_start_ms, timestamp)
            run_start_ms = None

    # A run that is still open when the audio ends has no trailing non-speech
    # frame to close it; flush it here so it is not silently lost.
    if run_start_ms is not None:
        _emit(run_start_ms, frames[-1][0] + chunk_ms)

    print(f"✅ VAD found {len(segments)} voiced segments")
    return segments

# ---------------------------
# Transcribe segments and write to metadata
# ---------------------------
def _mean_segment_logprob(result, default=-10.0):
    """Return the mean per-segment ``avg_logprob`` of a Whisper transcription result."""
    scores = [
        segment["avg_logprob"]
        for segment in (result.get("segments") or [])
        if "avg_logprob" in segment
    ]
    if scores:
        return sum(scores) / len(scores)
    # No scored segments: fall back to a top-level value if one exists,
    # otherwise to a score low enough to fail the quality filter.
    return result.get("avg_logprob", default)


def transcribe_and_save(segments, output_dir, dialect, source_url, quota=100):
    """Transcribe candidate clips with Whisper and keep those that pass the filter.

    Clips are visited in random order. Each one is band-pass filtered
    (300-3400 Hz), loudness-normalized, transcribed with the module-level
    Whisper ``model`` and kept only when the transcript is non-empty, the
    language is ``"ar"`` and the mean segment log-probability is above -1.0.
    Kept clips are written as ``<dialect>_chunk_<id>.wav`` and described by one
    row in ``<dialect>_metadata.csv`` (the file is overwritten on every call).

    Args:
        segments: Iterable of ``(clip, start_ms, end_ms)`` tuples. It is not
            modified; a shuffled copy is used.
        output_dir: Directory for the clips and the metadata CSV; created if
            missing.
        dialect: Label used in file names and the ``dialect`` column.
        source_url: Value written to the ``source_url`` column.
        quota: Maximum number of clips to save.
    """
    print(f"📝 Transcribing and saving segments for dialect: {dialect}")
    os.makedirs(output_dir, exist_ok=True)
    metadata_path = os.path.join(output_dir, f"{dialect}_metadata.csv")
    # Defined up front so the cleanup below is valid even when no clip is
    # processed (empty input or quota of zero).
    temp_path = os.path.join(output_dir, "temp.wav")

    # Shuffle a copy so the caller's list keeps its order.
    candidates = list(segments)
    random.shuffle(candidates)

    count = 0
    try:
        with open(metadata_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                "sample_id", "filename", "dialect", "duration", "source_url",
                "start_time_ms", "end_time_ms", "language", "avg_logprob",
                "transcription", "whisper_model"
            ])

            for seg, start, end in candidates:
                if count >= quota:
                    break

                seg = seg.low_pass_filter(3400).high_pass_filter(300)  # Band-pass filtering
                seg = normalize_audio(seg)

                # Whisper is given a file path, so stage the clip on disk first.
                seg.export(temp_path, format="wav")

                result = model.transcribe(temp_path, language="ar", fp16=False)
                transcript = result["text"].strip()
                language = result.get("language", "")
                # Whisper reports avg_logprob per segment, not at the top level
                # of the result, so aggregate the per-segment values.
                avg_logprob = _mean_segment_logprob(result)

                if transcript and language == "ar" and avg_logprob > -1.0:
                    filename = f"{dialect}_chunk_{uuid.uuid4().hex[:8]}.wav"
                    final_path = os.path.join(output_dir, filename)
                    seg.export(final_path, format="wav")

                    sample_id = uuid.uuid4().hex[:12]
                    writer.writerow([
                        sample_id, filename, dialect, round(seg.duration_seconds, 2),
                        source_url, start, end, language, round(avg_logprob, 3),
                        transcript, WHISPER_MODEL_SIZE
                    ])
                    print(f"✅ [{count+1}] Saved: {filename} | 🗣️ {transcript}")
                    count += 1
                else:
                    print("❌ Skipped low-quality or non-Arabic sample")
    finally:
        # Remove the staging file even if transcription raised.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    print(f"📁 Metadata written to: {metadata_path}")
    print(f"🎉 Total usable segments saved: {count}")

# ---------------------------
# Main pipeline: Multiple videos per dialect
# ---------------------------
def process_multiple_youtube_links(dialect, links, quota=10):
    """Build the dataset for one dialect from several YouTube videos.

    Each link is downloaded, prepared and scanned for voiced clips; clips from
    all links are pooled, the first ``quota * 2`` of them are passed to
    ``transcribe_and_save`` and at most ``quota`` are kept. A link that fails
    at any stage is reported and skipped so the remaining links still run.

    Args:
        dialect: Dialect label; also the sub-directory name under
            ``OUTPUT_BASE``.
        links: Iterable of YouTube URLs for this dialect.
        quota: Maximum number of clips to save for the dialect.

    Returns:
        None. Clips and the metadata CSV are written to disk.
    """
    links = list(links)  # allow any iterable while still reporting progress as i/N

    print(f"\n🌍 Starting dataset build for: {dialect}")
    dialect_dir = os.path.join(OUTPUT_BASE, dialect)
    os.makedirs(dialect_dir, exist_ok=True)

    all_segments = []

    for i, url in enumerate(links):
        print(f"\n🔗 Processing video {i+1}/{len(links)}: {url}")
        try:
            audio_path = download_youtube_audio(url, dialect_dir, f"audio_{i+1}.mp3")
            audio = prepare_audio(audio_path)
            segments = vad_collector(audio)
            for seg in segments:
                all_segments.append((*seg, url))  # save URL too
        except Exception as e:
            # Best effort: one bad link must not abort the whole dialect.
            print(f"⚠️ Error with link {url} — {e}")
            continue

    if not all_segments:
        print(f"❌ No valid segments found for {dialect}")
        return

    print(f"🧮 Total segments collected: {len(all_segments)}")
    # Over-sample: hand twice the quota to the transcription filter, which
    # rejects low-quality clips.
    selected_segments = all_segments[:quota * 2]

    # Drop URL (we already passed "multiple" as source_url for now)
    final_segments = [(seg, start, end) for seg, start, end, _ in selected_segments]

    transcribe_and_save(final_segments, dialect_dir, dialect, source_url="multiple", quota=quota)
    print(f"🏁 Finished dialect: {dialect}")


# Usage (after running one of the dialect cells, which set `dialect` and `links`):
#   process_multiple_youtube_links(dialect, links, quota=10)





# Links



This notebook contains YouTube audio collection and cleaning blocks for Arabic dialects:
- Lebanese
- Egyptian
- Syrian
- Palestinian
- Jordanian
- Iraqi
- Saudi
- Emirati

Run the cell for the dialect you want to process (it sets `dialect` and `links`), then run the main pipeline cell above.

### Lebanese

In [None]:

# Dialect label and source videos for the Lebanese run; running this cell
# sets the `dialect` and `links` globals.
dialect = "Lebanese"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=QALZXfprao4",
    "https://www.youtube.com/watch?v=ni0_JIhc1h4"
]

### Jordanian

In [None]:

# Dialect label and source videos for the Jordanian run.
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Jordanian sources before running the pipeline.
dialect = "Jordanian"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]


### Palestinian

In [None]:

# Dialect label and source videos for the Palestinian run.
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Palestinian sources before running the pipeline.
dialect = "Palestinian"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]


### Syrian

In [None]:
# Dialect label and source videos for the Syrian run.
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Syrian sources before running the pipeline.
dialect = "Syrian"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]

### Saudi

In [None]:

# Dialect label and source videos for the Saudi run.
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Saudi sources before running the pipeline.
dialect = "Saudi"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]



### Egyptian

In [None]:

# Dialect label and source videos for the Egyptian run.
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Egyptian sources before running the pipeline.
dialect = "Egyptian"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]


### Emirati


In [None]:

# Dialect label and source videos for the Emirati run. The label becomes the
# output directory name and the `dialect` column value, so it is spelled the
# same way as in the notebook's dialect list ("Emirati").
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Emirati sources before running the pipeline.
dialect = "Emirati"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]


### Iraqi

In [None]:

# Dialect label and source videos for the Iraqi run.
# NOTE(review): the first URL is the same video id used in the Lebanese cell,
# and the "example2"/"example3" entries look like placeholders — replace them
# with real Iraqi sources before running the pipeline.
dialect = "Iraqi"
links = [
    "https://www.youtube.com/watch?v=wLZ5TkkJyzI",
    "https://www.youtube.com/watch?v=example2",
    "https://www.youtube.com/watch?v=example3"
]

# Dataset Splitter


##### 🔀 Dataset Splitting: Train / Validation / Test

After collecting and cleaning audio samples for each dialect, we split the data into:
- **Training set** (`train/`): used to train the model
- **Validation set** (`val/`): used during training to check performance
- **Test set** (`test/`): used after training to evaluate final accuracy

Each split gets its own folder containing:
- A `wav/` subfolder with the clean `.wav` audio files
- A matching `mel/` subfolder for the Mel spectrograms

This process also updates the metadata file to include a new column: `"split"`.

In [None]:
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

def split_dataset(dialect_dir, dialect, test_size=0.15, val_size=0.15):
    """Assign every clip of a dialect to train/val/test and move it accordingly.

    Reads ``<dialect>_metadata.csv`` from ``dialect_dir``, splits its rows with
    a fixed random seed, creates ``<split>/wav`` and ``<split>/mel`` folders,
    moves each clip into ``<split>/wav`` and rewrites the metadata with a
    ``split`` column.

    The function can be re-run on an already split dataset: clips that are no
    longer in ``dialect_dir`` but already sit in their target folder are left
    where they are.

    Args:
        dialect_dir: Directory holding the clips and the metadata CSV.
        dialect: Dialect label used in the metadata file name.
        test_size: Fraction of all rows assigned to the test set.
        val_size: Fraction of all rows assigned to the validation set.
    """
    print(f"📂 Splitting dataset for: {dialect}")

    metadata_path = os.path.join(dialect_dir, f"{dialect}_metadata.csv")
    df = pd.read_csv(metadata_path)

    train_val, test = train_test_split(df, test_size=test_size, random_state=42)
    # val_size is a fraction of the whole set, so rescale it to the part that
    # is left after the test rows were taken out.
    train, val = train_test_split(train_val, test_size=val_size/(1-test_size), random_state=42)

    df['split'] = 'train'
    df.loc[val.index, 'split'] = 'val'
    df.loc[test.index, 'split'] = 'test'

    # Create folders
    for split in ['train', 'val', 'test']:
        for sub in ['wav', 'mel']:
            os.makedirs(os.path.join(dialect_dir, split, sub), exist_ok=True)

    # Move files to new folders
    for _, row in df.iterrows():
        src = os.path.join(dialect_dir, row['filename'])
        dst = os.path.join(dialect_dir, row['split'], 'wav', row['filename'])
        if os.path.exists(src):
            shutil.move(src, dst)
        elif not os.path.exists(dst):
            # Neither at the source nor at the destination: report it instead
            # of aborting the whole split half-way through.
            print(f"⚠️ Missing audio file, not moved: {src}")

    df.to_csv(metadata_path, index=False)
    print(f"✅ Done splitting {len(df)} files into train/val/test.")


# 🎼 Generate Mel Spectrograms



For each audio file, we generate a Mel spectrogram and save it as a `.png` image.

This helps transform raw `.wav` files into a visual format for training CNN models.

After generation:
- Spectrograms are stored in `mel/` folders inside each split (`train`, `val`, `test`)
- Their paths are recorded in the metadata under the `"mel_path"` column


In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt

def generate_mel_spectrograms(dialect_dir, dialect):
    """Render a Mel spectrogram PNG for every clip listed in the metadata.

    For each row of ``<dialect>_metadata.csv`` the clip at
    ``<split>/wav/<filename>`` is loaded at 16 kHz, converted to a 128-band
    Mel spectrogram in dB and saved without axes to ``<split>/mel/<name>.png``.
    The PNG path is stored in a ``mel_path`` column (left empty for clips that
    failed) and the metadata file is rewritten once at the end.

    Args:
        dialect_dir: Directory holding the split folders and the metadata CSV.
        dialect: Dialect label used in the metadata file name.
    """
    # Local import: numpy is needed for `ref=np.max` below and is not imported
    # anywhere else in the notebook.
    import numpy as np

    print(f"🎼 Generating Mel spectrograms for: {dialect}")
    metadata_path = os.path.join(dialect_dir, f"{dialect}_metadata.csv")
    df = pd.read_csv(metadata_path)
    df['mel_path'] = ""

    for i, row in df.iterrows():
        wav_path = os.path.join(dialect_dir, row['split'], 'wav', row['filename'])
        # Swap only the extension; a plain str.replace would also rewrite
        # ".wav" occurring elsewhere in the name.
        png_name = os.path.splitext(row['filename'])[0] + '.png'
        mel_path = os.path.join(dialect_dir, row['split'], 'mel', png_name)

        try:
            os.makedirs(os.path.dirname(mel_path), exist_ok=True)

            y, sr = librosa.load(wav_path, sr=16000)
            # The audio must be passed by keyword; recent librosa releases
            # reject it as a positional argument.
            mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
            mel_db = librosa.power_to_db(mel, ref=np.max)

            fig = plt.figure(figsize=(3, 3))
            try:
                librosa.display.specshow(mel_db, sr=sr, x_axis=None, y_axis=None)
                plt.axis('off')
                plt.tight_layout()
                plt.savefig(mel_path, bbox_inches='tight', pad_inches=0)
            finally:
                # Always release the figure so a failing clip does not leak it.
                plt.close(fig)

            df.at[i, 'mel_path'] = mel_path
        except Exception as e:
            # Best effort: report the clip and continue with the next one.
            print(f"❌ Error on {row['filename']} → {e}")

    # Write the metadata once, after all clips were processed.
    df.to_csv(metadata_path, index=False)
    print(f"✅ Mel spectrograms saved and metadata updated.")