## Transcribing with Whisper Locally

In [None]:
import os
import csv
from pathlib import Path
import torch
from stable_whisper import load_model

# === SETTINGS ===
# Root folder holding the audio files to transcribe.
root_dir = r"C:\Pasha-PoC\Audio-Data"
# Destination CSV for the collected transcriptions.
output_csv = r"C:\Pasha-PoC\transcriptions_pasha.csv"

# Prefer the GPU whenever PyTorch reports a usable CUDA device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Using device: {device}")

# === LOAD MODEL ===
# Load the "large-v3" model onto the selected device.
model = load_model("large-v3", device=device)

# === HELPER FUNCTION ===
def transcribe_audio_files(directory, extensions=(".wav", ".mp3", ".m4a"), language="az"):
    """Transcribe every matching audio file found under *directory*.

    The directory tree is walked recursively in sorted order, so repeated
    runs produce rows in the same order regardless of how the filesystem
    lists its entries. A file that fails to transcribe is reported and
    skipped; a file whose transcription is empty after stripping whitespace
    is left out of the result.

    Relies on the module-level ``model`` object for the actual transcription.

    Args:
        directory: Root folder to search.
        extensions: File suffix (or iterable of suffixes), compared
            case-insensitively, that marks a file as audio to transcribe.
        language: Language code passed to ``model.transcribe``.

    Returns:
        A list of ``[text, label]`` rows, where ``label`` is the file name
        including its extension.
    """
    if isinstance(extensions, str):
        # A lone string would otherwise be iterated character by character.
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)
    results = []

    for root, dirs, files in os.walk(directory):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith(suffixes):
                continue

            filepath = os.path.join(root, filename)
            print(f"🎧 Transcribing: {filepath}")

            try:
                result = model.transcribe(filepath, language=language)
                text = result.text.strip()
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"⚠️ Failed to transcribe {filename}: {e}")
                continue

            if text:
                # Use os.path.splitext(filename)[0] to label without the extension.
                results.append([text, filename])
    return results

# === PROCESS ===
transcriptions = transcribe_audio_files(root_dir)

# === SAVE TO CSV ===
if transcriptions:
    # Path.parent is "." for a bare file name, so this also works when
    # output_csv has no directory part (os.makedirs("") would raise).
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Transcription", "Label"])  # Header
        writer.writerows(transcriptions)

    print(f"✅ Transcriptions saved to {output_csv}")
else:
    print("❌ No transcriptions were generated.")

In [None]:
import pandas as pd

In [None]:
# Read back the CSV from the same absolute location it is saved to, so the
# load does not depend on the notebook's current working directory.
df = pd.read_csv(r"C:\Pasha-PoC\transcriptions_pasha.csv")
display(df)