<a href="https://colab.research.google.com/github/Marcin19721205/BasicTrainingPython/blob/main/transkrypcja.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install faster-whisper

Collecting faster-whisper
  Downloading faster_whisper-1.2.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.2.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m25.1 MB/s[0m eta [3

In [2]:
# Mount Google Drive at /content/drive using the google.colab helper.
# NOTE(review): the cells visible below read their input from sample_data/ under
# the working directory, not from the mounted drive — confirm this is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import subprocess
from pathlib import Path

# Sanity check: ask ffprobe for the container duration of the input video and
# show the exit status together with whatever it printed.
mp4 = Path("sample_data/trans_cw3-4.mp4")

ffprobe_cmd = [
    "ffprobe",
    "-v", "error",
    "-show_entries", "format=duration",
    "-of", "default=noprint_wrappers=1:nokey=1",
    str(mp4),
]

# capture_output=True pipes both stdout and stderr, as the explicit PIPE pair does.
p = subprocess.run(ffprobe_cmd, capture_output=True, text=True)

for label, value in (
    ("returncode:", p.returncode),
    ("duration:", p.stdout.strip()),
    ("stderr:", p.stderr.strip()),
):
    print(label, value)


returncode: 0
duration: 16199.235896
stderr: 


In [6]:
import os
from pathlib import Path

# Report the size of the input video in MiB, or say that it is missing.
BASE_DIR = Path.cwd()
MP4_FILE = BASE_DIR.joinpath("sample_data", "trans_cw3-4.mp4")

if not MP4_FILE.exists():
    print(f"Plik {MP4_FILE.name} nie został znaleziony.")
else:
    size_bytes = os.path.getsize(MP4_FILE)
    size_mb = size_bytes / 2**20  # bytes -> MiB
    print(f"Rozmiar pliku {MP4_FILE.name}: {size_mb:.2f} MB")

Rozmiar pliku trans_cw3-4.mp4: 247.19 MB


In [7]:
import shutil
import subprocess
from pathlib import Path
from datetime import timedelta

from faster_whisper import WhisperModel


# ---------- utils ----------
def run(cmd: list[str]) -> subprocess.CompletedProcess:
    """Run an external command and capture its output as text.

    Args:
        cmd: Program and arguments as a list (executed without a shell).

    Returns:
        The CompletedProcess, with ``stdout`` and ``stderr`` decoded to ``str``.
        A non-zero exit status is not raised; callers inspect ``returncode``.
    """
    # errors="replace": tools such as ffmpeg echo file metadata verbatim, and
    # those bytes are not guaranteed to be valid in the locale encoding. Strict
    # decoding would raise UnicodeDecodeError and hide the tool's real result.
    return subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )


def run_or_fail(cmd: list[str]) -> None:
    """Execute ``cmd`` through ``run`` and raise if it exits with a non-zero status.

    Raises:
        RuntimeError: carrying the space-joined command line and the stripped
            stderr of the failed process.
    """
    result = run(cmd)
    if result.returncode == 0:
        return

    command_line = ' '.join(cmd)
    stderr_text = result.stderr.strip()
    raise RuntimeError(f"Command failed:\n{command_line}\n\nSTDERR:\n{stderr_text}")


def which_or_fail(exe: str) -> str:
    """Return the location of ``exe`` as resolved by ``shutil.which``.

    Raises:
        RuntimeError: if ``shutil.which`` finds no such executable on PATH.
    """
    resolved = shutil.which(exe)
    if resolved:
        return resolved
    raise RuntimeError(f"Nie znaleziono '{exe}' w PATH. Zainstaluj ffmpeg i dodaj do PATH.")


def srt_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Negative inputs are clamped to zero. The value is rounded to the nearest
    millisecond *before* it is split into fields, so a fraction that rounds up
    to a whole second carries into the seconds field (1.9996 -> 00:00:02,000)
    instead of producing an invalid four-digit millisecond part.

    Args:
        seconds: Offset from the start of the media, in seconds.

    Returns:
        The timestamp string; hours are zero-padded to at least two digits.
    """
    total_ms = int(round(max(seconds, 0) * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def ffprobe_ok(media_path: Path) -> bool:
    """Tell whether ffprobe can query the container duration of ``media_path``.

    Returns:
        True when the ffprobe invocation exits with status 0, False otherwise.
        The reported duration itself is not inspected.
    """
    probe_cmd = ["ffprobe", "-v", "error"]
    probe_cmd += ["-show_entries", "format=duration"]
    probe_cmd += ["-of", "default=noprint_wrappers=1:nokey=1"]
    probe_cmd.append(str(media_path))

    exit_status = run(probe_cmd).returncode
    return exit_status == 0


def try_fix_mp4(input_mp4: Path, fixed_mp4: Path) -> bool:
    """Attempt to repair a container by stream-copying it into ``fixed_mp4``.

    ffmpeg is invoked with ``-c copy -movflags +faststart`` (no re-encoding)
    and ``-y``, so an existing ``fixed_mp4`` is overwritten.

    Returns:
        False if ffmpeg exits with a non-zero status (its stderr is printed);
        otherwise whatever ``ffprobe_ok`` reports for the newly written file.
    """
    remux_cmd = [
        "ffmpeg", "-y",
        "-i", str(input_mp4),
        "-c", "copy",
        "-movflags", "+faststart",
        str(fixed_mp4),
    ]
    remux = run(remux_cmd)

    if remux.returncode == 0:
        return ffprobe_ok(fixed_mp4)

    print("Naprawa MP4 nie wyszła. ffmpeg mówi:\n", remux.stderr.strip())
    return False


# ---------- pipeline ----------
def extract_audio(mp4_path: Path, wav_path: Path, sr: int = 16000) -> None:
    """Write the audio of ``mp4_path`` to ``wav_path`` as mono 16-bit PCM.

    Args:
        mp4_path: Input media file.
        wav_path: Destination WAV file (overwritten, since ffmpeg gets ``-y``).
        sr: Output sample rate in Hz.

    Raises:
        RuntimeError: propagated from ``run_or_fail`` when ffmpeg fails.
    """
    ffmpeg_args = [
        "-y",
        "-i", str(mp4_path),
        "-vn",                # no video stream in the output
        "-ac", "1",           # single channel
        "-ar", str(sr),       # resample to the requested rate
        "-c:a", "pcm_s16le",  # 16-bit little-endian PCM
        str(wav_path),
    ]
    run_or_fail(["ffmpeg", *ffmpeg_args])


def split_audio(wav_path: Path, out_dir: Path, segment_seconds: int = 600) -> list[Path]:
    """Split ``wav_path`` into consecutive WAV chunks inside ``out_dir``.

    Chunks are written by ffmpeg's segment muxer (stream copy, timestamps reset
    per chunk) as ``segment_00000.wav``, ``segment_00001.wav``, ...

    Args:
        wav_path: Source WAV file.
        out_dir: Directory for the chunks; created if missing. Any existing
            ``segment_*.wav`` files in it are deleted first.
        segment_seconds: Target chunk length in seconds; must be positive.

    Returns:
        The chunk paths sorted by name (i.e. in playback order).

    Raises:
        ValueError: if ``segment_seconds`` is not positive.
        RuntimeError: if ffmpeg fails or produces no chunks.
    """
    if segment_seconds <= 0:
        raise ValueError(f"segment_seconds must be positive, got {segment_seconds}")

    out_dir.mkdir(parents=True, exist_ok=True)

    # ffmpeg only overwrites the chunk names it produces this time. Chunks left
    # by an earlier run (longer input or shorter segment length) would otherwise
    # match the glob below and be returned as part of this input.
    for stale in list(out_dir.glob("segment_*.wav")):
        if stale.is_file():
            stale.unlink()

    pattern = str(out_dir / "segment_%05d.wav")

    run_or_fail([
        "ffmpeg", "-y",
        "-i", str(wav_path),
        "-f", "segment",
        "-segment_time", str(segment_seconds),
        "-reset_timestamps", "1",
        "-c", "copy",
        pattern
    ])

    segments = sorted(out_dir.glob("segment_*.wav"))
    if not segments:
        raise RuntimeError("Nie utworzono segmentów audio.")
    return segments


def transcribe_media(
    media_path: Path,
    out_dir: Path,
    language: str = "pl",
    model_size: str = "medium",
    segment_seconds: int = 600,
    device: str = "cpu",
    compute_type: str = "int8"
) -> None:
    """Transcribe a media file to plain text and SRT subtitles.

    Steps: verify ffmpeg/ffprobe are on PATH, optionally remux an unreadable
    container, extract the audio to WAV, cut it into chunks, load a
    ``WhisperModel`` and transcribe chunk by chunk. Results are written to
    ``transkrypcja.txt`` (one line per chunk) and ``transkrypcja.srt`` inside
    ``out_dir``; the intermediate WAV and the ``segments`` directory are kept
    there as well.

    Args:
        media_path: Input media file.
        out_dir: Output directory; created if missing.
        language: Language code passed to ``model.transcribe``.
        model_size: Model identifier passed to ``WhisperModel``.
        segment_seconds: Chunk length in seconds; also used to shift each
            chunk's timestamps onto the timeline of the full recording.
        device: Device passed to ``WhisperModel``.
        compute_type: Compute type passed to ``WhisperModel``.

    Raises:
        RuntimeError: if a required tool is missing, the container cannot be
            read even after the remux, or an ffmpeg step fails.
        FileNotFoundError: if ``media_path`` does not exist.
    """
    for tool in ("ffmpeg", "ffprobe"):
        which_or_fail(tool)

    if not media_path.exists():
        raise FileNotFoundError(f"Nie ma pliku: {media_path}")

    out_dir.mkdir(parents=True, exist_ok=True)

    # Report the input up front.
    megabytes = media_path.stat().st_size / (1024 * 1024)
    print(f"Wejście: {media_path}  |  rozmiar: {megabytes:.2f} MB")

    # Step 0: if ffprobe cannot read the container, try a faststart remux.
    if not ffprobe_ok(media_path):
        print("Kontener wygląda na problematyczny. Próbuję remux (-movflags +faststart)...")
        remuxed = out_dir / "fixed_faststart.mp4"
        if not try_fix_mp4(media_path, remuxed):
            raise RuntimeError("ffprobe nadal nie czyta pliku po remux. Ten plik jest uszkodzony.")
        media_path = remuxed
        print(f"Używam naprawionego: {media_path}")

    # All artefacts live under out_dir.
    wav_path = out_dir / "audio_16k_mono.wav"
    segments_dir = out_dir / "segments"
    txt_path = out_dir / "transkrypcja.txt"
    srt_path = out_dir / "transkrypcja.srt"

    print("1) Ekstrakcja audio")
    extract_audio(media_path, wav_path)

    print("2) Dzielenie na segmenty")
    segments = split_audio(wav_path, segments_dir, segment_seconds)
    total = len(segments)

    print("3) Ładowanie modelu")
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    print("4) Transkrypcja (zapis TXT + SRT)")
    cue_no = 1  # SRT cue numbers run continuously across all chunks

    with open(txt_path, "w", encoding="utf-8") as f_txt:
        with open(srt_path, "w", encoding="utf-8") as f_srt:
            for number, chunk in enumerate(segments, start=1):
                # Chunk timestamps restart at zero; shift them by the chunk's
                # nominal position in the full recording.
                offset = (number - 1) * segment_seconds
                print(f"   Segment {number}/{total}: {chunk.name}")

                pieces, _ = model.transcribe(
                    str(chunk),
                    language=language,
                    vad_filter=True,
                    beam_size=5,
                )

                spoken = []
                for piece in pieces:
                    text = piece.text.strip()
                    if text:
                        spoken.append(text)

                        begin = offset + float(piece.start)
                        finish = offset + float(piece.end)

                        f_srt.write(f"{cue_no}\n")
                        f_srt.write(f"{srt_timestamp(begin)} --> {srt_timestamp(finish)}\n")
                        f_srt.write(f"{text}\n\n")
                        cue_no += 1

                # One line of plain text per chunk; silent chunks are skipped.
                if spoken:
                    f_txt.write(" ".join(spoken) + "\n")

    print("\nGOTOWE")
    print(f"TXT: {txt_path}")
    print(f"SRT: {srt_path}")


# ---------- ENTRY POINT (COLAB) ----------
if __name__ == "__main__":
    BASE_DIR = Path.cwd()

    # Input video: the file already present in the Colab session (per the screenshot).
    MEDIA_FILE = BASE_DIR.joinpath("sample_data", "trans_cw3-4.mp4")

    # Transcripts and intermediate audio are written here.
    OUTPUT_DIR = BASE_DIR.joinpath("asr_out")

    transcribe_media(
        MEDIA_FILE,
        OUTPUT_DIR,
        language="pl",
        model_size="medium",
        segment_seconds=10 * 60,  # 10 min
        device="cpu",
        compute_type="int8",
    )


Wejście: /content/sample_data/trans_cw3-4.mp4  |  rozmiar: 247.19 MB
1) Ekstrakcja audio
2) Dzielenie na segmenty
3) Ładowanie modelu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocabulary.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

4) Transkrypcja (zapis TXT + SRT)
   Segment 1/27: segment_00000.wav
   Segment 2/27: segment_00001.wav
   Segment 3/27: segment_00002.wav
   Segment 4/27: segment_00003.wav
   Segment 5/27: segment_00004.wav


KeyboardInterrupt: 