In [None]:
import warnings
warnings.filterwarnings("ignore")

import whisperx
import json
import torch
import gc
import os

# ==========================================
# Path of the video to transcribe; edit this to point at your own file.
VIDEO_FILE = r"D:\IS Project\video\videoplayback.mp4"
# ==========================================

def process_video():
    """Transcribe ``VIDEO_FILE`` with WhisperX and save the result as JSON.

    The pipeline runs on the CPU and performs, in order:

    1. Check that ``VIDEO_FILE`` points at a regular file.
    2. Load the ``tiny.en`` model and transcribe the audio track.
    3. Load the alignment model for the detected language and align the
       transcribed segments against the audio.
    4. Write ``final_answer.json`` (relative to the current working
       directory) containing the source path, the concatenated transcript
       text and the aligned segments.

    Progress and errors are reported with ``print``. Any exception raised
    while loading, transcribing, aligning or saving is caught, printed
    together with its traceback, and not re-raised.

    Returns:
        None in every case.
    """
    # Require a regular file: a directory would pass a plain existence
    # check and only fail later, after the model has already been loaded.
    if not os.path.isfile(VIDEO_FILE):
        print(f"‚ùå Error: The file '{VIDEO_FILE}' was not found.")
        print("Please make sure the video is in the same folder as this script.")
        return

    print(f"--> Loading WhisperX to process: {VIDEO_FILE}...")

    # CPU is forced on purpose (the original author hit GPU memory crashes).
    device = "cpu"
    compute_type = "int8"  # compute type chosen for CPU inference
    batch_size = 2  # small batch for stability

    print(f"--> Using device: {device} (forced for stability)")

    try:
        # Transcribe: obtain the text segments.
        print("--> Loading model (this may take a moment)...")
        model = whisperx.load_model("tiny.en", device, compute_type=compute_type)

        print("--> Loading audio...")
        audio = whisperx.load_audio(VIDEO_FILE)

        print("--> Transcribing (this will take a few minutes on CPU)...")
        result = model.transcribe(audio, batch_size=batch_size)

        # Drop the transcription model before loading the alignment model
        # so both are not held in memory at the same time.
        del model
        gc.collect()

        # Align: refine the segments against the audio.
        print("--> Aligning text to audio...")
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )

        # The alignment model is no longer needed once alignment is done.
        del model_a
        gc.collect()

        # Assemble the final JSON structure; the transcript is the segment
        # texts joined by single spaces with outer whitespace removed.
        segments = result["segments"]
        full_text = " ".join(segment["text"] for segment in segments).strip()

        output_data = {
            "source_video": VIDEO_FILE,
            "transcript_text": full_text,
            "word_level_details": segments,
        }

        # Save to a JSON file in the current working directory.
        output_filename = "final_answer.json"
        with open(output_filename, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=4)

        print(f"‚úÖ Success! Data saved to: {output_filename}")

    except Exception as e:
        # Top-level boundary of the script: report the failure with full
        # context instead of letting the notebook/script crash.
        print(f"‚ùå An error occurred: {e}")
        import traceback
        traceback.print_exc()

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    process_video()

--> Loading WhisperX to process: D:\IS Project\video\videoplayback.mp4...
--> Using device: cuda
--> Loading model (this may take a moment)...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

>>Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint d:\Yoshi Studio\Reels\Reels\Lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.1+cu118. Bad things might happen unless you revert torch to 1.x.
--> Loading audio...
--> Transcribing...


: 