# Estimasi waktu ekstraksi fitur visual dan teks

In [None]:
import time
import numpy as np
import cv2
import pandas as pd
from pathlib import Path
import random
import warnings

import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
import whisper

warnings.filterwarnings("ignore")

# MODEL SETUP (VISUAL & TEXT)
print("Memuat model untuk benchmark...")
# Both handles stay None unless their loader succeeds; the benchmark section
# checks them before running.
visual_model, text_model = None, None

# 1. Visual model (ResNet50)
try:
    # ImageNet-weighted ResNet50 without the classification head, using
    # pooling='avg', re-wrapped as a Model over the same input/output tensors.
    base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
    visual_model = Model(inputs=base_model.input, outputs=base_model.output)
except Exception as err:
    print(f"❌ Gagal memuat model visual: {err}")
else:
    print("✅ Model Visual (ResNet50) siap.")

# 2. Text model (Whisper, "base" checkpoint)
try:
    text_model = whisper.load_model("base")
except Exception as err:
    print(f"❌ Gagal memuat model teks: {err}")
else:
    print("✅ Model Teks (Whisper) siap.")

# Side length (in pixels) that sampled frames are resized to before they are
# passed to the visual model.
IMG_SIZE = 224

# FUNGSI-FUNGSI EKSTRAKSI
def extract_visual_features(video_path, num_frames=30):
    """Return the mean visual feature vector of frames sampled from a video.

    ``num_frames`` frame positions are spread evenly between the first and
    last frame with ``np.linspace``. Every frame that can be read is converted
    from BGR to RGB, resized to ``IMG_SIZE`` x ``IMG_SIZE``, and the whole
    batch is passed through ``visual_model`` after ``preprocess_input``. The
    per-frame features are then averaged.

    Args:
        video_path: Location of the video file (``str`` or ``Path``).
        num_frames: Number of frame positions to sample.

    Returns:
        A 1-D array with the per-frame features averaged over axis 0, or a
        zero vector of length 2048 when the video reports no frames or no
        frame could be decoded.
    """
    # Length of the zero fallback vector; presumably equal to the output
    # width of visual_model (TODO confirm if the backbone is ever changed).
    feature_dim = 2048

    cap = cv2.VideoCapture(str(video_path))
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames < 1:
            return np.zeros(feature_dim)

        indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
        frames = []
        for i in indices:
            # Seek directly to the sampled frame instead of decoding
            # everything in between.
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if ret:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
                frames.append(frame)
    finally:
        # Always free the capture handle, including on the early return above
        # and when decoding raises.
        cap.release()

    if not frames:
        return np.zeros(feature_dim)

    features = visual_model.predict(preprocess_input(np.array(frames)), verbose=0)
    return np.mean(features, axis=0)

def extract_text_features(video_path):
    """Transcribe the audio track of a video with the loaded Whisper model.

    Args:
        video_path: Location of the video file (``str`` or ``Path``).

    Returns:
        The ``'text'`` field of the transcription result, or an empty string
        when transcription fails for any reason.
    """
    try:
        result = text_model.transcribe(str(video_path), fp16=False)
        return result['text']
    except Exception as e:
        # Best-effort: keep returning "" so one bad file does not abort the
        # run, but report the failure so an empty transcript (and its timing)
        # is not mistaken for a successful transcription.
        print(f"❌ Gagal mentranskripsi {video_path}: {e}")
        return ""

# BENCHMARK RUN
# Times both extractors on a small random sample of training videos and
# extrapolates the per-video averages to the full dataset size.
# The models are compared against None explicitly so the check does not depend
# on how the model objects define truthiness.
if visual_model is not None and text_model is not None:
    BASE_DIR = Path.cwd().parent
    VIDEO_DIR = BASE_DIR / "data" / "video" / "train"
    SAMPLE_SIZE = 5

    # Path.glob() yields nothing (rather than raising) for a directory that
    # does not exist, so the folder is checked explicitly.
    video_dir_exists = VIDEO_DIR.is_dir()
    all_videos = list(VIDEO_DIR.glob("*.mp4")) if video_dir_exists else []

    if not video_dir_exists:
        print("\nFolder video tidak ditemukan. Pastikan path sudah benar.")
    elif not all_videos:
        # Without this guard the averages below would divide by zero.
        print(f"\nTidak ada file .mp4 di {VIDEO_DIR}. Benchmark dilewati.")
    else:
        if len(all_videos) < SAMPLE_SIZE:
            sample_videos = all_videos
        else:
            sample_videos = random.sample(all_videos, SAMPLE_SIZE)

        print(f"\nMemulai benchmark pada {len(sample_videos)} video acak...")

        total_visual_time = 0
        total_text_time = 0

        # NOTE(review): the first predict()/transcribe() call may include
        # one-time warm-up cost, which would skew a 5-video average upwards;
        # consider a warm-up run if tighter estimates are needed.
        for i, video_path in enumerate(sample_videos):
            print(f"\n--- Memproses Video {i+1}/{len(sample_videos)}: {video_path.name} ---")

            # perf_counter() is the clock intended for measuring intervals;
            # time.time() can jump when the system clock is adjusted.
            start_time = time.perf_counter()
            _ = extract_visual_features(video_path)
            end_time = time.perf_counter()
            visual_time = end_time - start_time
            total_visual_time += visual_time
            print(f"Ekstraksi Visual: {visual_time:.2f} detik")

            start_time = time.perf_counter()
            transcript = extract_text_features(video_path)
            end_time = time.perf_counter()
            text_time = end_time - start_time
            total_text_time += text_time
            print(f"Ekstraksi Teks: {text_time:.2f} detik")
            print(f"Hasil Transkrip: '{transcript[:70]}...'")

        avg_visual_time = total_visual_time / len(sample_videos)
        avg_text_time = total_text_time / len(sample_videos)
        total_videos = 1003  # Number of train + test videos

        # Extrapolate: seconds per video * number of videos, in hours.
        est_visual_hours = (avg_visual_time * total_videos) / 3600
        est_text_hours = (avg_text_time * total_videos) / 3600

        print("\n" + "="*40)
        print("          HASIL BENCHMARK          ")
        print("="*40)
        print(f"Rata-rata waktu per video (Visual): {avg_visual_time:.2f} detik")
        print(f"Rata-rata waktu per video (Teks): {avg_text_time:.2f} detik")
        print("-" * 40)
        print(f"Estimasi total waktu untuk {total_videos} video (VISUAL): {est_visual_hours:.2f} JAM")
        print(f"Estimasi total waktu untuk {total_videos} video (TEKS): {est_text_hours:.2f} JAM")
        print("="*40)
else:
    print("\nBenchmark tidak bisa dijalankan karena salah satu model gagal dimuat.")

Memuat model untuk benchmark...
✅ Model Visual (ResNet50) siap.
✅ Model Teks (Whisper) siap.

Memulai benchmark pada 5 video acak...

--- Memproses Video 1/5: 464.mp4 ---
Ekstraksi Visual: 5.51 detik
Ekstraksi Teks: 11.63 detik
Hasil Transkrip: ' Saya mulainya jim dengan 7,3 kg, lalu setlah 12 tahun ini barang saya...'

--- Memproses Video 2/5: 309.mp4 ---
Ekstraksi Visual: 3.33 detik
Ekstraksi Teks: 4.35 detik
Hasil Transkrip: ' Apa lu tahu kalau lu bayar bulanan terjadi piti yang $20 itu, lu otom...'

--- Memproses Video 3/5: 261.mp4 ---
Ekstraksi Visual: 3.54 detik
Ekstraksi Teks: 7.75 detik
Hasil Transkrip: ' Kayanya, Fib's I Don't Chase I Attrade bisa kamu dapetin deh, kau pak...'

--- Memproses Video 4/5: 658.mp4 ---
Ekstraksi Visual: 3.17 detik
Ekstraksi Teks: 7.66 detik
Hasil Transkrip: ' Enak dan Viral, donut atis ini lagi rame banget di review nih. Ada ya...'

--- Memproses Video 5/5: 618.mp4 ---
Ekstraksi Visual: 4.23 detik
Ekstraksi Teks: 3.25 detik
Hasil Transkrip: ' Kenar