In [1]:
import os
import sys
import pandas as pd
from tqdm import tqdm

# Make the local ``src`` directory importable so the project module below
# resolves relative to the notebook's working directory.
current_dir = os.getcwd()
src_path = os.path.join(current_dir, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Import feature extractor
try:
    from tiktok_feature_extractor import TikTokFeatureExtractor
    print("✓ Import successful")
except ImportError as e:
    print(f"✗ Import failed: {e}")
    # Re-raise rather than calling exit(): in a notebook, exit() shuts the
    # kernel down instead of simply stopping this cell with a traceback.
    raise

# Initialize feature extractor
print("Initializing feature extractor...")
try:
    extractor = TikTokFeatureExtractor()
    print("✓ Initialization successful")
except Exception as e:
    print(f"✗ Initialization failed: {e}")
    # Propagate the original error so the cell stops and the cause is visible.
    raise

# Set paths (all relative to the current working directory)
video_folder = "tiktok_videos"
output_folder = "tiktok_frames"
csv_output = "video_features_results.csv"

# The input must be a directory; a plain file with that name is not usable.
if not os.path.isdir(video_folder):
    raise FileNotFoundError(f"Video folder '{video_folder}' not found!")

# Create output directory
os.makedirs(output_folder, exist_ok=True)

# Get video file list. os.listdir() returns entries in arbitrary order, so the
# list is sorted to make "the first video" the same on every run and platform.
video_files = sorted(
    f for f in os.listdir(video_folder) if f.lower().endswith('.mp4')
)
print(f"Found {len(video_files)} video files")

def process_single_video(video_name=None):
    """Process a single video from ``video_folder``.

    Args:
        video_name: File name to process, exactly as it appears in
            ``video_files``. ``None`` selects the first entry of
            ``video_files``.

    Returns:
        The DataFrame returned by the extractor when it is non-empty.
        ``None`` when there are no videos, when ``video_name`` is not one of
        ``video_files``, when the extractor returns an empty frame, or when
        extraction raises.
    """
    if not video_files:
        print("No video files found")
        return None

    # Select video
    if video_name is None:
        video_file = video_files[0]  # Default to first video
    elif video_name in video_files:
        video_file = video_name
    else:
        # Do not silently fall back to another file: the caller asked for a
        # specific video and would otherwise get features for the wrong one.
        print(f"Video '{video_name}' not found in '{video_folder}'")
        return None

    video_stem = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_folder, video_file)
    # Each video gets its own sub-directory under the shared output folder.
    output_dir = os.path.join(output_folder, video_stem)

    print(f"Processing video: {video_file}")

    try:
        df = extractor.extract_features_from_single_video(
            video_path=video_path,
            output_folder=output_dir,
            # Relative path: the per-video CSV lands in the working directory.
            csv_output_path=f"{video_stem}_results.csv"
        )

        if not df.empty:
            print(f"✓ Processing completed: {video_file}")
            return df
        else:
            print(f"✗ Processing failed: {video_file}")
            return None

    except Exception as e:
        # Best-effort notebook helper: report the failure and return None
        # instead of propagating, so interactive use is not interrupted.
        print(f"✗ Processing error: {e}")
        return None

def process_all_videos():
    """Run feature extraction over every video in ``video_folder``.

    Delegates to ``extractor.extract_features_from_folder`` with the
    module-level ``video_folder``, ``output_folder`` and ``csv_output`` paths.

    Returns:
        The extractor's DataFrame when it is non-empty; ``None`` when there
        are no videos, the frame is empty, or extraction raises.
    """
    if not video_files:
        print("No video files found")
        return None

    video_count = len(video_files)
    print(f"Starting to process {video_count} videos...")

    try:
        features = extractor.extract_features_from_folder(
            video_folder=video_folder,
            output_folder=output_folder,
            csv_output_path=csv_output,
        )

        # An empty frame is reported as a failure rather than returned.
        if features.empty:
            print("✗ Processing failed")
            return None

        print(f"✓ All processing completed, {len(features)} videos total")
        return features

    except Exception as err:
        # Report and swallow so an interactive session keeps going.
        print(f"✗ Processing error: {err}")
        return None

def load_results():
    """Read the saved feature table from ``csv_output``.

    Returns:
        The DataFrame parsed from ``csv_output``, or ``None`` when the file
        does not exist or cannot be read as CSV.
    """
    # Guard clause: nothing to load until a run has written the CSV.
    if not os.path.exists(csv_output):
        print(f"Results file not found: {csv_output}")
        return None

    try:
        results = pd.read_csv(csv_output)
        print(f"Loaded results for {len(results)} videos")
        return results
    except Exception as err:
        # Report the read/parse failure and signal it with None.
        print(f"Loading failed: {err}")
        return None

  from .autonotebook import tqdm as notebook_tqdm


Loading YOLO-World model: yolov8s-world.pt




✓ Import successful
Initializing feature extractor...
✓ Initialization successful
Found 17 video files


In [None]:
# Quick check on one video: with no name given, the first entry of
# video_files is processed. df is None if processing fails.
df = process_single_video()

In [2]:
# Full batch run over every video in video_folder; the extractor is given
# csv_output as the path for the combined results. df is None on failure.
df = process_all_videos()

Starting to process 17 videos...
Found 17 video files to process


Processing videos:   0%|          | 0/17 [00:00<?, ?it/s]


Processing video: Download (1)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (1).mp4 to tiktok_frames\Download (1)\Download (1).wav
Audio extracted successfully: tiktok_frames\Download (1)\Download (1).wav
Loading VAD model on device: cuda


Using cache found in C:\Users\Shuwei Yang/.cache\torch\hub\snakers4_silero-vad_master


Loading audio for speech detection: tiktok_frames\Download (1)\Download (1).wav
Audio loaded: shape=torch.Size([1, 218454]), sample_rate=16000
Speech detection: 21.83% frames contain speech
Loading Whisper model on device: cuda
Transcribing audio: tiktok_frames\Download (1)\Download (1).wav
Transcription successful: 63 characters
Speech text saved to: tiktok_frames\Download (1)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 4 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (1)\Download (1)_keyframe_0000.jpg: 640x384 1 person, 2 chairs, 63.5ms
Speed: 2.2ms preprocess, 63.5ms inference, 15.5ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (1)\Download (1)_keyframe_0034.jpg: 640x384 1 person, 1 cell phone, 1 toothbrush, 26.7ms
Speed: 2.6ms preprocess, 26.7ms inference, 3.2ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tik

Processing videos:   6%|▌         | 1/17 [00:06<01:37,  6.08s/it]

  ✓ Completed processing Download (1)

Processing video: Download (10)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (10).mp4 to tiktok_frames\Download (10)\Download (10).wav
Audio extracted successfully: tiktok_frames\Download (10)\Download (10).wav
Loading audio for speech detection: tiktok_frames\Download (10)\Download (10).wav
Audio loaded: shape=torch.Size([1, 128591]), sample_rate=16000
Speech detection: 0.40% frames contain speech
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 3 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (10)\Download (10)_keyframe_0000.jpg: 640x384 1 person, 24.9ms
Speed: 1.6ms preprocess, 24.9ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (10)\Download (10)_keyframe_0100.jpg: 640x384 2 persons, 14.3ms
Speed: 1.3ms preprocess, 14.3ms inference, 1.8ms postp

Processing videos:  12%|█▏        | 2/17 [00:08<00:56,  3.77s/it]

  ✓ Completed processing Download (10)

Processing video: Download (11)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (11).mp4 to tiktok_frames\Download (11)\Download (11).wav
Audio extracted successfully: tiktok_frames\Download (11)\Download (11).wav
Loading audio for speech detection: tiktok_frames\Download (11)\Download (11).wav
Audio loaded: shape=torch.Size([1, 757490]), sample_rate=16000
Speech detection: 93.98% frames contain speech
Transcribing audio: tiktok_frames\Download (11)\Download (11).wav
Transcription successful: 1088 characters
Speech text saved to: tiktok_frames\Download (11)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 10 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (11)\Download (11)_keyframe_0000.jpg: 640x384 1 person, 14.1ms
Speed: 1.6ms preprocess, 14.1ms inference, 1.9ms postprocess per image at shape (1



  - Saving representative frames...
  - Saved representative frame: Download (11)_representative_03.jpg
  - Saved representative frame: Download (11)_representative_02.jpg
  - Saved representative frame: Download (11)_representative_01.jpg
  - Extracting multimodal features...

image 1/1 d:\tiktok\tiktok_frames\Download (11)\Download (11)_representative_01.jpg: 640x384 1 person, 14.0ms
Speed: 1.8ms preprocess, 14.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (11)\Download (11)_representative_02.jpg: 640x384 1 person, 14.5ms
Speed: 1.6ms preprocess, 14.5ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (11)\Download (11)_representative_03.jpg: 640x384 1 person, 14.0ms
Speed: 1.3ms preprocess, 14.0ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)


Processing videos:  18%|█▊        | 3/17 [00:17<01:28,  6.30s/it]

  ✓ Completed processing Download (11)

Processing video: Download (12)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (12).mp4 to tiktok_frames\Download (12)\Download (12).wav
Audio extracted successfully: tiktok_frames\Download (12)\Download (12).wav
Loading audio for speech detection: tiktok_frames\Download (12)\Download (12).wav
Audio loaded: shape=torch.Size([1, 876088]), sample_rate=16000
Speech detection: 94.86% frames contain speech
Transcribing audio: tiktok_frames\Download (12)\Download (12).wav
Transcription successful: 778 characters
Speech text saved to: tiktok_frames\Download (12)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 10 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (12)\Download (12)_keyframe_0000.jpg: 640x384 2 persons, 2 bottles, 1 cup, 1 bowl, 14.2ms
Speed: 1.6ms preprocess, 14.2ms inference, 2.5ms postpro




image 1/1 d:\tiktok\tiktok_frames\Download (12)\Download (12)_representative_01.jpg: 640x384 2 bottles, 3 cups, 1 dining table, 14.1ms
Speed: 1.6ms preprocess, 14.1ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (12)\Download (12)_representative_02.jpg: 640x384 1 person, 2 bottles, 14.2ms
Speed: 1.3ms preprocess, 14.2ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (12)\Download (12)_representative_03.jpg: 640x384 1 person, 3 bottles, 1 dining table, 14.3ms
Speed: 1.3ms preprocess, 14.3ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)


Processing videos:  24%|██▎       | 4/17 [00:25<01:32,  7.09s/it]

  ✓ Completed processing Download (12)

Processing video: Download (13)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (13).mp4 to tiktok_frames\Download (13)\Download (13).wav
Audio extracted successfully: tiktok_frames\Download (13)\Download (13).wav
Loading audio for speech detection: tiktok_frames\Download (13)\Download (13).wav
Audio loaded: shape=torch.Size([1, 172430]), sample_rate=16000
Speech detection: 29.17% frames contain speech
Transcribing audio: tiktok_frames\Download (13)\Download (13).wav
Transcription successful: 158 characters
Speech text saved to: tiktok_frames\Download (13)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 2 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (13)\Download (13)_keyframe_0000.jpg: 640x384 4 cars, 1 motorcycle, 1 boat, 14.4ms
Speed: 1.4ms preprocess, 14.4ms inference, 2.0ms postprocess per

Processing videos:  29%|██▉       | 5/17 [00:28<01:05,  5.43s/it]

  ✓ Completed processing Download (13)

Processing video: Download (14)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (14).mp4 to tiktok_frames\Download (14)\Download (14).wav
Audio extracted successfully: tiktok_frames\Download (14)\Download (14).wav
Loading audio for speech detection: tiktok_frames\Download (14)\Download (14).wav
Audio loaded: shape=torch.Size([1, 679925]), sample_rate=16000
Speech detection: 99.77% frames contain speech
Transcribing audio: tiktok_frames\Download (14)\Download (14).wav
Transcription successful: 1073 characters
Speech text saved to: tiktok_frames\Download (14)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Removed similar frame: Download (14)_keyframe_0819.jpg (SSIM: 0.991)
  - Removed similar frame: Download (14)_keyframe_0854.jpg (SSIM: 0.961)
  - Kept 36 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download

Processing videos:  35%|███▌      | 6/17 [00:39<01:22,  7.54s/it]

  ✓ Completed processing Download (14)

Processing video: Download (15)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (15).mp4 to tiktok_frames\Download (15)\Download (15).wav
Audio extracted successfully: tiktok_frames\Download (15)\Download (15).wav
Loading audio for speech detection: tiktok_frames\Download (15)\Download (15).wav
Audio loaded: shape=torch.Size([1, 1348660]), sample_rate=16000
Speech detection: 97.00% frames contain speech
Transcribing audio: tiktok_frames\Download (15)\Download (15).wav
Transcription successful: 1717 characters
Speech text saved to: tiktok_frames\Download (15)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 18 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (15)\Download (15)_keyframe_0000.jpg: 640x384 1 person, 1 cell phone, 61.5ms
Speed: 1.9ms preprocess, 61.5ms inference, 2.6ms postprocess per im

Processing videos:  41%|████      | 7/17 [00:53<01:34,  9.43s/it]

  ✓ Completed processing Download (15)

Processing video: Download (16)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (16).mp4 to tiktok_frames\Download (16)\Download (16).wav
Audio extracted successfully: tiktok_frames\Download (16)\Download (16).wav
Loading audio for speech detection: tiktok_frames\Download (16)\Download (16).wav
Audio loaded: shape=torch.Size([1, 116702]), sample_rate=16000
Speech detection: 9.69% frames contain speech
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 7 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (16)\Download (16)_keyframe_0000.jpg: 640x384 1 person, 1 dining table, 1 scissors, 1 toothbrush, 15.8ms
Speed: 2.0ms preprocess, 15.8ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (16)\Download (16)_keyframe_0044.jpg: 640x384 2 persons, 20.1ms
Speed: 2.6m

Processing videos:  47%|████▋     | 8/17 [00:55<01:04,  7.22s/it]

  ✓ Completed processing Download (16)

Processing video: Download (2)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (2).mp4 to tiktok_frames\Download (2)\Download (2).wav
Audio extracted successfully: tiktok_frames\Download (2)\Download (2).wav
Loading audio for speech detection: tiktok_frames\Download (2)\Download (2).wav
Audio loaded: shape=torch.Size([1, 894247]), sample_rate=16000
Speech detection: 85.51% frames contain speech
Transcribing audio: tiktok_frames\Download (2)\Download (2).wav
Transcription successful: 1057 characters
Speech text saved to: tiktok_frames\Download (2)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 13 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (2)\Download (2)_keyframe_0000.jpg: 640x384 1 person, 14.2ms
Speed: 1.6ms preprocess, 14.2ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 384




image 1/1 d:\tiktok\tiktok_frames\Download (2)\Download (2)_representative_01.jpg: 640x384 1 person, 1 toilet, 1 sink, 1 book, 14.7ms
Speed: 1.5ms preprocess, 14.7ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (2)\Download (2)_representative_02.jpg: 640x384 5 persons, 3 cars, 22.0ms
Speed: 1.7ms preprocess, 22.0ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (2)\Download (2)_representative_03.jpg: 640x384 1 person, 1 umbrella, 14.2ms
Speed: 1.4ms preprocess, 14.2ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)


Processing videos:  53%|█████▎    | 9/17 [01:05<01:02,  7.85s/it]

  ✓ Completed processing Download (2)

Processing video: Download (3)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (3).mp4 to tiktok_frames\Download (3)\Download (3).wav
Audio extracted successfully: tiktok_frames\Download (3)\Download (3).wav
Loading audio for speech detection: tiktok_frames\Download (3)\Download (3).wav
Audio loaded: shape=torch.Size([1, 412758]), sample_rate=16000
Speech detection: 79.40% frames contain speech
Transcribing audio: tiktok_frames\Download (3)\Download (3).wav
Transcription successful: 353 characters
Speech text saved to: tiktok_frames\Download (3)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 6 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (3)\Download (3)_keyframe_0000.jpg: 640x384 2 persons, 2 motorcycles, 1 bus, 1 truck, 14.8ms
Speed: 1.5ms preprocess, 14.8ms inference, 1.9ms postprocess per i

Processing videos:  59%|█████▉    | 10/17 [01:10<00:49,  7.09s/it]

  ✓ Completed processing Download (3)

Processing video: Download (4)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (4).mp4 to tiktok_frames\Download (4)\Download (4).wav
Audio extracted successfully: tiktok_frames\Download (4)\Download (4).wav
Loading audio for speech detection: tiktok_frames\Download (4)\Download (4).wav
Audio loaded: shape=torch.Size([1, 637156]), sample_rate=16000
Speech detection: 76.45% frames contain speech
Transcribing audio: tiktok_frames\Download (4)\Download (4).wav
Transcription successful: 600 characters
Speech text saved to: tiktok_frames\Download (4)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 10 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (4)\Download (4)_keyframe_0000.jpg: 640x384 2 cars, 1 traffic light, 14.2ms
Speed: 1.5ms preprocess, 14.2ms inference, 2.0ms postprocess per image at shape (1




image 1/1 d:\tiktok\tiktok_frames\Download (4)\Download (4)_representative_01.jpg: 640x384 2 cars, 1 traffic light, 15.1ms
Speed: 1.6ms preprocess, 15.1ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (4)\Download (4)_representative_02.jpg: 640x384 1 car, 14.6ms
Speed: 1.4ms preprocess, 14.6ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (4)\Download (4)_representative_03.jpg: 640x384 1 person, 1 car, 15.2ms
Speed: 1.4ms preprocess, 15.2ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)


Processing videos:  65%|██████▍   | 11/17 [01:17<00:41,  6.95s/it]

  ✓ Completed processing Download (4)

Processing video: Download (5)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (5).mp4 to tiktok_frames\Download (5)\Download (5).wav
Audio extracted successfully: tiktok_frames\Download (5)\Download (5).wav
Loading audio for speech detection: tiktok_frames\Download (5)\Download (5).wav
Audio loaded: shape=torch.Size([1, 268980]), sample_rate=16000
Speech detection: 12.38% frames contain speech
Transcribing audio: tiktok_frames\Download (5)\Download (5).wav
Transcription successful: 128 characters
Speech text saved to: tiktok_frames\Download (5)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 4 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (5)\Download (5)_keyframe_0000.jpg: 640x384 1 person, 14.6ms
Speed: 1.5ms preprocess, 14.6ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 384)



Processing videos:  71%|███████   | 12/17 [01:20<00:29,  5.84s/it]

  ✓ Completed processing Download (5)

Processing video: Download (6)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (6).mp4 to tiktok_frames\Download (6)\Download (6).wav
Audio extracted successfully: tiktok_frames\Download (6)\Download (6).wav
Loading audio for speech detection: tiktok_frames\Download (6)\Download (6).wav
Audio loaded: shape=torch.Size([1, 9523570]), sample_rate=16000
Speech detection: 65.11% frames contain speech
Transcribing audio: tiktok_frames\Download (6)\Download (6).wav
Transcription successful: 7333 characters
Speech text saved to: tiktok_frames\Download (6)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 72 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (6)\Download (6)_keyframe_0000.jpg: 640x384 1 person, 1 book, 109.3ms
Speed: 3.0ms preprocess, 109.3ms inference, 2.1ms postprocess per image at shape (1, 3



  - Saving representative frames...
  - Saved representative frame: Download (6)_representative_01.jpg
  - Saved representative frame: Download (6)_representative_05.jpg
  - Saved representative frame: Download (6)_representative_02.jpg
  - Saved representative frame: Download (6)_representative_04.jpg
  - Saved representative frame: Download (6)_representative_03.jpg
  - Extracting multimodal features...

image 1/1 d:\tiktok\tiktok_frames\Download (6)\Download (6)_representative_01.jpg: 640x384 1 person, 1 sandwich, 1 donut, 1 book, 14.9ms
Speed: 1.4ms preprocess, 14.9ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (6)\Download (6)_representative_02.jpg: 640x384 1 person, 1 bowl, 1 dining table, 1 book, 14.1ms
Speed: 1.3ms preprocess, 14.1ms inference, 2.3ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (6)\Download (6)_representative_03.jpg: 640x384 1 person, 1 book, 14.0ms
S

Processing videos:  76%|███████▋  | 13/17 [02:26<01:35, 23.99s/it]

  ✓ Completed processing Download (6)

Processing video: Download (7)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (7).mp4 to tiktok_frames\Download (7)\Download (7).wav
Audio extracted successfully: tiktok_frames\Download (7)\Download (7).wav
Loading audio for speech detection: tiktok_frames\Download (7)\Download (7).wav
Audio loaded: shape=torch.Size([1, 478562]), sample_rate=16000
Speech detection: 85.01% frames contain speech
Transcribing audio: tiktok_frames\Download (7)\Download (7).wav
Transcription successful: 502 characters
Speech text saved to: tiktok_frames\Download (7)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 6 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (7)\Download (7)_keyframe_0000.jpg: 640x384 1 person, 1 cell phone, 14.5ms
Speed: 1.5ms preprocess, 14.5ms inference, 1.9ms postprocess per image at shape (1, 

Processing videos:  82%|████████▏ | 14/17 [02:32<00:55, 18.55s/it]

  ✓ Completed processing Download (7)

Processing video: Download (8)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (8).mp4 to tiktok_frames\Download (8)\Download (8).wav
Audio extracted successfully: tiktok_frames\Download (8)\Download (8).wav
Loading audio for speech detection: tiktok_frames\Download (8)\Download (8).wav
Audio loaded: shape=torch.Size([1, 367106]), sample_rate=16000
Speech detection: 0.00% frames contain speech
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 21 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (8)\Download (8)_keyframe_0000.jpg: 640x384 1 person, 27.4ms
Speed: 1.9ms preprocess, 27.4ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (8)\Download (8)_keyframe_0047.jpg: 640x384 1 person, 33.6ms
Speed: 2.0ms preprocess, 33.6ms inference, 3.4ms postprocess per i



  - Saving representative frames...
  - Saved representative frame: Download (8)_representative_02.jpg
  - Saved representative frame: Download (8)_representative_01.jpg
  - Saved representative frame: Download (8)_representative_03.jpg
  - Saved representative frame: Download (8)_representative_04.jpg
  - Extracting multimodal features...

image 1/1 d:\tiktok\tiktok_frames\Download (8)\Download (8)_representative_01.jpg: 640x384 1 person, 15.1ms
Speed: 1.6ms preprocess, 15.1ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (8)\Download (8)_representative_02.jpg: 640x384 1 person, 14.8ms
Speed: 1.6ms preprocess, 14.8ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download (8)\Download (8)_representative_03.jpg: 640x384 1 person, 14.1ms
Speed: 1.4ms preprocess, 14.1ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Downl

Processing videos:  88%|████████▊ | 15/17 [02:37<00:29, 14.57s/it]

  ✓ Completed processing Download (8)

Processing video: Download (9)
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download (9).mp4 to tiktok_frames\Download (9)\Download (9).wav
Audio extracted successfully: tiktok_frames\Download (9)\Download (9).wav
Loading audio for speech detection: tiktok_frames\Download (9)\Download (9).wav
Audio loaded: shape=torch.Size([1, 238561]), sample_rate=16000
Speech detection: 94.84% frames contain speech
Transcribing audio: tiktok_frames\Download (9)\Download (9).wav
Transcription successful: 333 characters
Speech text saved to: tiktok_frames\Download (9)\speech_text.txt
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 4 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download (9)\Download (9)_keyframe_0000.jpg: 640x384 1 person, 1 book, 2 vases, 14.6ms
Speed: 1.6ms preprocess, 14.6ms inference, 2.9ms postprocess per image at shape (

Processing videos:  94%|█████████▍| 16/17 [02:41<00:11, 11.48s/it]

  ✓ Completed processing Download (9)

Processing video: Download
  - Extracting video metadata...
  - Processing audio...
Extracting audio from tiktok_videos\Download.mp4 to tiktok_frames\Download\Download.wav
Audio extracted successfully: tiktok_frames\Download\Download.wav
Loading audio for speech detection: tiktok_frames\Download\Download.wav
Audio loaded: shape=torch.Size([1, 266008]), sample_rate=16000
Speech detection: 0.00% frames contain speech
  - Extracting keyframes...
  - Filtering similar keyframes...
  - Kept 8 frames after filtering
  - Selecting representative frames...

image 1/1 d:\tiktok\tiktok_frames\Download\Download_keyframe_0000.jpg: 640x384 1 bottle, 1 chair, 1 potted plant, 17.9ms
Speed: 1.5ms preprocess, 17.9ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download\Download_keyframe_0064.jpg: 640x384 3 persons, 1 cup, 14.3ms
Speed: 1.5ms preprocess, 14.3ms inference, 2.0ms postprocess per image at shape (1




image 1/1 d:\tiktok\tiktok_frames\Download\Download_representative_01.jpg: 640x384 3 persons, 1 cup, 14.8ms
Speed: 1.6ms preprocess, 14.8ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download\Download_representative_02.jpg: 640x384 3 persons, 2 bottles, 1 cup, 1 sink, 14.3ms
Speed: 1.4ms preprocess, 14.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

image 1/1 d:\tiktok\tiktok_frames\Download\Download_representative_03.jpg: 640x384 1 bottle, 4 cups, 3 potted plants, 1 dining table, 14.2ms
Speed: 1.5ms preprocess, 14.2ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)


Processing videos: 100%|██████████| 17/17 [02:44<00:00,  9.70s/it]

  ✓ Completed processing Download

Results saved to: video_features_results.csv
✓ All processing completed, 17 videos total





In [None]:
# Reload previously saved results from csv_output without re-running
# extraction. df is None if the file is missing or unreadable.
df = load_results()