# !Note:
This is a preliminary implementation/idea that still has to be tested and validated. It may contain bugs or mistakes, but it shows the general idea of using video embeddings in BERTopic instead of text embeddings.


# General Explanation
This code shows how to implement BERTopic for video topic modeling and clustering. BERTopic is a technique used to extract topics from a set of documents through several steps:

- Embedding Documents: Typically, BERT or similar models are used to create embeddings for text documents. However, this approach can be adapted to use any type of embeddings. In this case, X-CLIP, an extension of CLIP to video, is used to generate embeddings from video frames.
- Dimensionality Reduction: The high-dimensional embeddings are reduced to a lower-dimensional space using UMAP.
- Clustering: The reduced embeddings are clustered using HDBSCAN.
- Topic Representation: Topics are represented by key terms extracted from the clusters. This is done using c-TF-IDF (class-based Term Frequency-Inverse Document Frequency) to generate candidate terms and MMR (Maximal Marginal Relevance) to select the most relevant terms for the best possible topic representation.

In traditional BERTopic applications, text documents are embedded and these same texts are used for topic representation. In this implementation, we use video frame embeddings obtained via X-CLIP, but use text descriptions of the videos (such as the description or transcriptions) to represent the topics.


# References:
- X-CLIP:
    - HuggingFace: https://huggingface.co/docs/transformers/model_doc/xclip
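    - Paper: X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval (https://arxiv.org/abs/2207.07285)
    - Note: the `microsoft/xclip-*` checkpoints on HuggingFace (used below) come from a different paper that uses the same model name: Expanding Language-Image Pretrained Models for General Video Recognition (https://arxiv.org/abs/2208.02816)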
- BERTopic:
    - https://medium.com/data-reply-it-datatech/bertopic-topic-modeling-as-you-have-never-seen-it-before-abb48bbab2b2
    - https://maartengr.github.io/BERTopic/index.html#visualizations
    - Paper: BERTopic: Neural topic modeling with a class-based TF-IDF procedure (https://arxiv.org/abs/2203.05794)

# Imports and Helper Functions

In [None]:
import av
import torch
import numpy as np
import pandas as pd
from transformers import AutoProcessor, AutoModel
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import os
from tqdm import tqdm

# Seed NumPy's global RNG so that the random frame window drawn with
# np.random.randint in sample_frame_indices is reproducible across runs.
np.random.seed(0)

2024-06-19 11:22:44.495225: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-19 11:22:44.553365: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Function to extract video features
def get_video_features(video_path, clip_len=8, frame_sample_rate=1):
    """Embed one video with the module-level ``processor`` and ``model``.

    Frame indices are chosen with ``sample_frame_indices``, the frames are
    decoded with ``read_video_pyav`` and the clip is passed through
    ``model.get_video_features``.

    Args:
        video_path: Path of the video file to open with PyAV.
        clip_len: Number of frame indices to sample.
        frame_sample_rate: Multiplier applied to ``clip_len`` to obtain the
            width (in frames) of the window the indices are sampled from.

    Returns:
        Whatever ``model.get_video_features`` returns for the sampled clip.
    """
    # The context manager closes the container even when sampling or decoding
    # raises, so file handles do not pile up over a long batch of videos.
    with av.open(video_path) as container:
        indices = sample_frame_indices(clip_len, frame_sample_rate, container.streams.video[0].frames)
        video = read_video_pyav(container, indices)

    inputs = processor(videos=list(video), return_tensors="pt")
    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        video_features = model.get_video_features(**inputs)

    return video_features

# Function to decode video with PyAV decoder
def read_video_pyav(container, indices):
    """Decode the frames at ``indices`` and stack them into one RGB array.

    Args:
        container: Object exposing ``seek`` and ``decode(video=0)`` (an opened
            PyAV container); decoding starts from the beginning.
        indices: Non-empty sequence or array of frame indices to fetch.

    Returns:
        ``np.ndarray`` with one ``rgb24`` frame per requested index, in the
        order given by ``indices``. A repeated index repeats its frame, so the
        clip keeps the requested length. Indices the decoder never reaches
        are skipped.

    Raises:
        IndexError: If ``indices`` is empty.
        ValueError: If none of the requested frames could be decoded.
    """
    requested = np.asarray(indices).tolist()
    if not requested:
        raise IndexError("indices must contain at least one frame index")

    # A set gives O(1) membership tests instead of scanning the index array
    # once for every decoded frame.
    wanted = set(requested)
    last_index = max(wanted)

    container.seek(0)
    decoded = {}
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_index:
            break  # nothing left to collect, stop decoding early
        if i in wanted:
            decoded[i] = frame.to_ndarray(format="rgb24")

    frames = [decoded[i] for i in requested if i in decoded]
    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)

# Function to sample frame indices from the video
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Pick ``clip_len`` frame indices from a video that has ``seg_len`` frames.

    When the video is longer than ``clip_len * frame_sample_rate`` frames, a
    window of that width ending at a random position (NumPy global RNG) is
    chosen and the indices are spread evenly over it. A shorter video cannot
    host such a window, so the indices are spread evenly over the whole video
    instead, repeating frames where necessary.

    Args:
        clip_len: Number of indices to return.
        frame_sample_rate: Multiplier applied to ``clip_len`` to get the
            window width in frames.
        seg_len: Total number of frames in the video.

    Returns:
        ``np.ndarray`` of ``int64`` with ``clip_len`` non-decreasing indices,
        all inside ``[0, seg_len - 1]``.

    Raises:
        ValueError: If ``seg_len`` is not positive (e.g. the container does
            not report a frame count).
    """
    if seg_len <= 0:
        raise ValueError(f"cannot sample frames from a video with {seg_len} frames")

    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # np.random.randint(converted_len, seg_len) would raise here because
        # the range is empty; cover the whole (short) video instead.
        return np.rint(np.linspace(0, seg_len - 1, num=clip_len)).astype(np.int64)

    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # end_idx itself is exclusive, hence the clip to end_idx - 1.
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

# Function to save progress to a JSON file
def save_progress(json_path, video_features_list, errors):
    """Write the current extraction state to ``json_path`` as JSON.

    Args:
        json_path: Destination path of the checkpoint file.
        video_features_list: JSON-serialisable list stored under the
            ``'video_features'`` key.
        errors: JSON-serialisable mapping stored under the ``'errors'`` key.
    """
    data_to_save = {
        'video_features': video_features_list,
        'errors': errors,
    }
    # Write to a sibling temp file and swap it in with os.replace, so an
    # interruption in the middle of a write cannot leave a truncated
    # checkpoint that would break the next load.
    tmp_path = f"{json_path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(data_to_save, f)
    os.replace(tmp_path, json_path)

# Function to load progress from a JSON file
def load_progress(json_path):
    """Return the ``(video_features, errors)`` pair stored at ``json_path``.

    When no file exists at ``json_path`` an empty list and an empty dict are
    returned, i.e. the state of a fresh run.
    """
    # Nothing saved yet: start from scratch.
    if not os.path.exists(json_path):
        return [], {}

    with open(json_path, 'r') as checkpoint:
        saved = json.load(checkpoint)
    return saved['video_features'], saved['errors']


In [None]:
# Function to extract video features
def get_video_features(video_path, clip_len=8, frame_sample_rate=1):
    """Embed one video with the module-level ``processor`` and ``model``.

    Frame indices are chosen with ``sample_frame_indices``, the frames are
    decoded with ``read_video_pyav`` and the clip is passed through
    ``model.get_video_features``.

    Args:
        video_path: Path of the video file to open with PyAV.
        clip_len: Number of frame indices to sample.
        frame_sample_rate: Multiplier applied to ``clip_len`` to obtain the
            width (in frames) of the window the indices are sampled from.

    Returns:
        Whatever ``model.get_video_features`` returns for the sampled clip.
    """
    # The context manager closes the container even when sampling or decoding
    # raises, so file handles do not pile up over a long batch of videos.
    with av.open(video_path) as container:
        indices = sample_frame_indices(clip_len, frame_sample_rate, container.streams.video[0].frames)
        video = read_video_pyav(container, indices)

    inputs = processor(videos=list(video), return_tensors="pt")
    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        video_features = model.get_video_features(**inputs)

    return video_features

# Function to decode video with PyAV decoder
def read_video_pyav(container, indices):
    """Decode the frames at ``indices`` and stack them into one RGB array.

    Args:
        container: Object exposing ``seek`` and ``decode(video=0)`` (an opened
            PyAV container); decoding starts from the beginning.
        indices: Non-empty sequence or array of frame indices to fetch.

    Returns:
        ``np.ndarray`` with one ``rgb24`` frame per requested index, in the
        order given by ``indices``. A repeated index repeats its frame, so the
        clip keeps the requested length. Indices the decoder never reaches
        are skipped.

    Raises:
        IndexError: If ``indices`` is empty.
        ValueError: If none of the requested frames could be decoded.
    """
    requested = np.asarray(indices).tolist()
    if not requested:
        raise IndexError("indices must contain at least one frame index")

    # A set gives O(1) membership tests instead of scanning the index array
    # once for every decoded frame.
    wanted = set(requested)
    last_index = max(wanted)

    container.seek(0)
    decoded = {}
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_index:
            break  # nothing left to collect, stop decoding early
        if i in wanted:
            decoded[i] = frame.to_ndarray(format="rgb24")

    frames = [decoded[i] for i in requested if i in decoded]
    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)

# Function to sample frame indices from the video
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Pick ``clip_len`` frame indices from a video that has ``seg_len`` frames.

    When the video is longer than ``clip_len * frame_sample_rate`` frames, a
    window of that width ending at a random position (NumPy global RNG) is
    chosen and the indices are spread evenly over it. A shorter video cannot
    host such a window, so the indices are spread evenly over the whole video
    instead, repeating frames where necessary.

    Args:
        clip_len: Number of indices to return.
        frame_sample_rate: Multiplier applied to ``clip_len`` to get the
            window width in frames.
        seg_len: Total number of frames in the video.

    Returns:
        ``np.ndarray`` of ``int64`` with ``clip_len`` non-decreasing indices,
        all inside ``[0, seg_len - 1]``.

    Raises:
        ValueError: If ``seg_len`` is not positive (e.g. the container does
            not report a frame count).
    """
    if seg_len <= 0:
        raise ValueError(f"cannot sample frames from a video with {seg_len} frames")

    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # np.random.randint(converted_len, seg_len) would raise here because
        # the range is empty; cover the whole (short) video instead.
        return np.rint(np.linspace(0, seg_len - 1, num=clip_len)).astype(np.int64)

    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # end_idx itself is exclusive, hence the clip to end_idx - 1.
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

# Function to save progress to a JSON file
def save_progress(json_path, video_features_list, errors):
    """Write the current extraction state to ``json_path`` as JSON.

    Args:
        json_path: Destination path of the checkpoint file.
        video_features_list: JSON-serialisable list stored under the
            ``'video_features'`` key.
        errors: JSON-serialisable mapping stored under the ``'errors'`` key.
    """
    data_to_save = {
        'video_features': video_features_list,
        'errors': errors,
    }
    # Write to a sibling temp file and swap it in with os.replace, so an
    # interruption in the middle of a write cannot leave a truncated
    # checkpoint that would break the next load.
    tmp_path = f"{json_path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(data_to_save, f)
    os.replace(tmp_path, json_path)

# Function to load progress from a JSON file
def load_progress(json_path):
    """Return the ``(video_features, errors)`` pair stored at ``json_path``.

    When no file exists at ``json_path`` an empty list and an empty dict are
    returned, i.e. the state of a fresh run.
    """
    # Nothing saved yet: start from scratch.
    if not os.path.exists(json_path):
        return [], {}

    with open(json_path, 'r') as checkpoint:
        saved = json.load(checkpoint)
    return saved['video_features'], saved['errors']


# Obtain Video Features/Embeddings

Input: video paths
Output: video embeddings

In [None]:
# Load the X-CLIP processor and model used by get_video_features
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")

# Mock data for video paths
mock_video_paths = ['video1.mp4', 'video2.mp4', 'video3.mp4']
mock_data = {'video_path': mock_video_paths}

# Save mock data to a JSON file
with open('mock_data.json', 'w') as f:
    json.dump(mock_data, f)

# Load mock data
with open('mock_data.json', 'r') as f:
    data = json.load(f)

# Resume from a previous run: every processed video (successful or not) has
# one entry in video_features_list, so its length is the next index to process.
json_path = 'video_features.json'
video_features_list, errors = load_progress(json_path)
start_index = len(video_features_list)

all_video_paths = data['video_path']
progress_bar = tqdm(
    all_video_paths[start_index:],
    desc="Processing videos",
    initial=start_index,
    # Without an explicit total the bar would start at `initial` but only
    # count up to the length of the remaining slice.
    total=len(all_video_paths),
)
for i, video_path in enumerate(progress_bar):
    try:
        video_features = get_video_features(video_path)
        video_features_list.append({
            'video_path': video_path,
            'features': video_features.detach().cpu().numpy().tolist()
        })
    except Exception as e:
        # Best effort: record the failure and keep going with the next video.
        print(f"Error processing video {video_path}: {e}")
        errors[video_path] = str(e)
        video_features_list.append({
            'video_path': video_path,
            'features': None
        })

    # Save progress every 100 videos
    if (i + 1) % 100 == 0:
        save_progress(json_path, video_features_list, errors)

# Save any remaining progress
save_progress(json_path, video_features_list, errors)

# Convert list of features to a single numpy array
feature_rows = [np.array(item['features']) for item in video_features_list if item['features'] is not None]

if feature_rows:
    video_features_array = np.vstack(feature_rows)

    # Convert numpy array to a tensor
    video_features = torch.tensor(video_features_array)

    # Save the video features tensor
    torch.save(video_features, 'video_features.pt')
else:
    # np.vstack raises on an empty list; this happens when every video failed
    # (e.g. the mock paths above do not exist on disk).
    print("No video features were extracted; 'video_features.pt' was not written.")

# Load Video Features and Match Descriptions
Before running BERTopic, match each video embedding with its description/text. These texts are what c-TF-IDF and MMR use to build the topic representations.

In [None]:
# Mock data for video descriptions
mock_videoData = pd.DataFrame({
    'video_path': ['video1.mp4', 'video2.mp4', 'video3.mp4'],
    'desc': ['A video about cats', 'A video about dogs', 'A video about birds']
})

# Save mock data to a CSV file
mock_videoData.to_csv('mock_videoData.csv', index=False)

# Load video features
json_path = 'video_features.json'
with open(json_path, 'r') as f:
    data = json.load(f)

video_features_list = data['video_features']

# Load video descriptions
videoData = pd.read_csv('mock_videoData.csv')

# Build the path -> description lookup once instead of scanning the DataFrame
# for every video. drop_duplicates keeps the first row per path, i.e. the same
# description a `.values[0]` lookup would pick.
first_rows = videoData.drop_duplicates('video_path')
description_by_path = dict(zip(first_rows['video_path'], first_rows['desc']))

valid_descriptions = []
valid_features = []

for item in video_features_list:
    if item['features'] is None:
        continue  # extraction failed for this video

    video_path = item['video_path']
    features = np.array(item['features'])  # Convert to numpy array

    # When the stored array has a leading axis of length 1 (presumably the
    # batch axis from embedding one video at a time), drop it so that each
    # video contributes a single vector. Adapt this to your own feature shape.
    if features.shape[0] == 1:
        features = features.squeeze(axis=0)
        print(f"Squeezed dimensions of features for video {video_path}: {features.shape}")

    # Keep only videos that also have a description
    if video_path in description_by_path:
        valid_descriptions.append(str(description_by_path[video_path]))
        valid_features.append(features)

# Convert valid_features to a numpy array
valid_features_array = np.array(valid_features)


# BERTopic

In [None]:
# Preprocess descriptions
def preprocess_descriptions(descriptions):
    """Return a new list holding the lower-cased form of each description.

    Lower-casing is only an example step; extend this function with any other
    text normalisation that should run before topic modelling.
    """
    return [text.lower() for text in descriptions]

In [None]:
# Initialize BERTopic model
vectorizer_model = CountVectorizer(stop_words='english')
topic_model = BERTopic(vectorizer_model=vectorizer_model)

processed_descriptions = preprocess_descriptions(valid_descriptions)

# Fit the BERTopic model with the valid descriptions and embeddings
topics, probabilities = topic_model.fit_transform(processed_descriptions, embeddings=valid_features_array)

# Add topics to the DataFrame.
# `topics` has one entry per document passed to fit_transform, i.e. per video
# that has BOTH features and a description in videoData. The path list must
# apply the same two conditions, otherwise topics are assigned to the wrong
# videos (or indexing fails) as soon as one video lacks a description.
described_paths = set(videoData['video_path'])
valid_paths = [
    item['video_path']
    for item in video_features_list
    if item['features'] is not None and item['video_path'] in described_paths
]
if len(valid_paths) != len(topics):
    raise ValueError(
        f"{len(topics)} topics were returned for {len(valid_paths)} videos; "
        "descriptions and features are out of sync"
    )

topic_by_path = dict(zip(valid_paths, topics))
# dtype=object keeps None for videos without a topic instead of turning the
# column into floats with NaN.
videoData['topic'] = pd.Series(
    [topic_by_path.get(path) for path in videoData['video_path']],
    index=videoData.index,
    dtype=object,
)

# Display topic information
topic_info = topic_model.get_topic_info()
print(topic_info)


# Create html for visualization:
Given the videos, their clusters and the topic representations, this generates an HTML page showing up to 10 example videos per cluster. Each video is represented only by its first frame. When creating the HTML, check that the video and frame paths are correct.

In [None]:
import cv2

# Function to extract the first frame from a video
def extract_first_frame(video_path, output_path):
    """Write the first frame of ``video_path`` to ``output_path`` as an image.

    Args:
        video_path: Path of the video to read with ``cv2.VideoCapture``.
        output_path: Destination image path passed to ``cv2.imwrite``.

    Returns:
        ``True`` when a frame was read and written, ``False`` otherwise, so
        callers can detect unreadable videos instead of failing silently.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        success, frame = cap.read()
        if not success:
            return False
        return bool(cv2.imwrite(output_path, frame))
    finally:
        # Release the capture even if reading or writing raises.
        cap.release()

# Function to create an HTML file for visualizing the topics
def create_html_for_topics(video_data, topic_model, output_dir):
    topic_info = topic_model.get_topic_info()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    html_content = """<!DOCTYPE html>
    <html>
    <head>
        <title>Video Topics</title>
        <style>
            body { font-family: Arial, sans-serif; }
            h2 { margin-top: 40px; }
            .topic-container { margin-bottom: 30px; }
            .video-container { display: flex; flex-wrap: wrap; }
            .video-item { margin: 10px; text-align: center; }
            .video-item img { width: 320px; height: auto; }
            .topic-words { font-size: 20px; font-weight: bold; }
        </style>
    </head>
    <body>"""

    for topic_num in topic_info['Topic']:
        if topic_num == -1:
            continue  # Skip the outlier topic
        topic_videos = video_data[video_data['topic'] == topic_num].head(10)  # Limit to 10 examples
        topic_words = topic_model.get_topic(topic_num)
        html_content += "<div class='topic-container'>"
        html_content += f"<h2>Topic {topic_num}</h2>"
        html_content += "<p class='topic-words'><strong>Top Words: </strong>" + ", ".join([word for word, _ in topic_words]) + "</p>"
        html_content += "<div class='video-container'>"
        
        for _, row in topic_videos.iterrows():
            video_path = row['video_path']
            first_frame_filename = os.path.basename(video_path).replace(".mp4", ".jpg")
            first_frame_path = os.path.join(output_dir, first_frame_filename)
            extract_first_frame(video_path, first_frame_path)
            html_content += f"<div class='video-item'><img src='{first_frame_path}' alt='First frame'></div>"

        html_content += "</div></div>"

    html_content += "</body></html>"
    
    with open(os.path.join(output_dir, "topics.html"), "w") as f:
        f.write(html_content)


In [None]:
# Render the topic gallery; videoData and topic_model must already exist
# (they are produced by the BERTopic cell).
output_directory = "topics_visualization"
create_html_for_topics(
    video_data=videoData,
    topic_model=topic_model,
    output_dir=output_directory,
)