In [None]:
# Install Twelve Labs SDK
!pip install -U -q twelvelabs

In [None]:
# Install the lancedb library
!pip install -U -q lancedb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.9/29.9 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Set Up Vector Database
import lancedb
import pyarrow as pa

# Create or connect to the LanceDB database named "my_video_embeddings_db"
db = lancedb.connect("my_video_embeddings_db")

# Column layout for one stored video segment, written as (name, type) pairs
schema = pa.schema([
    ("video_id", pa.string()),
    ("embedding", pa.list_(pa.float32(), 1024)),  # Adjust the vector dimension as needed
    ("start_time", pa.float32()),
    ("end_time", pa.float32()),
    ("video_url", pa.string()),
])

# Create the "video_embeddings" table with that schema (requested with mode="overwrite")
table = db.create_table(
    "video_embeddings",
    schema=schema,
    mode="overwrite",
)

In [None]:
# Load the Twelve Labs API key without hardcoding it:
#   - on Google Colab, read it from the notebook's secret store;
#   - anywhere else, fall back to the TL_API_KEY environment variable.
try:
    from google.colab import userdata
except ImportError:
    import os
    TL_API_KEY = os.environ.get('TL_API_KEY')
else:
    TL_API_KEY = userdata.get('TL_API_KEY')

# Fail here with a clear message rather than later inside an API call.
if not TL_API_KEY:
    raise RuntimeError(
        "TL_API_KEY is not set: add it as a Colab secret or export it as an "
        "environment variable before running this notebook."
    )

In [None]:
# Generating Embeddings with Twelve Labs API
from twelvelabs import TwelveLabs
from twelvelabs.models.embed import EmbeddingsTask

# Initialize the Twelve Labs client with the API key held in TL_API_KEY.
# This module-level client is the one used by generate_embedding() and
# get_text_embedding() in this notebook.
twelvelabs_client = TwelveLabs(api_key=TL_API_KEY)

def generate_embedding(video_url):
    """Create a video-embedding task for ``video_url`` and collect its segments.

    Uses the module-level ``twelvelabs_client`` to create a
    "Marengo-retrieval-2.7" embedding task, polls it every 2 seconds
    (printing each status update) and, once it is ready, retrieves the result.

    Args:
        video_url: URL of the video to embed.

    Returns:
        A ``(embeddings, task_result)`` tuple. ``embeddings`` is a list with
        one dict per returned segment, holding the keys ``'embedding'``,
        ``'start_offset_sec'``, ``'end_offset_sec'`` and ``'embedding_scope'``;
        it is empty when the result carries no segments. ``task_result`` is
        the object returned by ``embed.task.retrieve``.

    Raises:
        RuntimeError: if the task finishes with any status other than "ready".
    """
    # Create an embedding task
    task = twelvelabs_client.embed.task.create(
        model_name="Marengo-retrieval-2.7",
        video_url=video_url
    )

    print(f"Created task: id={task.id} status={task.status} model_name={task.model_name}")

    # Define a callback function to monitor task progress
    def on_task_update(task: EmbeddingsTask):
        print(f" Status={task.status}")

    # Wait for the task to complete
    status = task.wait_for_done(
        sleep_interval=2,
        callback=on_task_update
    )
    print(f"Embedding done: {status}")

    # Stop here if the task did not succeed instead of failing later on a
    # result that has no embeddings attached.
    if status != "ready":
        raise RuntimeError(
            f"Embedding task {task.id} for {video_url} ended with status {status!r}"
        )

    # Retrieve the task result
    task_result = twelvelabs_client.embed.task.retrieve(task.id)

    # Access the segments through video_embedding.segments; treat a missing
    # video_embedding or segments attribute value as "no segments".
    video_embedding = task_result.video_embedding
    segments = video_embedding.segments if video_embedding is not None else None

    # Extract the fields the rest of the notebook needs from each segment
    embeddings = [
        {
            'embedding': segment.embeddings_float,
            'start_offset_sec': segment.start_offset_sec,
            'end_offset_sec': segment.end_offset_sec,
            'embedding_scope': segment.embedding_scope  # clip or video
        }
        for segment in (segments or [])
    ]

    return embeddings, task_result

# Example usage: embed a sample video and report what came back
video_url = "http://lakshonline.com/wp-content/uploads/2024/12/nicole_trailer.mp4"
embeddings, task_result = generate_embedding(video_url)
print(f"Generated {len(embeddings)} embeddings for the video")

# One report per segment: scope, time range, a short preview of the vector,
# then a blank separator line.
for i, emb in enumerate(embeddings):
    print(
        f"Embedding {i+1}:",
        f" Scope: {emb['embedding_scope']}",
        f" Time range: {emb['start_offset_sec']} - {emb['end_offset_sec']} seconds",
        f" Embedding vector (first 5 values): {emb['embedding'][:5]}",
        "",
        sep="\n",
    )

Created task: id=6768eb16c8d47cd895c4308e status=processing model_name=Marengo-retrieval-2.7
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=processing
 Status=ready
Embedding done: ready
Generated 24 embeddings for the video
Embedding 1:
 Scope: clip
 Time range: 0.0 - 6.0 seconds
 Embedding vector (first 5 values): [0.082840234, -0.01033165, 0.00512286, 0.010688391, -0.01757237]

Embedding 2:
 Scope: clip
 Time range: 6.0 - 12.0 seconds
 Embedding vector (first 5 values): [0.052370224, -0.0027802594, -0.03038719, 0.026639426, -0.025815303]

Embedding 3:
 Scope: clip
 Time range: 12.0 - 18.0 seconds
 Embedding vector (first 5 values): [0.084818855, -0.009348972, 0.012168455, -0.01385785, -0.02165269]

Embedding 4:
 Scope: clip
 Time range: 18.0

In [None]:
# Storing Embeddings into LanceDB
import uuid

# Function to insert embeddings into LanceDB
def insert_embeddings(embeddings, video_url):
    """Store the segment embeddings of one video in the module-level ``table``.

    Args:
        embeddings: iterable of dicts with the keys ``'embedding'``,
            ``'start_offset_sec'`` and ``'end_offset_sec'`` (the shape produced
            by ``generate_embedding``). ``None`` or an empty iterable is
            accepted and results in no database call.
        video_url: URL stored alongside every row.

    Returns:
        None. A one-line summary is printed.
    """
    data_to_insert = [
        {
            # NOTE(review): despite the column name, this ID is unique per
            # segment row, not per video — rows of one video share only video_url.
            "video_id": str(uuid.uuid4()),
            "embedding": emb['embedding'],
            "start_time": emb['start_offset_sec'],
            "end_time": emb['end_offset_sec'],
            "video_url": video_url
        }
        for emb in (embeddings or [])
    ]

    # Nothing to store: skip the database call entirely.
    if not data_to_insert:
        print(f"No embeddings to insert for video: {video_url}")
        return

    table.add(data_to_insert)
    print(f"Inserted {len(data_to_insert)} embeddings for video: {video_url}")

In [None]:
# Use the function to insert the embeddings we generated earlier
# NOTE(review): insert_embeddings calls table.add, so re-running this cell
# presumably appends the same segments again — confirm, or recreate the table
# before re-running.
insert_embeddings(embeddings, video_url)

# Verify the insertion by printing the table's length
print(f"Total embeddings in the table: {len(table)}")

Inserted 24 embeddings for video: http://lakshonline.com/wp-content/uploads/2024/12/nicole_trailer.mp4
Total embeddings in the table: 24


In [None]:
# Querying and Retrieving Embeddings
# Function to perform similarity search
def similarity_search(query_embedding, k=2):
    """Search the module-level ``table`` with ``query_embedding``.

    Args:
        query_embedding: vector to search with.
        k: maximum number of rows to return (default 2).

    Returns:
        The search result converted with ``to_list()``.
    """
    query = table.search(query_embedding)
    limited = query.limit(k)
    return limited.to_list()

# Function to get embedding for a text query
def get_text_embedding(text_query):
    """Embed ``text_query`` with the "Marengo-retrieval-2.7" model.

    Args:
        text_query: text to embed.

    Returns:
        The ``embeddings_float`` of the first returned text segment, or
        ``None`` when the response contains no segments.
    """
    response = twelvelabs_client.embed.create(
        model_name="Marengo-retrieval-2.7",
        text=text_query,
        text_truncate="start"
    )

    # No segments came back: signal that with None.
    if not response.text_embedding.segments:
        return None

    # Use the first (and likely only) segment
    first_segment = response.text_embedding.segments[0]
    return first_segment.embeddings_float

# Example usage: embed a text query and look up the closest stored segments
text_query = "An window with skyscrapers"
query_embedding = get_text_embedding(text_query)
print(
    "\nQuery embedding length:",
    len(query_embedding) if query_embedding else "No embedding found",
)

# Only search when an embedding was actually returned
if query_embedding:
    search_results = similarity_search(query_embedding)
    for i, result in enumerate(search_results):
        print(
            f"\nMatch {i+1}:",
            f"Distance: {result['_distance']:.4f}",  # distance, so lower is better
            f"Time range: {result['start_time']:.2f}s - {result['end_time']:.2f}s",
            f"Video ID: {result['video_id']}",
            f"Video URL: {result['video_url']}",
            sep="\n",
        )


Query embedding length: 1024

Match 1:
Distance: 1.3039
Time range: 36.00s - 42.00s
Video ID: 7f590871-9fcf-4c88-9f20-c3236fed19d0
Video URL: http://lakshonline.com/wp-content/uploads/2024/12/nicole_trailer.mp4

Match 2:
Distance: 1.4188
Time range: 54.00s - 60.00s
Video ID: 5895e425-97a8-49ca-b6b3-e88ce52b298c
Video URL: http://lakshonline.com/wp-content/uploads/2024/12/nicole_trailer.mp4


In [None]:
# Function to retrieve video segment details
def get_video_segment(video_url, start_time, end_time):
    """Return ``video_url`` with ``start`` and ``end`` query parameters added.

    The parameters are merged into the URL rather than blindly appended, so a
    URL that already has a query string gets ``&start=...&end=...`` and any
    fragment stays at the end. For a URL with neither, the result is
    ``f"{video_url}?start={start_time}&end={end_time}"``.

    Args:
        video_url: URL of the video.
        start_time: segment start, formatted into the URL as-is.
        end_time: segment end, formatted into the URL as-is.

    Returns:
        The URL string carrying the two parameters.
        NOTE(review): whether the server hosting the video honors ``start`` /
        ``end`` is not visible from this notebook — confirm.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(video_url)
    segment_query = f"start={start_time}&end={end_time}"
    # Keep any query parameters that were already present
    merged_query = f"{parts.query}&{segment_query}" if parts.query else segment_query
    return urlunsplit(parts._replace(query=merged_query))

# Build and print the segment link for the top result (first search result)
top_result = search_results[0]
video_segment = get_video_segment(
    video_url=top_result['video_url'],
    start_time=top_result['start_time'],
    end_time=top_result['end_time'],
)
print(f"Link to top result video segment: {video_segment}")

Link to top result video segment: http://lakshonline.com/wp-content/uploads/2024/12/nicole_trailer.mp4?start=36.0&end=42.0
