### Semantic Video Search

![Semantic video search](./../semantic-video-search.png)

### Set up dependencies, environment variables and Spark context

In [8]:
import json
import os
from dotenv import load_dotenv
from os import listdir
from os.path import join
from pymilvus import MilvusClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, monotonically_increasing_id, udf
from pyspark.sql.types import *
from twelvelabs import TwelveLabs
from twelvelabs.models.embed import EmbeddingsTask

# Load variables from a local .env file (if one exists) into the process
# environment before reading them. load_dotenv is imported by this notebook
# but was never called, so a key stored only in .env was never picked up.
# Variables that are already exported keep their value.
load_dotenv()

# Credentials and locations used by every cell below.
TWELVE_LABS_API_KEY = os.getenv('TWELVE_LABS_API_KEY')
VIDEO_DIR = './../videos'      # directory scanned for video files
PARQUET_DIR = './../parquet'   # root directory of the parquet embedding store
MILVUS_HOST = 'host.docker.internal'
MILVUS_COLLECTION = 'twelve_labs_video'

# Shared clients: Twelve Labs (embeddings), Spark (batch processing) and
# Milvus (vector search).
twelvelabs_client = TwelveLabs(api_key=TWELVE_LABS_API_KEY)
spark = SparkSession.builder \
    .appName('semantic-video-search') \
    .getOrCreate()
milvus_client = MilvusClient(uri=f"http://{MILVUS_HOST}:19530")

### Spark UDF to convert videos (URLs or file paths) to embeddings

In [9]:
def generate_embedding(path, file_or_url='file'):
    """Create a Twelve Labs embedding task for one video and return its clips.

    Args:
        path: Location of the video; passed as ``video_file`` when
            ``file_or_url`` is ``'file'`` and as ``video_url`` otherwise.
        file_or_url: ``'file'`` (default) or any other value to treat
            ``path`` as a URL.

    Returns:
        A list of dicts with the keys ``engine``, ``task_status``,
        ``embedding``, ``start_offset_sec``, ``end_offset_sec`` and
        ``embedding_scope``. There is one dict per video embedding of the
        finished task; when the task has no video embeddings, a single
        placeholder dict is returned whose clip fields are all ``None``.
    """
    # A client is constructed on every call rather than reusing the
    # module-level one.
    client = TwelveLabs(api_key=TWELVE_LABS_API_KEY)

    source_key = 'video_file' if file_or_url == 'file' else 'video_url'
    request = {
        'engine_name': "Marengo-retrieval-2.6",
        'video_clip_length': 10,
        source_key: path,
    }

    # Submit the task, block until it is done, then fetch its final state.
    task = client.embed.task.create(**request)
    task.wait_for_done()
    result = client.embed.task.retrieve(task.id)

    clips = result.video_embeddings
    if not clips:
        # No embeddings: keep the engine and status so the outcome is recorded.
        return [
            {
                'engine': result.engine_name,
                'task_status': result.status,
                'embedding': None,
                'start_offset_sec': None,
                'end_offset_sec': None,
                'embedding_scope': None
            }
        ]

    records = []
    for clip in clips:
        records.append({
            'engine': result.engine_name,
            'task_status': result.status,
            'embedding': clip.values,
            'start_offset_sec': clip.start_offset_sec,
            'end_offset_sec': clip.end_offset_sec,
            'embedding_scope': clip.embedding_scope
        })
    return records
    
# Spark UDF around generate_embedding. The declared return type is an array
# of clip structs whose fields match the keys of the dicts the function
# returns; every field and every array element is nullable.
generate_embedding_udf = udf(
    generate_embedding,
    returnType=ArrayType(
        StructType([
            StructField(field_name, field_type, nullable=True)
            for field_name, field_type in (
                ("engine", StringType()),
                ("task_status", StringType()),
                ("embedding", ArrayType(FloatType(), containsNull=True)),
                ("start_offset_sec", FloatType()),
                ("end_offset_sec", FloatType()),
                ("embedding_scope", StringType()),
            )
        ]),
        containsNull=True,
    ),
)

### Update embeddings
1. Read the already-processed embeddings from the parquet store.
2. List the video files in the video directory.
3. Detect new (unprocessed) videos by comparing the two.
4. Create embeddings for each new video.
5. Append the new embeddings to the parquet store.

In [13]:
# Location of the parquet embedding store.
embedding_path = f"{PARQUET_DIR}/embedding"

# Candidate videos: regular files directly inside VIDEO_DIR. Sub-directories
# are skipped because they cannot be submitted as a video file.
video_paths = [
    (join(VIDEO_DIR, f),)
    for f in listdir(VIDEO_DIR)
    if os.path.isfile(join(VIDEO_DIR, f))
]

# An explicit schema is given so that an empty listing still produces a valid
# (empty) DataFrame; Spark cannot infer a schema from an empty list.
df2 = spark.createDataFrame(video_paths, 'video_path string')

# df1 holds the stored embeddings; it stays None until a store is read.
df1 = None
if os.path.isdir(embedding_path):
    df1 = spark.read.parquet(embedding_path)
    # Keep only the videos that have no row in the store yet.
    df2 = df2.join(df1, on='video_path', how='anti')

new_video_count = df2.count()
print(f"new video count: {new_video_count}")
if new_video_count > 0:
    # Embed the new videos, append them to the store and reload the store.
    df2 = df2.withColumn("embedding", generate_embedding_udf(df2.video_path))
    df2.write.mode('append').parquet(embedding_path)
    df1 = spark.read.parquet(embedding_path)

if df1 is None:
    # Neither an existing store nor any new video: fail with a clear message
    # instead of a NameError on df1 further down.
    raise RuntimeError(
        f"no embeddings available: '{embedding_path}' does not exist and "
        f"no video files were found in '{VIDEO_DIR}'"
    )

df1.printSchema()
df1.show(10, truncate=False)
print(f"total row count = {df1.count()}")

new video count: 0
root
 |-- video_path: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- engine: string (nullable = true)
 |    |    |-- task_status: string (nullable = true)
 |    |    |-- embedding: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- start_offset_sec: float (nullable = true)
 |    |    |-- end_offset_sec: float (nullable = true)
 |    |    |-- embedding_scope: string (nullable = true)

+-----------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Insert embeddings to vector DB
1. Each dataframe record is one video and contains a list of clips; every video is split into 10-second clips.
2. Flatten the dataframe by exploding the records, going from one row per video to one row per clip.
3. Insert the clip records into the vector DB.

In [14]:
# NOTE: this cell rebinds df1 (one row per video -> one row per clip), so it
# must be re-run together with the "Update embeddings" cell above.

# One row per clip: explode each video's array of clip structs.
df1 = df1.withColumn('embedding', explode(df1.embedding))

# generate_embedding stores a placeholder clip with a null embedding when a
# task produced no embeddings. Such rows have no vector to index, so they are
# dropped before anything is sent to Milvus.
df1 = df1.where(df1.embedding.embedding.isNotNull())

# Flatten the clip struct into top-level columns and give each clip an id.
df1 = df1.select(
    monotonically_increasing_id().alias('id'),
    df1.video_path,
    df1.embedding.embedding.alias('vector'),
    df1.embedding.embedding_scope.alias('embedding_scope'),
    df1.embedding.start_offset_sec.alias('start_offset_sec'),
    df1.embedding.end_offset_sec.alias('end_offset_sec')
)
df1.printSchema()

# Bring the clip rows to the driver as plain dicts for the Milvus client.
data = df1.rdd.map(lambda row: row.asDict()).collect()

# Rebuild the collection from scratch so ids from an earlier run never linger.
if milvus_client.has_collection(collection_name=MILVUS_COLLECTION):
    milvus_client.drop_collection(collection_name=MILVUS_COLLECTION)
# dimension must equal the length of the stored vectors
# (presumably 1024 for the engine used above - TODO confirm).
milvus_client.create_collection(collection_name=MILVUS_COLLECTION, dimension=1024)
rs = milvus_client.insert(collection_name=MILVUS_COLLECTION, data=data)
print(rs)

root
 |-- id: long (nullable = false)
 |-- video_path: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- embedding_scope: string (nullable = true)
 |-- start_offset_sec: float (nullable = true)
 |-- end_offset_sec: float (nullable = true)

{'insert_count': 9, 'ids': [0, 1, 2, 8589934592, 8589934593, 17179869184, 17179869185, 25769803776, 25769803777]}


### Text search

In [18]:
def text_search(query, limit=5):
    """Embed a text query and search the Milvus collection with it.

    Args:
        query: Text passed to the Twelve Labs embed API.
        limit: Maximum number of hits requested from Milvus.

    Returns:
        The raw result of ``milvus_client.search`` for the single query
        vector, with ``id``, ``video_path`` and ``start_offset_sec`` as
        output fields.
    """
    embed_response = twelvelabs_client.embed.create(
        engine_name="Marengo-retrieval-2.6",
        text=query,
        text_truncate='none',
    )
    query_vector = embed_response.text_embedding.float
    return milvus_client.search(
        collection_name=MILVUS_COLLECTION,
        data=[query_vector],
        limit=limit,
        output_fields=['id', 'video_path', 'start_offset_sec'],
    )

def parse_search_result(rs):
    """Print each hit of the first query in a search result, one per line.

    Args:
        rs: Search result indexed by query; only ``rs[0]`` is printed.
    """
    first_query_hits = rs[0]
    for hit in first_query_hits:
        print(hit)

# Sample text queries; each one prints the top 10 matching clips.
queries = [
    'cat',
    'dog',
    'car',
    'truck',
    'bmw',
    'truck driving backward',
    'a cat on top of another cat',
    'man play with a dog',
    'dog and pool',
]
for query in queries:
    print(f"query = {query}")
    parse_search_result(text_search(query, 10))

query = cat
{'id': 17179869184, 'distance': 0.24640296399593353, 'entity': {'id': 17179869184, 'video_path': './../videos/cute-little-kitten-baby-cat-babycat-kitten-cat-720-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 2, 'distance': 0.24361078441143036, 'entity': {'id': 2, 'video_path': './../videos/cute-baby-kittens-kitten-cutecats-babycat-shorts-catfancy-1080-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 8589934592, 'distance': 0.1390482783317566, 'entity': {'id': 8589934592, 'video_path': './../videos/shihtzu-dog-barking-shihtzu-barking-dogs-shihtzupuppies-1080-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 17179869185, 'distance': 0.13413076102733612, 'entity': {'id': 17179869185, 'video_path': './../videos/baby-dog-cute-puppy-barking-4kviral-shorts-720-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 1, 'distance': 0.11543683707714081, 'entity': {'id': 1, 'video_path': './../videos/dog-owner-regrets-his-decision-immediately-dogs

### Video search

In [16]:
def video_search(embedding, limit=5):
    """Search the Milvus collection with an existing embedding vector.

    Args:
        embedding: Query vector sent to Milvus as the only search vector.
        limit: Maximum number of hits requested from Milvus.

    Returns:
        The raw result of ``milvus_client.search``, with ``id``,
        ``video_path`` and ``start_offset_sec`` as output fields.
    """
    search_args = {
        'collection_name': MILVUS_COLLECTION,
        'data': [embedding],
        'limit': limit,
        'output_fields': ['id', 'video_path', 'start_offset_sec'],
    }
    return milvus_client.search(**search_args)

# Look-up table from video file name (last '/'-separated path component) to a
# clip vector. Clips of the same video share one key, so the vector of the
# last clip encountered in `data` is the one that is kept.
embeddings = {
    clip['video_path'].rsplit('/', 1)[-1]: clip['vector']
    for clip in data
}

# Videos used as queries; each one prints the top 10 most similar clips.
queries = [
    'cute-little-kitten-baby-cat-babycat-kitten-cat-720-ytshorts.savetube.me.mp4',
    'shihtzu-dog-barking-shihtzu-barking-dogs-shihtzupuppies-1080-ytshorts.savetube.me.mp4',
    'tesla-cybertruck-fully-functioning-shorts-720-ytshorts.savetube.me.mp4',
    'dog-owner-regrets-his-decision-immediately-dogsofyoutube-dogshorts-funnydogs-funnyanimals-1080-ytshorts.savetube.me.mp4',
]
for query in queries:
    print(f"query = {query}")
    parse_search_result(video_search(embeddings[query], 10))


query = cute-little-kitten-baby-cat-babycat-kitten-cat-720-ytshorts.savetube.me.mp4
{'id': 17179869184, 'distance': 1.0, 'entity': {'id': 17179869184, 'video_path': './../videos/cute-little-kitten-baby-cat-babycat-kitten-cat-720-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 2, 'distance': 0.6096300482749939, 'entity': {'id': 2, 'video_path': './../videos/cute-baby-kittens-kitten-cutecats-babycat-shorts-catfancy-1080-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 17179869185, 'distance': 0.5760656595230103, 'entity': {'id': 17179869185, 'video_path': './../videos/baby-dog-cute-puppy-barking-4kviral-shorts-720-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 8589934592, 'distance': 0.4636029005050659, 'entity': {'id': 8589934592, 'video_path': './../videos/shihtzu-dog-barking-shihtzu-barking-dogs-shihtzupuppies-1080-ytshorts.savetube.me.mp4', 'start_offset_sec': 0.0}}
{'id': 0, 'distance': 0.4142894148826599, 'entity': {'id': 0, 'video_path': './../v