# Compare Candidate Search Approaches

#### Imports


In [3]:
import polars as pl 

from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import DistanceMetric
import numpy as np

import matplotlib.pyplot as plt

#### Load data

In [7]:
# Load the video corpus (video_id, datetime, title, transcript) from parquet
# and the raw evaluation set from CSV. Paths are relative to the notebook's
# working directory.
df = pl.read_parquet("data/video-transcripts.parquet")
df_eval = pl.read_csv("data/eval-raw.csv")

# Preview the first rows of the corpus.
df.head()


video_id,datetime,title,transcript
str,datetime[μs],str,str
"""bZr2vhoXSy8""",2025-02-08 18:10:05,"""I Trained FLUX.1 on My Face (P…","""flux is a state-of-the-art ima…"
"""QvxuR8uLPFs""",2025-02-03 18:00:00,"""How to Build Customer Segments…","""although today's AI models are…"
"""W4s6b2ZM6kI""",2025-01-31 22:38:22,"""Fine-tuning Multimodal Embeddi…","""multimodal embedding models br…"
"""hOLBrIjRAj4""",2025-01-22 21:25:16,"""Fine-Tuning Text Embeddings Fo…","""embedding models represent tex…"
"""V1BR2tb_e8g""",2025-01-13 21:10:47,"""My AI Development Setup (From …","""hey everyone I'm Shaw I just g…"


#### Embed title and transcript

#### Define parameters

In [5]:
# Columns of `df` whose text will be embedded.
colum_to_embed_list = [
    "title",
    "transcript",
]

# Names of the embedding models to compare (passed to SentenceTransformer).
model_name_list = [
    "all-MiniLM-L6-v2",
    "multi-qa-distilbert-cos-v1",
    "multi-qa-mpnet-base-dot-v1",
]

In [6]:
# generata embeddings for each combination of column and model

# initialize dict to keep track of all text embeddings
text_embeddings_dict = {}

for model_name in model_name_list:
    # define embedding model
    model = SentenceTransformer(model_name)
    
    for colum_name in colum_to_embed_list:
        #define text embedding identifier
        key_name = model_name + "_" + colum_name
        print(key_name)
        %time embeddings_arr = model.encode(df[colum_name].to_list())
        print('')

        # append embeddings to dict 
        text_embeddings_dict[key_name] = embeddings_arr

all-MiniLM-L6-v2_title
CPU times: user 358 ms, sys: 162 ms, total: 521 ms
Wall time: 2.97 s

all-MiniLM-L6-v2_transcript
CPU times: user 1.28 s, sys: 186 ms, total: 1.47 s
Wall time: 1.3 s

multi-qa-distilbert-cos-v1_title
CPU times: user 182 ms, sys: 42.6 ms, total: 225 ms
Wall time: 426 ms

multi-qa-distilbert-cos-v1_transcript
CPU times: user 1.35 s, sys: 641 ms, total: 1.99 s
Wall time: 4.88 s

multi-qa-mpnet-base-dot-v1_title
CPU times: user 223 ms, sys: 131 ms, total: 354 ms
Wall time: 1.99 s

multi-qa-mpnet-base-dot-v1_transcript
CPU times: user 3.58 s, sys: 337 ms, total: 3.92 s
Wall time: 7.7 s



#### Embed queries

In [8]:
query_embedding_dict = {}

for model_name in model_name_list:
    # define embedding model
    model = SentenceTransformer(model_name)
    print(model_name)

    # embed query text
    %time embeddings_arr = model.encode(df_eval['query'].to_list())
    print('')

    query_embedding_dict[model_name] = embeddings_arr



     

all-MiniLM-L6-v2
CPU times: user 151 ms, sys: 88.1 ms, total: 239 ms
Wall time: 884 ms

multi-qa-distilbert-cos-v1
CPU times: user 103 ms, sys: 30.3 ms, total: 133 ms
Wall time: 319 ms

multi-qa-mpnet-base-dot-v1
CPU times: user 559 ms, sys: 156 ms, total: 715 ms
Wall time: 1.33 s

