# Compare Candidates Search Approaches

#### Imports


In [11]:
import polars as pl 

from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import DistanceMetric
import numpy as np

import matplotlib.pyplot as plt

#### Load data

In [12]:
# Video metadata and transcripts (columns shown in the preview below).
# The path must not contain the stray ")" that was previously inside the quotes.
df = pl.read_parquet("data/video-transcripts.parquet")

# Evaluation set; later cells read its 'query' and 'video_id' columns.
df_eval = pl.read_csv("data/eval-raw.csv")

df.head()


video_id,datetime,title,transcript
str,datetime[μs],str,str
"""bZr2vhoXSy8""",2025-02-08 18:10:05,"""I Trained FLUX.1 on My Face (P…","""flux is a state-of-the-art ima…"
"""QvxuR8uLPFs""",2025-02-03 18:00:00,"""How to Build Customer Segments…","""although today's AI models are…"
"""W4s6b2ZM6kI""",2025-01-31 22:38:22,"""Fine-tuning Multimodal Embeddi…","""multimodal embedding models br…"
"""hOLBrIjRAj4""",2025-01-22 21:25:16,"""Fine-Tuning Text Embeddings Fo…","""embedding models represent tex…"
"""V1BR2tb_e8g""",2025-01-13 21:10:47,"""My AI Development Setup (From …","""hey everyone I'm Shaw I just g…"


#### Embed title and transcript

#### Define parameters

In [13]:
# text columns of df that get embedded
colum_to_embed_list = [
    'title',
    'transcript',
]

# sentence-transformers model names to compare
model_name_list = [
    'all-MiniLM-L6-v2',
    'multi-qa-distilbert-cos-v1',
    'multi-qa-mpnet-base-dot-v1',
]

In [14]:
# generata embeddings for each combination of column and model

# initialize dict to keep track of all text embeddings
text_embeddings_dict = {}

for model_name in model_name_list:
    # define embedding model
    model = SentenceTransformer(model_name)
    
    for colum_name in colum_to_embed_list:
        #define text embedding identifier
        key_name = model_name + "_" + colum_name
        print(key_name)
        %time embeddings_arr = model.encode(df[colum_name].to_list())
        print('')

        # append embeddings to dict 
        text_embeddings_dict[key_name] = embeddings_arr

all-MiniLM-L6-v2_title
CPU times: user 60.2 ms, sys: 806 ms, total: 866 ms
Wall time: 5.17 s

all-MiniLM-L6-v2_transcript
CPU times: user 1.26 s, sys: 298 ms, total: 1.56 s
Wall time: 1.11 s

multi-qa-distilbert-cos-v1_title
CPU times: user 56.2 ms, sys: 40.1 ms, total: 96.3 ms
Wall time: 207 ms

multi-qa-distilbert-cos-v1_transcript
CPU times: user 1.23 s, sys: 1.44 s, total: 2.67 s
Wall time: 7.94 s

multi-qa-mpnet-base-dot-v1_title
CPU times: user 64.8 ms, sys: 101 ms, total: 165 ms
Wall time: 546 ms

multi-qa-mpnet-base-dot-v1_transcript
CPU times: user 3.64 s, sys: 711 ms, total: 4.35 s
Wall time: 7.44 s



#### Embed queries

In [15]:
query_embedding_dict = {}

for model_name in model_name_list:
    # define embedding model
    model = SentenceTransformer(model_name)
    print(model_name)

    # embed query text
    %time embeddings_arr = model.encode(df_eval['query'].to_list())
    print('')

    query_embedding_dict[model_name] = embeddings_arr



     

all-MiniLM-L6-v2
CPU times: user 31.3 ms, sys: 671 ms, total: 702 ms
Wall time: 4.54 s

multi-qa-distilbert-cos-v1
CPU times: user 35.5 ms, sys: 654 ms, total: 690 ms
Wall time: 4.96 s

multi-qa-mpnet-base-dot-v1
CPU times: user 587 ms, sys: 925 ms, total: 1.51 s
Wall time: 3.99 s



#### Evaluate search models 

In [16]:
def returnVideoID_index(df: pl.DataFrame, df_eval: pl.DataFrame, query_n: int) -> list[int]:
    '''
        Return the row positions of df whose video_id matches the video_id
        of row query_n in the evaluation dataframe.

        Parameters:
            df: dataframe with a 'video_id' column to search in
            df_eval: evaluation dataframe with a 'video_id' column
            query_n: row position in df_eval of the query of interest

        Returns:
            List of integer row positions in df, in ascending order. The list
            is empty when the video_id is absent from df and has more than one
            entry when the video_id occurs several times.
    '''
    # look the target up once instead of once per row of df
    target_video_id = df_eval['video_id'][query_n]

    # extract the column once and walk it, rather than re-indexing df for every row
    return [i for i, video_id in enumerate(df['video_id']) if video_id == target_video_id]

In [17]:
def evalTrueRankings(dist_arr_isorted: np.ndarray, df: pl.DataFrame, df_eval: pl.DataFrame) -> np.ndarray:
    """
        Return the rank of the "true" video ID for each evaluation query.

        Parameters:
            dist_arr_isorted: 2D array with one column per evaluation query;
                column q holds row positions of df in ranked order
                (presumably an argsort of distances along axis 0 -- confirm
                against the caller)
            df: dataframe with a 'video_id' column (the search corpus)
            df_eval: evaluation dataframe with a 'video_id' column, one row
                per query

        Returns:
            Float array of shape (1, n_queries). Entry [0, q] is the 0-based
            position at which the true video's row index first appears in
            column q of dist_arr_isorted.

        Raises:
            ValueError: if the true video ID of a query does not match exactly
                one row of df.
            IndexError: if the true row index does not occur in the query's
                ranking column.
    """

    n_queries = dist_arr_isorted.shape[1]

    # initialize array to store rankings of "correct" search result
    true_rank_arr = np.empty((1, n_queries))

    # evaluate ranking of correct result for each query
    for query_n in range(n_queries):

        # return "true" video ID's in df
        video_id_idx = returnVideoID_index(df, df_eval, query_n)

        # Comparing the ranking column against a list only behaves as intended
        # when the list has exactly one element, so fail early with a clear message.
        if len(video_id_idx) != 1:
            raise ValueError(
                f"expected exactly one row in df for evaluation query {query_n}, "
                f"found {len(video_id_idx)}"
            )

        # evaluate the ranking of the "true" video ID (scalar comparison)
        true_rank = np.argwhere(dist_arr_isorted[:, query_n] == video_id_idx[0])[0][0]

        # store the "true" video ID's ranking in array
        true_rank_arr[0, query_n] = true_rank

    return true_rank_arr

In [18]:
# names of the distance metrics to experiment with
dist_name_list = [
    'euclidean',
    'manhattan',
    'chebyshev',
]

# names of the similarity scores to experiment with
sim_name_list = [
    'cos_sim',
    'dot_score',
]

TypeError: list indices must be integers or slices, not str