In [1]:
import polars as pl

from sentence_transformers import SentenceTransformer, util

from sklearn.neighbors import DistanceMetric
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# directory holding the input files; edit this single line to point the notebook at another copy of the data
DATA_DIR = '~/Documents/Code/FullStackDataScience/Data'

# video table (columns shown in the preview below) and the evaluation table,
# which later cells read via its 'query' and 'video_id' columns
df = pl.read_parquet(f'{DATA_DIR}/video-transcripts.parquet')
df_eval = pl.read_csv(f'{DATA_DIR}/eval-raw.csv')
df.head()

video_id,datetime,title,transcript
str,datetime[μs],str,str
"""03x2oYg9oME""",2024-04-25 15:16:00,"""Data Science Project Managemen…","""this video is part of a larger…"
"""O5i_mMUM94c""",2024-04-19 14:05:54,"""How I’d learned #datascience (…","""here's how I'd learn data scie…"
"""xm9devSQEqU""",2024-04-18 15:59:02,"""4 Skills You Need to Be a Full…","""although it is common to deleg…"
"""Z6CmuVEi7QY""",2024-04-11 10:00:27,"""How I'd Learn Data Science (if…","""when I was first learning data…"
"""INlCLmWlojY""",2024-04-04 18:45:00,"""I Was Wrong About AI Consultin…","""last year I quit my corporate …"


In [3]:
# experiment "parameters"

# text columns of df that will be embedded
column_to_embed_list = [
    'title',
    'transcript',
]

# names of the embedding models passed to SentenceTransformer
model_name_list = [
    "all-MiniLM-L6-v2",
    "multi-qa-distilbert-cos-v1",
    "multi-qa-mpnet-base-dot-v1",
]

In [4]:
# generate embeddings for each combination of column and model

# initialize dict to keep track of all text embeddings
text_embedding_dict = {}

for model_name in model_name_list:

    #define embedding model
    model = SentenceTransformer(model_name) 

    for column_name in column_to_embed_list:

        # define text embedding identifier
        key_name = model_name + "_" + column_name
        print(key_name)

        # generate embeddings for text under column_name
        %time embedding_arr = model.encode(df[column_name].to_list())
        print('')

        # append embeddings to dict
        text_embedding_dict[key_name] = embedding_arr

all-MiniLM-L6-v2_title
Wall time: 989 ms

all-MiniLM-L6-v2_transcript
Wall time: 8 s

multi-qa-distilbert-cos-v1_title
Wall time: 3.32 s

multi-qa-distilbert-cos-v1_transcript
Wall time: 58.2 s

multi-qa-mpnet-base-dot-v1_title
Wall time: 6.67 s

multi-qa-mpnet-base-dot-v1_transcript
Wall time: 1min 57s



In [5]:
query_embedding_dict = {}

for model_name in model_name_list:

    #define embedding model
    model = SentenceTransformer(model_name)
    print(model_name)

    # embed query text
    %time embedding_arr = model.encode(df_eval['query'].to_list())
    print('')

    # append embedding to dict
    query_embedding_dict[model_name] = embedding_arr

all-MiniLM-L6-v2
Wall time: 929 ms

multi-qa-distilbert-cos-v1
Wall time: 3.29 s

multi-qa-mpnet-base-dot-v1
Wall time: 6 s



In [6]:
def returnVideoID_index(df: pl.dataframe.frame.DataFrame, df_eval: pl.dataframe.frame.DataFrame, query_n: int) -> int:
    """
        Function to return the index of a dataframe corresponding to the nth row in evaluation dataframe

        Args:
            df: dataframe with a 'video_id' column to search
            df_eval: evaluation dataframe with a 'video_id' column
            query_n: row number in df_eval whose video ID is looked up

        Returns:
            Position of the first row of df whose 'video_id' equals df_eval['video_id'][query_n]

        Raises:
            IndexError: if that video ID does not appear in df
    """

    # the ID being searched for does not change during the scan, so read it once
    target_video_id = df_eval['video_id'][query_n]

    # stop at the first match instead of collecting every match
    for i, video_id in enumerate(df['video_id']):
        if video_id == target_video_id:
            return i

    # same exception type as indexing an empty match list, but with a message that names the missing ID
    raise IndexError(f"video_id {target_video_id!r} (evaluation row {query_n}) not found in df")

In [7]:
def evalTrueRankings(dist_arr_isorted: np.ndarray, df: pl.dataframe.frame.DataFrame, df_eval: pl.dataframe.frame.DataFrame) -> np.ndarray:
    """
        Function to find, for each evaluation query, the rank of its "true" video ID

        Column q of dist_arr_isorted is searched for the row index of df that
        holds the video ID listed in row q of df_eval; the position at which it
        is found is that query's rank.

        Returns an array of shape (1, number of columns in dist_arr_isorted).
    """

    n_queries = dist_arr_isorted.shape[1]

    # one slot per query for the rank of the "correct" search result
    ranks = np.empty(n_queries)

    for q in range(n_queries):

        # row index in df of the "true" video for this query
        true_idx = returnVideoID_index(df, df_eval, q)

        # positions in this query's column where that row index occurs
        hits = np.argwhere(dist_arr_isorted[:, q] == true_idx)

        # the first hit is the rank of the "true" video
        ranks[q] = hits[0][0]

    # callers expect a single row with one rank per query
    return ranks.reshape(1, n_queries)

In [8]:
# ranking methods to experiment with

# metric names handed to DistanceMetric.get_metric
dist_name_list = [
    'euclidean',
    'manhattan',
    'chebyshev',
]

# names of the sbert util similarity functions
sim_name_list = [
    'cos_sim',
    'dot_score',
]

In [9]:
# evaluate all possible combinations of model, columns to embed, and distance metrics

# initialize list to store results; each entry is [method_name, rank for 1st query, rank for 2nd query, ...]
eval_results = []

# loop through all models
for model_name in model_name_list:

    # look up the query embeddings generated with this model
    query_embedding = query_embedding_dict[model_name]

    # loop through text columns
    for column_name in column_to_embed_list:

        # look up the embeddings of this column generated with this model
        embedding_arr = text_embedding_dict[model_name+'_'+column_name]

        # loop through distance metrics
        for dist_name in dist_name_list:

            # compute distance between video text and query
            dist = DistanceMetric.get_metric(dist_name)
            dist_arr = dist.pairwise(embedding_arr, query_embedding)

            # sort indexes of distance array
            dist_arr_isorted = np.argsort(dist_arr, axis=0)

            # define label for search method
            method_name = "_".join([model_name, column_name, dist_name])

            # evaluate the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)

        # loop through sbert similarity scores
        for sim_name in sim_name_list:

            # fetch the sbert scoring function by name; a plain attribute lookup
            # does the job without assembling source code and running it via exec()
            sim_func = getattr(util, sim_name)

            # negate the similarity so that, like a distance, a smaller value means a better match
            dist_arr = -sim_func(embedding_arr, query_embedding)

            # sort indexes of distance array (ascending order of the negated similarity)
            dist_arr_isorted = np.argsort(dist_arr, axis=0)

            # define label for search method
            method_name = "_".join([model_name, column_name, sim_name.replace("_","-")])

            # evaluate the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)