# <center> `Video RAG`

###  `Video resolution`

`Video resolution` is the size of each image in the video. It doesn't need to be a standard size, but there are common sizes for video.

`Frame rate` is the number of `images` shown per second. It is expressed in `fps` (frames per second) or `Hz` (the general unit of frequency).

<center> <img src = "/Users/glebmaksimov/Desktop/ML/projects/video_rag/docs/video.png" height = 300>


In [1]:
import cv2, torch, warnings, collections, spacy, os

from numpy.linalg import norm

from collections import Counter

from numpy import dot

from tqdm import tqdm

from moviepy.editor import VideoFileClip

from transformers import pipeline, AutoTokenizer

from langchain.text_splitter import RecursiveCharacterTextSplitter

from pydub import AudioSegment

from sentence_transformers import SentenceTransformer

from scipy.spatial.distance import euclidean

import numpy as np
import pandas as pd
import plotly.subplots as sp
import plotly.graph_objs as go 
from sklearn import metrics


# Dim Red

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap

# Clustering

import hdbscan
from sklearn.cluster import DBSCAN,KMeans
from sklearn import metrics


# spaCy pipeline used by extract_labels for part-of-speech and dependency tags.
nlp = spacy.load("en_core_sci_lg")

# Notebook-wide settings: silence warnings, fix the seed shared by the
# decomposition models, and record the compute device name.
warnings.filterwarnings(action="ignore")
RANDOM_SEED = 42
device = "cpu"

# Widen the pandas display limits so large result tables are shown in full.
pd.set_option("display.max_rows", 600)
pd.set_option("display.max_columns", 500)
pd.set_option("max_colwidth", 500)
%matplotlib inline

In [None]:
# Path of the source video; it is the argument of the (currently commented-out) mp4_to_mp3 call.
video = "/Users/glebmaksimov/Desktop/ML/projects/video_rag/docs/Coding LLaMA 2 from scratch in PyTorch - KV Cache, Grouped Query Attention, Rotary PE, RMSNorm.mp4"

In [None]:
# Text 

def embed(texts):
    """
    Encode texts with the all-mpnet-base-v2 sentence transformer

    Arguments:
        texts: input handed unchanged to SentenceTransformer.encode

    Returns:
        the value returned by SentenceTransformer.encode for texts
    """
    # The model is instantiated on every call.
    encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    return encoder.encode(texts)

def chunk_text(text, chunk_size = 300, chunk_overlap = 50):
    """
    Split text into overlapping chunks whose size is measured in tokens

    Arguments:
        text: str, the text to split
        chunk_size: int, chunk size limit as measured by the tokenizer
        chunk_overlap: int, overlap between consecutive chunks, same unit

    Returns:
        list of chunks produced by RecursiveCharacterTextSplitter.split_text
    """
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

    def token_count(piece):
        # Length of the first (only) encoded sequence.
        # NOTE(review): truncation is enabled, so very long pieces are
        # presumably counted at the tokenizer's maximum length - confirm.
        encoded = tokenizer(piece, padding = True, truncation = True, return_tensors = 'pt')
        return len(encoded["input_ids"][0])

    splitter = RecursiveCharacterTextSplitter(
        chunk_size      = chunk_size,
        chunk_overlap   = chunk_overlap,
        length_function = token_count,
        separators      = [". ","! ","; ","? ",","," ", ""],
    )

    return splitter.split_text(text)

def cosine_similarity(list_1, list_2):
    """
    Cosine of the angle between two vectors

    Arguments:
        list_1: first vector
        list_2: second vector, same length as list_1

    Returns:
        dot(list_1, list_2) divided by the product of the two norms
    """
    inner_product = dot(list_1, list_2)
    magnitude_product = norm(list_1) * norm(list_2)
    return inner_product / magnitude_product

# -----------------------------------------------
# Video
# -----------------------------------------------

def get_video_specs(video):
    """
    Open a video with OpenCV and read its basic properties

    Arguments:
        video: path handed to cv2.VideoCapture

    Returns:
        (cap, n_frames, height, width, fps): the open capture object and the
        property values exactly as returned by cap.get. The capture is not
        released here.
    """
    cap = cv2.VideoCapture(video)

    wanted = (
        cv2.CAP_PROP_FRAME_COUNT,
        cv2.CAP_PROP_FRAME_HEIGHT,
        cv2.CAP_PROP_FRAME_WIDTH,
        cv2.CAP_PROP_FPS,
    )
    n_frames, height, width, fps = [cap.get(prop) for prop in wanted]

    return cap, n_frames, height, width, fps

def extract_frames(video):
    """
    Read the frames of a video into memory

    Arguments:
        video: path handed to get_video_specs

    Returns:
        frames: list of images in read order; reading stops at the reported
                frame count or at the first failed read, whichever is first
    """
    cap, n_frames, _, _, _ = get_video_specs(video)

    frames = []
    remaining = int(n_frames)

    while remaining > 0:
        ok, frame = cap.read()
        if not ok:
            break
        frames.append(frame)
        remaining -= 1

    cap.release()

    return frames

def mp4_to_mp3(video):
    """
    Extract the audio track of a video into an .mp3 file next to it

    Arguments:
        video: str, path of the input video file

    Returns:
        None; the audio is written to the input path with its final
        extension replaced by ".mp3"

    Raises:
        ValueError: if the output path would equal the input path, or the
                    video has no audio track
    """
    mp4_file = video

    # Swap only the final extension. str.replace(".mp4", ".mp3") would also
    # rewrite ".mp4" inside directory names and would leave other extensions
    # (e.g. ".MP4") untouched, making the output path equal to the input.
    mp3_file = os.path.splitext(mp4_file)[0] + ".mp3"

    if mp3_file == mp4_file:
        raise ValueError(f"Output path would overwrite the input file: {mp4_file}")

    video_clip = VideoFileClip(mp4_file)

    try:
        audio_clip = video_clip.audio

        if audio_clip is None:
            raise ValueError(f"Video has no audio track: {mp4_file}")

        try:
            audio_clip.write_audiofile(mp3_file)
        finally:
            audio_clip.close()
    finally:
        # Close the clip even when extraction fails.
        video_clip.close()

    print("Audio extraction successful!")

# -----------------------------------------------
# Audio
# -----------------------------------------------

def get_mp3_length(file_path):
    """
    Length of an mp3 file

    Arguments:
        file_path: path handed to AudioSegment.from_mp3

    Returns:
        len() of the loaded segment divided by 1000.0
    """
    segment_length = len(AudioSegment.from_mp3(file_path))
    return segment_length / 1000.0

def chunk_audio(audio, output_dir, chunk_size = 300, chunk_overlap = 50):
    """
    Split an mp3 file into overlapping segments and export each one as mp3

    Arguments:
        audio: path of the mp3 file to split
        output_dir: directory receiving segment_<n>.mp3 files (numbered from
                    1); created when missing
        chunk_size: segment length in seconds
        chunk_overlap: seconds shared by consecutive segments; must be
                       smaller than chunk_size

    Raises:
        ValueError: if chunk_size is not positive or chunk_overlap is not
                    smaller than chunk_size
    """
    # Validate before touching the file system: a non-positive step would
    # never advance and the export loop would not terminate.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    if chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    recording = AudioSegment.from_mp3(audio)

    os.makedirs(output_dir, exist_ok=True)

    # Segment length and step between segment starts, in milliseconds.
    segment_length_ms = chunk_size * 1000
    step_ms = segment_length_ms - chunk_overlap * 1000
    total_ms = len(recording)

    start = 0
    segment_number = 1

    while start < total_ms:
        segment = recording[start:start + segment_length_ms]
        segment.export(os.path.join(output_dir, f"segment_{segment_number}.mp3"), format="mp3")

        start += step_ms
        segment_number += 1

# -----------------------------------------------
# Clustering
# -----------------------------------------------
 
def normalize_embeddings(embeddings):
    """
    Scale all embeddings by the largest row L2 norm

    Arguments:
        embeddings: 2-D array-like, one embedding per row

    Returns:
        embeddings divided by the maximum row norm, so the longest row has
        norm 1
    """
    largest_row_norm = np.linalg.norm(embeddings, axis=1).max()
    return embeddings / largest_row_norm

def reduce_dimentionality(embeddings, n_neighbors, n_components, decomposition_algorithm):
    """
    Project embeddings to a lower dimension with UMAP or PCA

    Arguments:
        embeddings: 2-D array-like, one embedding per row
        n_neighbors: int, UMAP neighbourhood size (ignored by PCA)
        n_components: int, number of output dimensions
        decomposition_algorithm: str, "UMAP" or "PCA"

    Returns:
        result of fit_transform on the max-norm-normalized embeddings

    Raises:
        ValueError: if decomposition_algorithm is not "UMAP" or "PCA"
    """
    # Reject unknown names up front instead of silently returning None,
    # which only failed later inside the callers.
    if decomposition_algorithm not in ("UMAP", "PCA"):
        raise ValueError(
            f"Unsupported decomposition_algorithm: {decomposition_algorithm!r} "
            "(expected 'UMAP' or 'PCA')"
        )

    normed_embeddings = normalize_embeddings(embeddings)

    if decomposition_algorithm == "UMAP":
        reducer = umap.UMAP(
            n_neighbors  = n_neighbors,
            n_components = n_components,
            metric       = "euclidean",
            min_dist     = 0.3,
            n_jobs       = -1,
            random_state = RANDOM_SEED,
        )
    else:
        reducer = PCA(n_components = n_components, random_state = RANDOM_SEED)

    # "T-SNE" is intentionally disabled. The previous configuration was
    # TSNE(n_components, n_jobs=-1, random_state=RANDOM_SEED,
    #      metric="euclidean", method="exact").

    return reducer.fit_transform(normed_embeddings)

def generate_clusters(*args):
    """
    Reduce embeddings to 3 dimensions and cluster them

    Positional arguments, by index:
        0: embeddings
        1: n_neighbors for the decomposition
        2: min_cluster_size (HDBSCAN only)
        3: min_samples
        4: decomposition algorithm name
        5: eps (DBSCAN only)
        6: clustering algorithm name; "HDBSCAN" selects HDBSCAN, anything
           else selects DBSCAN

    Returns:
        (fitted clusterer, normalized 3-D embeddings)
    """
    reduced = reduce_dimentionality(
        embeddings              = args[0],
        n_neighbors             = args[1],
        n_components            = 3,
        decomposition_algorithm = args[4],
    )
    reduced = normalize_embeddings(reduced)

    if args[6] == "HDBSCAN":
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size         = args[2],
            min_samples              = args[3],
            metric                   = "euclidean",
            core_dist_n_jobs         = -1,
            cluster_selection_method = "eom",
        )
    else:
        clusterer = DBSCAN(eps = args[5], min_samples = args[3], n_jobs = -1, metric = "euclidean")

    return clusterer.fit(reduced), reduced
        
def score_clusters(embeddings, clusters):
    """
    Summarize a fitted clustering and compute its quality metrics

    Arguments:
        embeddings: data the metrics are evaluated on
        clusters: fitted clusterer exposing labels_

    Returns:
        (labels, n_labels, n_clusters, n_noise, calinski_harabasz_score,
         silhouette_score, davies_bouldin_score); with a single distinct
        label the scores fall back to -1, -1 and inf
    """
    labels = clusters.labels_

    n_labels = len(set(labels))

    # The label -1 marks noise and is not counted as a cluster.
    if -1 in labels:
        n_clusters = n_labels - 1
    else:
        n_clusters = n_labels

    n_noise = list(labels).count(-1)

    if n_labels <= 1:
        # The metrics are only computed when there are at least two labels.
        ch_score, sil_score, db_score = -1, -1, np.inf
    else:
        ch_score  = metrics.calinski_harabasz_score(embeddings, labels)
        sil_score = metrics.silhouette_score(embeddings, labels)
        db_score  = metrics.davies_bouldin_score(embeddings, labels)

    return labels, n_labels, n_clusters, n_noise, ch_score, sil_score, db_score

def greed_search(embeddings, decomposition_algorithm, clustering_algorithm, min_cluster_size, space):
    """
    Evaluate every (n_neighbors, min_samples, eps) combination in space

    Arguments:
        embeddings: embeddings to decompose, cluster and score
        decomposition_algorithm: str, forwarded to generate_clusters
        clustering_algorithm: str, forwarded to generate_clusters
        min_cluster_size: forwarded to generate_clusters
        space: dict with iterables under "n_neighbors", "min_samples", "eps"

    Returns:
        trials: dict mapping each result column to a list with one entry per
                evaluated combination
    """
    columns = (
        "decomposition_algorithm",
        "n_neighbours",
        "clustring_algorithm",
        "min_samples",
        "eps",
        "n_labels",
        "n_clusters",
        "n_noise",
        "calinski_harabasz_score -> max",
        "silhouette_score [-1,1] -> max",
        "davies_bouldin_score [0,inf] -> min",
        "decomposed_embeddings",
        "labels",
    )

    trials = {column: [] for column in columns}

    for n_neighbors in space["n_neighbors"]:
        for min_samples in space["min_samples"]:
            for eps in space["eps"]:

                clusters, decomposed_embeddings = generate_clusters(
                    embeddings, n_neighbors, min_cluster_size, min_samples,
                    decomposition_algorithm, eps, clustering_algorithm,
                )

                # Scores are computed against the embeddings passed in, not
                # the decomposed ones.
                (labels, n_labels, n_clusters, n_noise,
                 ch_score, sil_score, db_score) = score_clusters(embeddings, clusters)

                row = (
                    decomposition_algorithm,
                    n_neighbors,
                    clustering_algorithm,
                    min_samples,
                    eps,
                    n_labels,
                    n_clusters,
                    n_noise,
                    ch_score,
                    sil_score,
                    db_score,
                    [decomposed_embeddings.tolist()],
                    [labels.tolist()],
                )

                for column, value in zip(columns, row):
                    trials[column].append(value)

    return trials

def get_best_params(trials):
    """
    Pick the trial with the lowest Davies-Bouldin score

    Arguments:
        trials: pandas dataframe of trials, one row per parameter combination

    Returns:
        best_params: pandas series, the first row after sorting ascending by
                     "davies_bouldin_score [0,inf] -> min"
    """
    score_column = "davies_bouldin_score [0,inf] -> min"
    ranked = trials.sort_values(by=[score_column], ascending=True)
    return ranked.iloc[0]

def plot_clusters(decomposition_algorithm, embeddings_3d, labels, n_neighbours):
    """
    Show a 3-D scatter, a 2-D scatter and a label-count bar chart side by side

    Arguments:
        decomposition_algorithm: str, used to project the 3-D points to 2-D
        embeddings_3d: 3-D embeddings, one point per row
        labels: cluster label per point, used as marker colour
        n_neighbours: forwarded to reduce_dimentionality as n_neighbors
    """
    embeddings_2d = reduce_dimentionality(
        embeddings              = embeddings_3d,
        n_neighbors             = n_neighbours,
        n_components            = 2,
        decomposition_algorithm = decomposition_algorithm,
    )

    embeddings_3d = normalize_embeddings(embeddings_3d)
    embeddings_2d = normalize_embeddings(embeddings_2d)

    fig = sp.make_subplots(rows=1, cols=3, specs=[[{'type': 'scene'}, {'type': 'xy'}, {'type': 'bar'}]])

    # Column 1: 3-D point cloud coloured by label.
    fig.add_trace(
        go.Scatter3d(
            x=embeddings_3d[:, 0],
            y=embeddings_3d[:, 1],
            z=embeddings_3d[:, 2],
            mode='markers',
            marker=dict(size=5, color=labels),
        ),
        row=1, col=1,
    )

    # Column 2: the same points projected to 2-D.
    fig.add_trace(
        go.Scatter(
            x=embeddings_2d[:, 0],
            y=embeddings_2d[:, 1],
            mode='markers',
            marker=dict(size=5, color=labels),
        ),
        row=1, col=2,
    )

    # Column 3: number of points per label.
    label_frequency = Counter(labels)
    fig.add_trace(
        go.Bar(x=list(label_frequency.keys()), y=list(label_frequency.values())),
        row=1, col=3,
    )

    fig.update_layout(scene=dict(aspectmode='data'))

    fig.show()

# Clusters Labeling

def most_common(lst, n_words):
    """
    Most frequent words of a list, with their counts

    Arguments:
        lst: list, each element is a word
        n_words: number of top words to return

    Returns:
        list of (word, count) tuples for the n_words most frequent words
    """
    return collections.Counter(lst).most_common(n_words)

def get_group(df, category_col, category):
    """
    Select the documents that belong to one category

    Arguments:
        df: pandas dataframe of documents
        category_col: str, name of the column holding categories or clusters
        category: value of category_col to keep

    Returns:
        pandas dataframe with the matching rows, re-indexed from 0
    """
    in_category = df[category_col] == category
    return df[in_category].reset_index(drop=True)

def extract_labels(category_docs, print_word_counts=False):
    """
    Extract a label from documents in the same cluster by concatenating
    the most common verb, direct object, and nouns

    Arguments:
        category_docs: list of documents, all from the same category or
                       clustering
        print_word_counts: bool, True will print word counts of each type in
                           this category

    Returns:
        label: str, the most common ROOT word, direct object and two most
               common nouns joined with '_'; missing parts and nouns already
               present are skipped, '' when nothing was found
    """

    verbs, dobjs, nouns, adjs = [], [], [], []
    verb, dobj, noun1, noun2 = '', '', '', ''

    # collect verbs, direct objects, nouns and adjectives over the whole
    # cluster, ignoring stop words
    for document in category_docs:

        for token in nlp(document):

            if token.is_stop:
                continue

            # ROOT tokens are stored by surface form, the rest by lemma
            if token.dep_ == 'ROOT':
                verbs.append(token.text.lower())
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_.lower())
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_.lower())
            elif token.pos_ == 'ADJ':
                adjs.append(token.lemma_.lower())

    # for printing out for inspection purposes
    if print_word_counts:
        for word_lst in (verbs, dobjs, nouns, adjs):
            print(collections.Counter(word_lst))

    # take most common words of each form
    if verbs:
        verb = most_common(verbs, 1)[0][0]

    if dobjs:
        dobj = most_common(dobjs, 1)[0][0]

    if nouns:
        noun1 = most_common(nouns, 1)[0][0]

    if len(set(nouns)) > 1:
        noun2 = most_common(nouns, 2)[1][0]

    # concatenate the most common verb-dobj-noun1-noun2 (if they exist)
    label_words = [verb, dobj]

    for word in (noun1, noun2):
        if word not in label_words:
            label_words.append(word)

    # Drop every missing part. Removing only the first empty string left a
    # leading '_' when both the verb and the direct object were missing.
    label = '_'.join(word for word in label_words if word)

    return label

def apply_and_summarize_labels(df, category_col):
    """
    Derive a text label for every cluster and count its documents

    Arguments:
        df: pandas dataframe of documents with a 'chunk' text column
        category_col: str, name of the column holding categories or clusters

    Returns:
        summary_df: pandas dataframe with one row per cluster: the cluster
                    id, its document 'count' and the derived 'label',
                    sorted by count in descending order
    """

    # numerical cluster id -> generated label
    label_dict = {
        cluster_id: extract_labels(list(get_group(df, category_col, cluster_id)['chunk']))
        for cluster_id in df[category_col].unique()
    }

    # documents per cluster, largest cluster first
    counts = df.groupby(category_col)['chunk'].count()
    summary_df = counts.reset_index().rename(columns={'chunk':'count'})
    summary_df = summary_df.sort_values('count', ascending=False)

    # attach the generated label to every cluster row
    summary_df['label'] = summary_df.apply(lambda row: label_dict[row[category_col]], axis = 1)

    return summary_df

In [None]:
# # mp4_to_mp3(video)

# # text = audio_to_text()

# # with open("/Users/glebmaksimov/Desktop/ML/projects/video_rag/docs/text.txt", mode="w") as f: f.write(text)

# text = open("/Users/glebmaksimov/Desktop/ML/projects/video_rag/docs/text.txt", mode = 'r').read()

# chunks = chunk_text(text, chunk_size = 300, chunk_overlap = 50)

# embeddings = embed(chunks)

In [None]:
# decomposition_algorithms = ["PCA", "UMAP"]# "T-SNE" - too long and bad for noisy data

# clustering_algorithms = ["DBSCAN", "HDBSCAN"]

# n_neighbors      = [10, 15, 25]
# min_samples      = [round(i) for i in np.linspace(5,15,num=3)]
# eps              = [i for i in np.linspace(0.005,0.08,num=3)]

# space = { "n_neighbors":      n_neighbors,
#           "min_samples":      min_samples,
#           "eps":              eps}

# overal_scores =  pd.DataFrame({ 
#                                 "decomposition_algorithm":[],
#                                 "n_neighbours":[],
#                                 "clustring_algorithm":[],
#                                 "min_samples":[],
#                                 "eps":[], 
#                                 "n_labels": [],
#                                 "n_clusters":[], 
#                                 "n_noise":[],
#                                 "calinski_harabasz_score -> max" : [],
#                                 "silhouette_score [-1,1] -> max": [],
#                                 "davies_bouldin_score [0,inf] -> min":[],
#                                 # "DBCV_score"                          :[]

#                 })

# df = pd.DataFrame({"chunk":chunks})

# for decomposition_algorithm in decomposition_algorithms:

#     for clustering_algorithm in clustering_algorithms:
                
#                 trials = greed_search(  embeddings = embeddings, 
#                                         space                   = space,
#                                         min_cluster_size        = 20,
#                                         decomposition_algorithm = decomposition_algorithm,
#                                         clustering_algorithm    = clustering_algorithm)
                
#                 trials = pd.DataFrame(trials)

#                 best_params = get_best_params(trials)

#                 display(pd.DataFrame(trials.iloc[:,:-2].to_dict(),index = [0]))
#                 overal_scores = pd.concat([overal_scores, trials.iloc[:,:-2]], axis = 0)
                 
#                 if best_params.n_labels >= 2: 
                        
#                         df[f"{decomposition_algorithm}:{clustering_algorithm} labels"] = best_params.labels[0]

                       
#                         plot_clusters( decomposition_algorithm = decomposition_algorithm, 
#                                                              embeddings_3d           = best_params.decomposed_embeddings[0], 
#                                                              labels                  = best_params.labels[0], 
#                                                              n_neighbours            = best_params.n_neighbours)
            

In [None]:
# Speech-to-text pipeline; runs on the first CUDA device when one is available.
pipe = pipeline(

        task              = "automatic-speech-recognition",
        model             = "openai/whisper-small.en",
        device            = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    )

path = "/Users/glebmaksimov/Desktop/ML/projects/video_rag/output_segments"


def _segment_number(file_name):
    """Return the integer formed by the digits of a file name's stem (0 if none)."""
    stem = os.path.splitext(file_name)[0]
    digits = "".join(ch for ch in stem if ch.isdigit())
    return int(digits) if digits else 0


# Keep only mp3 files and order them by segment number. A plain sorted() is
# lexicographic ("segment_10" before "segment_2") and would attach the
# cumulative timestamps below to the wrong transcripts.
files = sorted(
    (name for name in os.listdir(path) if name.lower().endswith(".mp3")),
    key = lambda name: (_segment_number(name), name)
)

# transcript text -> (start, end) in seconds, accumulated over the segments.
# NOTE(review): identical transcripts overwrite each other because the text
# is the dictionary key.
# NOTE(review): if the segments were exported with an overlap (chunk_audio
# defaults to 50 s), these cumulative offsets drift from the real positions
# in the video - confirm how the segments were produced.
audio_segments = {}

prev_l = 0

current_l = 0

for f in tqdm(files, total=len(files),desc = "Video Processing"):

    file = os.path.join(path, f)

    current_l += get_mp3_length(file)

    current_text = pipe(file)["text"]

    audio_segments[current_text] = (prev_l,current_l)

    prev_l = current_l + 1

In [None]:
# Display the transcript -> (start, end) mapping.
audio_segments

In [None]:
# Embed every transcript; the transcripts are the keys of audio_segments.
texts = list(audio_segments.keys())

embeddings = embed(texts)

In [None]:
# import plotly.express as px 

# embeddings_3d = reduce_dimentionality(  embeddings               = embeddings, 
#                                         n_neighbors              = 10,
#                                         n_components             = 3,
#                                         decomposition_algorithm  = "UMAP")
    
# embeddings_3d = normalize_embeddings(embeddings_3d)

# clusters = hdbscan.HDBSCAN( min_cluster_size         = 5,
#                             min_samples              = 5,
#                             metric                   = "euclidean",
#                             core_dist_n_jobs         = -1,
#                             cluster_selection_method = "eom"
#                            ).fit(embeddings_3d)


# labels = clusters.labels_


# fig = px.scatter_3d(    
#                         x        = embeddings_3d[:, 0], 
#                         y        = embeddings_3d[:, 1],
#                         z        = embeddings_3d[:, 2],
#                         color    = labels, #{f"{k} label":v for k,v in zip(labels,labels)},
#                         height   = 700,
#                         width    = 1500,
#                         size_max = 5,
#                         opacity  = 0.7,
#                         # symbol = labels
#                     )

# fig.update_layout( margin=dict(l=0, r=0, b=0, t=0))

# fig.show()

In [None]:
# sse = [] 
# for k in range(1,11):
#     km = KMeans(n_clusters=k, random_state = RANDOM_SEED)
#     km.fit(embeddings_3d)
#     sse.append(km.inertia_)

# fig = px.line(x=range(1,11), y=sse)
 
# fig.show()


In [None]:
# kmeans = KMeans(n_clusters = 10, random_state = 2)
# kmeans.fit(embeddings_3d)

# labels = kmeans.labels_

# fig = px.scatter_3d(    
#                         x        = embeddings_3d[:, 0], 
#                         y        = embeddings_3d[:, 1],
#                         z        = embeddings_3d[:, 2],
#                         color    = labels, #{f"{k} label":v for k,v in zip(labels,labels)},
#                         height   = 700,
#                         width    = 1500,
#                         size_max = 5,
#                         opacity  = 0.7,
#                         # symbol = labels
#                     )

# fig.update_layout( margin=dict(l=0, r=0, b=0, t=0))

# fig.show()

In [None]:
# Print each chunk next to its cluster label.
# NOTE(review): `labels` and `chunks` are only assigned in cells that are
# currently commented out - confirm they exist in the kernel before running.
for l, ch in zip(labels,chunks):

    print(f"{l} - {ch}")

In [None]:
# Inspect the documents labelled -1 (the label score_clusters counts as noise)
# and print their word counts.
# NOTE(review): `df` is only built in a commented-out cell - confirm it exists.
example_category = list(get_group(df, "UMAP:HDBSCAN labels", -1)["chunk"])
extract_labels(example_category, True)

In [None]:
# One row per cluster with its document count and generated label; show the first 20.
cluster_summary = apply_and_summarize_labels(df, "UMAP:HDBSCAN labels")
cluster_summary.head(20)

In [None]:
# Attach the generated label to every document via its cluster id (left join keeps all documents).
labeled_clusters = pd.merge(df, cluster_summary[['UMAP:HDBSCAN labels', 'label']], on='UMAP:HDBSCAN labels', how = 'left')

In [None]:
# Display the documents together with their generated cluster labels.
labeled_clusters