In [3]:
import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Dict
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt', quiet=True)

  from .autonotebook import tqdm as notebook_tqdm


True

In [4]:
# Download the 'punkt_tab' data for sentence tokenization
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)

True

In [5]:
class TranscriptRanker:
    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        """
        Initialize the transcript ranker with the specified sentence transformer model.

        Args:
            model_name: The name of the SentenceTransformer model to use
        """
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.sentences = []
        self.embeddings = None

    def prepare_transcript(self, transcript_text: str) -> None:
        """
        Process the transcript text by splitting it into sentences and creating embeddings.

        Args:
            transcript_text: The raw transcript text from YouTube
        """
        # Split transcript into sentences
        self.sentences = sent_tokenize(transcript_text)

        # Create embeddings for all sentences
        self.embeddings = self.model.encode(self.sentences, convert_to_tensor=True)

        # Convert embeddings to numpy array for FAISS
        embeddings_np = self.embeddings.cpu().numpy().astype(np.float32)

        # Create and populate the FAISS index
        dimension = embeddings_np.shape[1]
        self.index = faiss.IndexFlatIP(dimension)  # Inner product similarity (equivalent to cosine for normalized vectors)
        faiss.normalize_L2(embeddings_np)  # Normalize vectors for cosine similarity
        self.index.add(embeddings_np)

    def query(self, user_query: str, top_k: int = 5, diversity_factor: float = 0.5) -> List[Dict]:
        """
        Find the most relevant sentences in the transcript for the given query.

        Args:
            user_query: The user's query
            top_k: Number of top results to return
            diversity_factor: Factor for MMR diversity (0-1, higher means more diversity)

        Returns:
            List of dictionaries containing the relevant sentences with scores and positions
        """
        # Encode the query
        query_embedding = self.model.encode(user_query, convert_to_tensor=True)
        query_embedding_np = query_embedding.cpu().numpy().astype(np.float32).reshape(1, -1)
        faiss.normalize_L2(query_embedding_np)

        # Perform the search
        D, I = self.index.search(query_embedding_np, len(self.sentences))

        # Apply Maximal Marginal Relevance to diversify results
        selected_indices = self._mmr(query_embedding, I[0], top_k, diversity_factor)

        # Prepare results
        results = []
        for i, idx in enumerate(selected_indices):
            results.append({
                'sentence': self.sentences[idx],
                'score': float(D[0][i]),  # Convert from numpy float to Python float
                'position': idx
            })

        return results

    def _mmr(self, query_embedding: torch.Tensor, initial_indices: List[int], top_k: int,
             diversity_factor: float) -> List[int]:
        """
        Apply Maximal Marginal Relevance to diversify results.

        Args:
            query_embedding: The embedding of the query
            initial_indices: The initial indices from FAISS
            top_k: Number of results to return
            diversity_factor: Factor for diversity (0-1, higher means more diversity)

        Returns:
            List of indices selected by MMR
        """
        query_embedding_np = query_embedding.cpu().numpy()

        # Initialize
        selected_indices = []
        remaining_indices = initial_indices.tolist()

        # Select first index (most similar to query)
        selected_indices.append(remaining_indices[0])
        remaining_indices.remove(remaining_indices[0])

        # Select remaining indices using MMR
        while len(selected_indices) < top_k and remaining_indices:
            best_score = -np.inf
            best_idx = -1

            for idx in remaining_indices:
                # Similarity to query
                similarity_to_query = float(cosine_similarity(
                    [query_embedding_np],
                    [self.embeddings[idx].cpu().numpy()]
                )[0][0])

                # Maximum similarity to already selected sentences
                max_similarity_to_selected = 0
                for selected_idx in selected_indices:
                    similarity = float(cosine_similarity(
                        [self.embeddings[idx].cpu().numpy()],
                        [self.embeddings[selected_idx].cpu().numpy()]
                    )[0][0])
                    max_similarity_to_selected = max(max_similarity_to_selected, similarity)

                # Calculate MMR score
                mmr_score = diversity_factor * similarity_to_query - (1 - diversity_factor) * max_similarity_to_selected

                if mmr_score > best_score:
                    best_score = mmr_score
                    best_idx = idx

            if best_idx != -1:
                selected_indices.append(best_idx)
                remaining_indices.remove(best_idx)
            else:
                break

        return selected_indices

    def highlight_transcript(self, ranked_results: List[Dict],
                             context_sentences: int = 1) -> List[Dict]:
        """
        Create a highlighted version of the transcript with context around the top hits.

        Args:
            ranked_results: The results from the query method
            context_sentences: Number of sentences to include before and after the matched sentence

        Returns:
            List of transcript segments with highlighted sentences and context
        """
        highlights = []

        for result in ranked_results:
            position = result['position']
            start_pos = max(0, position - context_sentences)
            end_pos = min(len(self.sentences), position + context_sentences + 1)

            # Get context sentences
            context = self.sentences[start_pos:end_pos]

            # Mark which sentence is the actual match
            is_highlight = [False] * len(context)
            highlight_pos = position - start_pos
            if 0 <= highlight_pos < len(context):
                is_highlight[highlight_pos] = True

            highlights.append({
                'sentences': context,
                'is_highlight': is_highlight,
                'score': result['score'],
                'start_position': start_pos,
                'end_position': end_pos - 1
            })

        return highlights

In [6]:
def main(transcript_text: str, user_query: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Main function to rank and highlight transcript sentences based on a user query.

    Args:
        transcript_text: The transcript text from YouTube
        user_query: The user's query

    Returns:
        Tuple of (ranked_results, highlighted_transcript)
    """
    # Initialize the ranker
    ranker = TranscriptRanker()

    # Process the transcript
    ranker.prepare_transcript(transcript_text)

    # Query for relevant sentences
    ranked_results = ranker.query(user_query, top_k=5, diversity_factor=0.7)

    # Get highlighted transcript segments
    highlighted_transcript = ranker.highlight_transcript(ranked_results, context_sentences=1)

    return ranked_results, highlighted_transcript

In [7]:
# Example usage
if __name__ == "__main__":
    # Example transcript (in practice, this would come from YouTube API)
    sample_transcript = """
    Machine learning has become an essential part of modern technology. It powers recommendation systems,
    voice assistants, and autonomous vehicles. Deep learning, a subset of machine learning, uses neural
    networks with many layers to process data. Transfer learning allows models trained on one task to be
    fine-tuned for another. Reinforcement learning teaches agents to make decisions by rewarding good actions.
    Natural language processing helps computers understand and generate human language. Computer vision enables
    machines to interpret and make decisions based on visual data. Generative AI can create new content like
    images, text, and music. Ethical considerations in AI include fairness, transparency, and privacy.
    Explainable AI aims to make model decisions understandable to humans.
    """

    # Example query
    query = "How does reinforcement learning work?"

    # Process the query and transcript
    ranked_results, highlighted_transcript = main(sample_transcript, query)

    # Print results
    print("Top ranked sentences:")
    for i, result in enumerate(ranked_results):
        print(f"{i+1}. Score: {result['score']:.4f} - {result['sentence']}")

    print("\nHighlighted transcript segments:")
    for i, segment in enumerate(highlighted_transcript):
        print(f"Segment {i+1} (Score: {segment['score']:.4f}):")
        for j, (sentence, is_highlight) in enumerate(zip(segment['sentences'], segment['is_highlight'])):
            if is_highlight:
                print(f">>> {sentence}")
            else:
                print(f"    {sentence}")
        print()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Top ranked sentences:
1. Score: 0.6950 - Reinforcement learning teaches agents to make decisions by rewarding good actions.
2. Score: 0.3259 - Deep learning, a subset of machine learning, uses neural
    networks with many layers to process data.
3. Score: 0.3246 - Explainable AI aims to make model decisions understandable to humans.
4. Score: 0.3215 - It powers recommendation systems,
    voice assistants, and autonomous vehicles.
5. Score: 0.2805 - Computer vision enables
    machines to interpret and make decisions based on visual data.

Highlighted transcript segments:
Segment 1 (Score: 0.6950):
    Transfer learning allows models trained on one task to be
    fine-tuned for another.
>>> Reinforcement learning teaches agents to make decisions by rewarding good actions.
    Natural language processing helps computers understand and generate human language.

Segment 2 (Score: 0.3259):
    It powers recommendation systems,
    voice assistants, and autonomous vehicles.
>>> Deep learni