# 🎯 Interactive Embeddings & RAG Explorer

Welcome to this interactive exploration of embeddings and RAG systems! This notebook is designed to help you understand:

1. How text is converted to embeddings
2. How semantic relationships are captured
3. How different embedding models compare
4. How RAG systems work in practice

Each section includes interactive visualizations and experiments to help build intuition. 🚀

In [None]:
%pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import torch
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
import lancedb
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders.directory import DirectoryLoader
from tqdm.notebook import tqdm

## 1. 🔍 Understanding Semantic Relationships

Let's start by exploring how different types of text are related in embedding space. We'll use several example sets to demonstrate different concepts.

In [None]:
# Example Sets
class ExampleSets:
    """Hand-picked sentence collections used by the interactive demos.

    Every attribute is a list of four short English sentences. Three of the
    lists stay on a single topic; ``MIXED`` deliberately combines unrelated
    topics so that differences in similarity are easy to see.
    """

    # Sentences about artificial intelligence and machine learning.
    AI_ML = [
        "I love machine learning",
        "Artificial intelligence is fascinating",
        "Neural networks are powerful",
        "Deep learning revolutionizes AI"
    ]

    # Sentences about animals and what they are doing.
    ANIMALS = [
        "The cat is sleeping",
        "My dog is taking a nap",
        "The bird is flying high",
        "Fish swim in the ocean"
    ]

    # Sentences about programming and query languages.
    PROGRAMMING = [
        "Python is a programming language",
        "Java is also a programming language",
        "JavaScript runs in browsers",
        "SQL queries databases"
    ]

    # One sentence from each topic above plus an unrelated one about weather.
    MIXED = [
        "The cat is sleeping",
        "Python is a programming language",
        "Artificial intelligence is fascinating",
        "The weather is sunny today"
    ]

    @classmethod
    def get_all_sets(cls) -> Dict[str, List[str]]:
        """Return every example set keyed by its human-readable label.

        The mapping preserves the order in which the sets are listed here and
        its values are the class-level lists themselves (not copies).
        """
        labelled_sets = (
            ('AI and ML', cls.AI_ML),
            ('Animals', cls.ANIMALS),
            ('Programming', cls.PROGRAMMING),
            ('Mixed Topics', cls.MIXED),
        )
        return dict(labelled_sets)

In [None]:
class EmbeddingVisualizer:
    """Embed texts with a SentenceTransformer model and build Plotly views of them.

    Offers a cosine-similarity heatmap, 2-D PCA / t-SNE scatter plots and a
    top-k similarity search. Every plotting method returns Plotly figures and
    leaves displaying them to the caller.
    """

    # Short alias -> model identifier handed to SentenceTransformer.
    AVAILABLE_MODELS = {
        'mpnet': 'sentence-transformers/all-mpnet-base-v2',
        'minilm': 'sentence-transformers/all-MiniLM-L6-v2',
        'multilingual': 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    }

    # Loaded models keyed by model identifier and shared by all instances, so
    # creating a new visualizer (the explorers do this on every interaction)
    # does not load the same model again.
    _model_cache: Dict[str, Any] = {}

    def __init__(self, model_name: str = 'mpnet'):
        """Select the embedding model by its alias in ``AVAILABLE_MODELS``.

        Args:
            model_name: One of the keys of ``AVAILABLE_MODELS``.

        Raises:
            KeyError: If ``model_name`` is not a known alias.
        """
        if model_name not in self.AVAILABLE_MODELS:
            raise KeyError(
                f"Unknown model {model_name!r}; expected one of {sorted(self.AVAILABLE_MODELS)}"
            )
        self.model_name = model_name

        model_id = self.AVAILABLE_MODELS[model_name]
        cache = EmbeddingVisualizer._model_cache
        if model_id not in cache:
            cache[model_id] = SentenceTransformer(model_id)
        self.model = cache[model_id]

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the selected model, showing a progress bar.

        Returns whatever ``SentenceTransformer.encode`` returns for a list of
        strings; the rest of this class treats it as a 2-D array with one row
        per text.
        """
        return self.model.encode(texts, show_progress_bar=True)

    def create_similarity_heatmap(self, texts: List[str], embeddings: np.ndarray) -> go.Figure:
        """Build a heatmap of pairwise cosine similarity between ``embeddings``.

        Args:
            texts: Labels for both axes, in the same order as ``embeddings``.
            embeddings: One embedding row per text.

        Returns:
            A Plotly figure whose cells are annotated with the similarity
            rounded to two decimals. The colour range is fixed to [-1, 1].
        """
        similarity_matrix = cosine_similarity(embeddings)

        # Create enhanced heatmap
        fig = go.Figure(data=go.Heatmap(
            z=similarity_matrix,
            x=[f"<b>{text}</b>" for text in texts],  # Bold text
            y=[f"<b>{text}</b>" for text in texts],
            colorscale='RdBu',
            # plotly.py's RdBu runs red (low) -> blue (high). Reverse it so that
            # high similarity is red, which is what the notebook text describes.
            reversescale=True,
            zmin=-1, zmax=1,
            text=np.round(similarity_matrix, 2),
            texttemplate='%{text}',
            textfont={"size": 10},
            hoverongaps=False
        ))

        fig.update_layout(
            title={
                'text': f'Semantic Similarity Matrix ({self.model_name})',
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top',
                'font': dict(size=20)
            },
            width=900,
            height=700,
            xaxis_tickangle=-45,
            xaxis={'side': 'bottom'},
            margin=dict(t=100, l=100, r=100, b=100)
        )

        return fig

    @staticmethod
    def _text_scatter(coords_2d: np.ndarray, texts: List[str], axis_label: str) -> go.Scatter:
        """Build the labelled marker trace shared by the PCA and t-SNE plots."""
        return go.Scatter(
            x=coords_2d[:, 0],
            y=coords_2d[:, 1],
            mode='markers+text',
            text=texts,
            textposition="top center",
            marker=dict(size=10, color=np.arange(len(texts)), colorscale='Viridis'),
            hovertemplate=(
                '<b>Text:</b> %{text}<br>'
                f'<b>{axis_label}1:</b> %{{x:.2f}}<br>'
                f'<b>{axis_label}2:</b> %{{y:.2f}}'
            )
        )

    def create_embedding_viz(
        self,
        texts: List[str],
        embeddings: np.ndarray,
        method='both',
        similarity_threshold: float = 0.7,
    ) -> Tuple[go.Figure, ...]:
        """Project ``embeddings`` to 2-D and plot them with their text labels.

        Args:
            texts: Labels, in the same order as ``embeddings``.
            embeddings: One embedding row per text.
            method: ``'pca'``, ``'tsne'`` or ``'both'``.
            similarity_threshold: In the PCA plot, pairs whose cosine
                similarity exceeds this value are joined by a faint line.

        Returns:
            A tuple with one figure per requested method: PCA first, then
            t-SNE. It has one element for ``'pca'`` / ``'tsne'`` and two for
            ``'both'``.

        Raises:
            ValueError: If ``method`` is not recognised or fewer than two
                texts are given (a 2-D projection needs at least two points).
        """
        if method not in ('pca', 'tsne', 'both'):
            raise ValueError(f"method must be 'pca', 'tsne' or 'both', got {method!r}")
        if len(texts) < 2:
            raise ValueError("At least two texts are required for a 2-D projection")

        figs = []

        if method in ('pca', 'both'):
            pca = PCA(n_components=2)
            coords_2d = pca.fit_transform(embeddings)

            fig_pca = go.Figure()
            fig_pca.add_trace(self._text_scatter(coords_2d, texts, 'PC'))

            # Collect every "similar" pair into one trace; a None entry breaks
            # the line between segments, so one trace replaces one per pair.
            similarity_matrix = cosine_similarity(embeddings)
            line_x, line_y = [], []
            for i in range(len(texts)):
                for j in range(i + 1, len(texts)):
                    if similarity_matrix[i, j] > similarity_threshold:
                        line_x += [float(coords_2d[i, 0]), float(coords_2d[j, 0]), None]
                        line_y += [float(coords_2d[i, 1]), float(coords_2d[j, 1]), None]

            if line_x:
                fig_pca.add_trace(go.Scatter(
                    x=line_x,
                    y=line_y,
                    mode='lines',
                    line=dict(width=1, color='rgba(100,100,100,0.2)'),
                    hoverinfo='skip',
                    showlegend=False
                ))

            fig_pca.update_layout(
                title=f'PCA Visualization ({self.model_name})<br><sup>Explained variance: {pca.explained_variance_ratio_.sum():.2%}</sup>',
                width=800,
                height=600,
                showlegend=False,
                hovermode='closest'
            )

            figs.append(fig_pca)

        if method in ('tsne', 'both'):
            # t-SNE needs perplexity < number of samples; cap it for tiny inputs.
            perplexity = min(30, len(texts) - 1)
            tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
            coords_2d = tsne.fit_transform(embeddings)

            fig_tsne = go.Figure()
            fig_tsne.add_trace(self._text_scatter(coords_2d, texts, 't-SNE'))

            fig_tsne.update_layout(
                title=f't-SNE Visualization ({self.model_name})',
                width=800,
                height=600,
                showlegend=False,
                hovermode='closest'
            )

            figs.append(fig_tsne)

        return tuple(figs)

    def find_similar(self, query: str, texts: List[str], top_k: int = 3) -> Tuple[go.Figure, pd.DataFrame]:
        """Rank ``texts`` by cosine similarity to ``query`` and chart the best ones.

        Args:
            query: Text to search for.
            texts: Candidate texts to compare against the query.
            top_k: Number of results wanted; capped at ``len(texts)``.

        Returns:
            ``(figure, results)``. ``results`` has columns ``Text`` and
            ``Similarity`` sorted by ascending similarity, so the best match
            is the last row and the top bar of the horizontal bar chart.

        Raises:
            ValueError: If ``texts`` is empty or ``top_k`` is smaller than 1.
        """
        if not texts:
            raise ValueError("texts must contain at least one entry")
        if top_k < 1:
            # A slice of [-0:] would silently return *all* texts.
            raise ValueError("top_k must be at least 1")
        k = min(top_k, len(texts))

        # Embed query and texts
        query_embedding = self.embed_texts([query])
        text_embeddings = self.embed_texts(texts)

        # Calculate similarities
        similarities = cosine_similarity(query_embedding, text_embeddings)[0]

        # Indices of the k highest similarities (argsort is ascending)
        top_k_idx = np.argsort(similarities)[-k:]

        # Create results DataFrame
        results_df = pd.DataFrame({
            'Text': [texts[i] for i in top_k_idx],
            'Similarity': similarities[top_k_idx]
        }).sort_values('Similarity', ascending=True)

        # Create enhanced visualization
        fig = go.Figure()

        # Add bars with gradient color
        fig.add_trace(go.Bar(
            x=results_df['Similarity'],
            y=results_df['Text'],
            orientation='h',
            marker=dict(
                color=results_df['Similarity'],
                colorscale='Viridis',
                showscale=True
            ),
            hovertemplate='<b>Text:</b> %{y}<br>' +
                         '<b>Similarity:</b> %{x:.3f}'
        ))

        fig.update_layout(
            title={
                # Report the number of bars actually shown.
                'text': f'Top {k} Similar Texts to:<br>"{query}"',
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'
            },
            xaxis_title='Similarity Score',
            yaxis_title='Text',
            width=800,
            height=400,
            showlegend=False
        )

        return fig, results_df

### 1.1 Interactive Example Explorer 🔄

Use the widgets below to explore different example sets and see how they are related in embedding space.

In [None]:
def create_example_explorer():
    """Display dropdowns for exploring the example sets in embedding space.

    Three dropdowns choose the example set, the embedding model and the 2-D
    projection method. Each change re-embeds the selected texts and redraws
    the similarity heatmap and the projection plot(s).

    Results are rendered into a dedicated ``Output`` widget that is cleared
    before every redraw, so a new selection replaces the previous plots
    instead of being appended below them.
    """
    example_sets = ExampleSets.get_all_sets()

    # Create widgets
    example_set = widgets.Dropdown(
        options=list(example_sets),
        description='Example Set:',
        value='AI and ML'
    )

    model_select = widgets.Dropdown(
        options=list(EmbeddingVisualizer.AVAILABLE_MODELS),
        description='Model:',
        value='mpnet'
    )

    viz_method = widgets.Dropdown(
        options=['both', 'pca', 'tsne'],
        description='Visualization:',
        value='both'
    )

    results = widgets.Output()

    def render():
        """Recompute everything for the current selection and redraw it."""
        # wait=True defers the clear until new output arrives (avoids flicker).
        results.clear_output(wait=True)
        with results:
            visualizer = EmbeddingVisualizer(model_select.value)
            texts = example_sets[example_set.value]
            embeddings = visualizer.embed_texts(texts)

            # Show similarity heatmap
            display(Markdown("### 📊 Similarity Heatmap"))
            display(Markdown("This heatmap shows how similar each text is to every other text. Darker red indicates higher similarity."))
            display(visualizer.create_similarity_heatmap(texts, embeddings))

            # Show embedding visualizations
            display(Markdown("### 🎯 Embedding Space Visualization"))
            display(Markdown("These plots show how the texts are arranged in 2D space. Connected points are semantically similar."))
            for fig in visualizer.create_embedding_viz(texts, embeddings, method=viz_method.value):
                display(fig)

    def on_change(change):
        if change['type'] == 'change' and change['name'] == 'value':
            render()

    # Register callbacks
    for control in (example_set, model_select, viz_method):
        control.observe(on_change, names='value')

    # Create layout: controls on top, results underneath
    controls = widgets.VBox([
        widgets.HBox([example_set, model_select, viz_method]),
        results
    ])

    display(controls)
    render()  # initial render for the default selection

create_example_explorer()

VBox(children=(HBox(children=(Dropdown(description='Example Set:', options=('AI and ML', 'Animals', 'Programmi…


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

### 📊 Similarity Heatmap

This heatmap shows how similar each text is to every other text. Darker red indicates higher similarity.

### 🎯 Embedding Space Visualization

These plots show how the texts are arranged in 2D space. Connected points are semantically similar.


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

### 📊 Similarity Heatmap

This heatmap shows how similar each text is to every other text. Darker red indicates higher similarity.

### 🎯 Embedding Space Visualization

These plots show how the texts are arranged in 2D space. Connected points are semantically similar.

### 1.2 Similarity Search Explorer 🔎

Try searching for similar texts and see how different models handle the queries.

In [None]:
def create_similarity_explorer():
    """Display a small search UI over the example sets.

    The user picks an example set and a model, types a query, chooses how
    many results to show and clicks *Search*. The matching texts are shown as
    a horizontal bar chart of cosine similarities.

    Results are rendered into a dedicated ``Output`` widget that is cleared
    on every search, so repeated searches replace the previous chart instead
    of stacking up below it.
    """
    example_sets = ExampleSets.get_all_sets()

    # Create widgets
    example_set = widgets.Dropdown(
        options=list(example_sets),
        description='Search in:',
        value='Mixed Topics'
    )

    model_select = widgets.Dropdown(
        options=list(EmbeddingVisualizer.AVAILABLE_MODELS),
        description='Model:',
        value='mpnet'
    )

    query_text = widgets.Text(
        value='artificial intelligence',
        description='Query:',
        style={'description_width': 'initial'}
    )

    top_k = widgets.IntSlider(
        value=3,
        min=1,
        # Never offer more results than the largest example set contains.
        max=max(len(texts) for texts in example_sets.values()),
        description='Top K:'
    )

    results_area = widgets.Output()

    def on_search(b):
        """Run the search for the current widget values and show the chart."""
        results_area.clear_output(wait=True)
        with results_area:
            if not query_text.value.strip():
                display(Markdown("⚠️ Please enter a query to search for."))
                return

            visualizer = EmbeddingVisualizer(model_select.value)
            texts = example_sets[example_set.value]

            # find_similar also returns the results table; only the chart is shown.
            fig, _ = visualizer.find_similar(query_text.value, texts, top_k.value)
            display(Markdown(f"### 🎯 Search Results ({model_select.value})"))
            display(fig)

    search_button = widgets.Button(description='Search')
    search_button.on_click(on_search)

    # Create layout: controls on top, results underneath
    controls = widgets.VBox([
        widgets.HBox([example_set, model_select]),
        widgets.HBox([query_text, top_k, search_button]),
        results_area
    ])

    display(controls)

create_similarity_explorer()

VBox(children=(HBox(children=(Dropdown(description='Search in:', index=3, options=('AI and ML', 'Animals', 'Pr…


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

### 🎯 Search Results (mpnet)

## 2. RAG System Explorer 🔄

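Now let's explore how RAG systems work with real documents. We start with the first stage of a RAG pipeline: loading the `.txt` files under `sample_docs/` and splitting them into overlapping chunks.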

In [None]:
@dataclass
class ChunkingConfig:
    """Settings that control how documents are split into chunks.

    ``chunk_size`` and ``chunk_overlap`` are passed unchanged to the text
    splitter. Sizes are validated on construction so that a bad combination
    is reported here rather than later, when the splitter is built.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``chunk_overlap`` is
            negative, or ``chunk_overlap`` is larger than ``chunk_size``.
    """
    # Target size of each chunk.
    chunk_size: int = 500
    # Amount of text repeated between neighbouring chunks.
    chunk_overlap: int = 50
    # "recursive" selects the recursive splitter; other values are not
    # rejected here (DocumentProcessor treats them as "character").
    strategy: str = "recursive"  # or "character"

    def __post_init__(self):
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if self.chunk_overlap < 0:
            raise ValueError(f"chunk_overlap must not be negative, got {self.chunk_overlap}")
        if self.chunk_overlap > self.chunk_size:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must not exceed "
                f"chunk_size ({self.chunk_size})"
            )

class DocumentProcessor:
    """Load text files from a directory, split them into chunks and chart the result."""

    def __init__(self, config: ChunkingConfig):
        """Store ``config`` and build the matching text splitter once."""
        self.config = config
        self.splitter = self._get_splitter()

    def _get_splitter(self):
        """Return the splitter selected by ``config.strategy``.

        ``"recursive"`` selects ``RecursiveCharacterTextSplitter``; any other
        value selects ``CharacterTextSplitter``. Both are given the configured
        chunk size and overlap.
        """
        cfg = self.config
        splitter_cls = (
            RecursiveCharacterTextSplitter
            if cfg.strategy == "recursive"
            else CharacterTextSplitter
        )
        return splitter_cls(chunk_size=cfg.chunk_size, chunk_overlap=cfg.chunk_overlap)

    def load_and_split(self, directory: str) -> List[str]:
        """Load every ``*.txt`` file under ``directory`` (recursively) and split it.

        Returns:
            The text content of each resulting chunk, in splitter order.
        """
        documents = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader).load()
        pieces = self.splitter.split_documents(documents)
        return [piece.page_content for piece in pieces]

    def visualize_chunks(self, chunks: List[str]) -> go.Figure:
        """Build a bar chart with one bar per chunk, sized by character count.

        Hovering over a bar shows the chunk index, its length and its full text.
        """
        lengths = [len(chunk) for chunk in chunks]
        df = pd.DataFrame({
            'Chunk': range(len(chunks)),
            'Length': lengths,
            'Text': chunks
        })

        # Bars are coloured by their own height; the chunk text travels along
        # as customdata so the hover template can show it.
        bars = go.Bar(
            x=df['Chunk'],
            y=df['Length'],
            customdata=df['Text'],
            marker=dict(
                color=df['Length'],
                colorscale='Viridis',
                showscale=True
            ),
            hovertemplate=(
                '<b>Chunk %{x}</b><br>'
                '<b>Length:</b> %{y}<br>'
                '<b>Text:</b> %{customdata}'
            )
        )

        fig = go.Figure(data=[bars])
        fig.update_layout(
            title='Document Chunks Distribution',
            xaxis_title='Chunk Index',
            yaxis_title='Chunk Length (characters)',
            width=800,
            height=400,
            showlegend=False
        )
        return fig

def _shared_overlap_length(first: str, second: str, max_len: int) -> int:
    """Return the length of the longest suffix of ``first`` that is a prefix of ``second``.

    Only lengths up to ``max_len`` are considered; 0 means nothing is shared.
    """
    limit = min(max_len, len(first), len(second))
    for size in range(limit, 0, -1):
        if first.endswith(second[:size]):
            return size
    return 0


def create_rag_explorer(docs_dir: str = 'sample_docs'):
    """Display controls for experimenting with document chunking.

    Sliders choose the chunk size and overlap, and a dropdown chooses the
    splitting strategy. Clicking *Process Document* loads the text files in
    ``docs_dir``, splits them, charts the chunk lengths and highlights the
    text shared by the first two chunks.

    Args:
        docs_dir: Directory that holds the ``.txt`` documents to split.
    """
    # Local import: only this function needs it, and the notebook's import
    # cell does not provide the stdlib ``html`` module.
    from html import escape

    # Create widgets
    chunk_size = widgets.IntSlider(
        value=500,
        min=100,
        max=1000,
        step=50,
        description='Chunk Size:'
    )

    chunk_overlap = widgets.IntSlider(
        value=50,
        min=0,
        max=200,
        step=10,
        description='Overlap:'
    )

    strategy = widgets.Dropdown(
        options=['recursive', 'character'],
        description='Strategy:',
        value='recursive'
    )

    # Everything a click produces goes here and is cleared on the next click,
    # so repeated runs replace the previous result instead of piling up.
    results = widgets.Output()

    def on_process(b):
        results.clear_output(wait=True)
        with results:
            # The sliders allow overlap > chunk size; report that as a message
            # rather than letting it surface as an exception.
            if chunk_overlap.value > chunk_size.value:
                display(Markdown(
                    f"⚠️ **Overlap ({chunk_overlap.value}) cannot be larger than the "
                    f"chunk size ({chunk_size.value}).** Lower the overlap or raise the chunk size."
                ))
                return

            if not Path(docs_dir).is_dir():
                display(Markdown(f"⚠️ Document directory `{docs_dir}` was not found."))
                return

            config = ChunkingConfig(
                chunk_size=chunk_size.value,
                chunk_overlap=chunk_overlap.value,
                strategy=strategy.value
            )

            processor = DocumentProcessor(config)
            chunks = processor.load_and_split(docs_dir)

            if not chunks:
                display(Markdown(f"⚠️ No text chunks were produced from `{docs_dir}` (no `.txt` files, or they are empty)."))
                return

            display(Markdown("### 📊 Chunk Distribution"))
            display(Markdown("This visualization shows how your document was split into chunks. Hover over bars to see the content."))
            display(processor.visualize_chunks(chunks))

            # Show chunk overlap visualization
            if len(chunks) > 1:
                chunk1, chunk2 = chunks[0], chunks[1]

                # Highlight what the two chunks really share instead of assuming
                # exactly `chunk_overlap` characters. This also avoids slicing
                # with -0, which would mark all of chunk 1 as overlap.
                # NOTE(review): a very short match can be coincidental when the
                # first two chunks come from different documents.
                shared = _shared_overlap_length(chunk1, chunk2, chunk_overlap.value)
                boundary = len(chunk1) - shared

                display(Markdown("### 🔄 Chunk Overlap Example"))
                display(Markdown(
                    f"Configured overlap: {chunk_overlap.value} characters. "
                    f"The first two chunks share {shared} characters (highlighted):"
                ))

                # Chunk text is arbitrary document content: escape it so that
                # characters such as < or & cannot break the markup.
                head1, tail1 = escape(chunk1[:boundary]), escape(chunk1[boundary:])
                head2, tail2 = escape(chunk2[:shared]), escape(chunk2[shared:])

                # Display with highlighting
                display(HTML(f"""
                <div style='background-color: #f0f0f0; padding: 10px; margin: 10px 0;'>
                    <p><b>Chunk 1:</b> {head1}<span style='background-color: #ffeb3b'>{tail1}</span></p>
                    <p><b>Chunk 2:</b> <span style='background-color: #ffeb3b'>{head2}</span>{tail2}</p>
                </div>
                """))

    process_button = widgets.Button(description='Process Document')
    process_button.on_click(on_process)

    # Create layout: controls on top, results underneath
    controls = widgets.VBox([
        widgets.HBox([chunk_size, chunk_overlap, strategy]),
        process_button,
        results
    ])

    display(controls)

create_rag_explorer()

VBox(children=(HBox(children=(IntSlider(value=500, description='Chunk Size:', max=1000, min=100, step=50), Int…

### 📊 Chunk Distribution

This visualization shows how your document was split into chunks. Hover over bars to see the content.