# Text Embedding Models

This notebook demonstrates how to generate dense and sparse text embeddings with the `fastembed` library and attach them to the rows of a pandas DataFrame.

In [None]:
# Install the libraries this notebook imports (fastembed, numpy, pandas).
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
%pip install fastembed numpy pandas

In [None]:
import numpy as np
from fastembed import TextEmbedding, SparseTextEmbedding, SparseEmbedding

## Define Default Model Names

We'll use the following models for our embeddings:

In [None]:
# Model identifiers used as defaults when no explicit model name is supplied.
DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
SPARSE_MODEL_NAME = "Qdrant/bm42-all-minilm-l6-v2-attentions"

## EmbeddingModels Class Definition

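This class loads one sparse and one dense embedding model and can generate both kinds of embeddings, either for a list of texts or for a text column of a pandas DataFrame.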

In [None]:
class EmbeddingModels:
    """Pair of a sparse and a dense text-embedding model.

    Both models are loaded once in ``__init__``; the methods then embed lists
    of texts, or a text column of a DataFrame, using the configured batch size.
    """

    def __init__(self, sparse_model_name=SPARSE_MODEL_NAME, dense_model_name=DENSE_MODEL_NAME, batch_size=32):
        """Load the sparse and the dense embedding model.

        Args:
            sparse_model_name: Model name handed to ``SparseTextEmbedding``.
            dense_model_name: Model name handed to ``TextEmbedding``.
            batch_size: Batch size stored on the instance and forwarded to
                every ``embed`` call; it is also passed to both constructors.
        """
        self.batch_size = batch_size
        print(f"Loading sparse model: {sparse_model_name}")
        self.sparse_model = SparseTextEmbedding(model_name=sparse_model_name, batch_size=batch_size)

        print(f"Loading dense model: {dense_model_name}")
        self.dense_model = TextEmbedding(model_name=dense_model_name, batch_size=batch_size)

    def get_sparse_embeddings(self, texts: list[str]) -> list[SparseEmbedding]:
        """Generate sparse embeddings for a list of texts.

        Args:
            texts: Texts to embed.

        Returns:
            The results of the sparse model's ``embed`` call, collected into
            a list.
        """
        return list(self.sparse_model.embed(texts, batch_size=self.batch_size))

    def get_dense_embeddings(self, texts: list[str]) -> list[np.ndarray]:
        """Generate dense embeddings for a list of texts.

        Args:
            texts: Texts to embed.

        Returns:
            The results of the dense model's ``embed`` call, collected into
            a list.
        """
        return list(self.dense_model.embed(texts, batch_size=self.batch_size))

    def process_dataframe(self, df, text_column='text', copy=False):
        """Add ``sparse_embedding`` and ``dense_embedding`` columns to a DataFrame.

        Both embedding lists are computed before any column is written, so an
        error raised while embedding leaves the frame without a partially
        added column.

        Args:
            df: DataFrame holding the texts to embed.
            text_column: Name of the column that contains the texts.
            copy: If True, the columns are added to ``df.copy()`` and the
                caller's frame is left unmodified. The default (False) keeps
                the original behaviour: ``df`` is modified in place.

        Returns:
            The DataFrame carrying the two new columns: ``df`` itself when
            ``copy`` is False, otherwise the copy.

        Raises:
            KeyError: If ``text_column`` is not a column of ``df``.
        """
        texts = df[text_column].tolist()

        print("Generating sparse embeddings...")
        sparse_embeddings = self.get_sparse_embeddings(texts)

        print("Generating dense embeddings...")
        dense_embeddings = self.get_dense_embeddings(texts)

        # Only touch the frame once both computations have succeeded.
        target = df.copy() if copy else df
        target["sparse_embedding"] = sparse_embeddings
        target["dense_embedding"] = dense_embeddings

        return target

## Usage Example

Here's how to use the EmbeddingModels class:

In [None]:
import pandas as pd

# Create a small sample DataFrame with one free-text column to embed
sample_df = pd.DataFrame({
    'id': [1, 2, 3],
    'text': [
        'A red Honda Civic with manual transmission',
        'A blue Toyota Camry with automatic transmission',
        'A black Ford Mustang with sports package'
    ]
})

# Initialize embedding models (loads both models with the default names)
embedding_models = EmbeddingModels()

# Process the DataFrame: adds the sparse and dense embedding columns
processed_df = embedding_models.process_dataframe(sample_df)

# Sanity check: both embedding columns must be present after processing
assert {"sparse_embedding", "dense_embedding"} <= set(processed_df.columns)

# Show the results; a bare last expression uses the notebook's rich
# DataFrame rendering instead of the plain-text repr produced by print()
print("\nDataFrame with embeddings:")
processed_df.head()