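### Building and Logging Semantic Search Systems

This notebook encodes a small document corpus with a SentenceTransformer model, runs a few sample queries against it with semantic search, and logs the resulting parameters, artifacts, metrics and model to MLflow.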

### Importing Packages

In [4]:
import tempfile
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
from mlflow.models import infer_signature
from sentence_transformers import SentenceTransformer, util

### Setting the MLflow Tracking URI and Experiment

In [5]:
# Where runs are recorded, and the experiment they are grouped under.
TRACKING_URI = 'http://127.0.0.1:5000'
EXPERIMENT_NAME = 'Semantic Search'

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/07/07 14:04:17 INFO mlflow.tracking.fluent: Experiment with name 'Semantic Search' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/952327830813475046', creation_time=1751877257966, experiment_id='952327830813475046', last_update_time=1751877257966, lifecycle_stage='active', name='Semantic Search', tags={}>

### Defining the Document Corpus (Search Space)

In [6]:
# Toy corpus for the demo: every sentence below is one searchable document.
# Queries are matched against these entries by position in the list.
documents: list[str] = [
    # Core AI / ML concepts
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing helps computers understand text.",
    "Computer vision enables machines to interpret visual information.",
    "Reinforcement learning trains agents through trial and error.",
    # Surrounding tooling and disciplines
    "Data science combines statistics and programming for insights.",
    "Cloud computing provides scalable infrastructure resources.",
    "MLflow helps manage the machine learning lifecycle.",
]

### Building and Logging the Semantic Search System

In [7]:

def build_semantic_search_system(
    corpus=None,
    queries=None,
    model_name="all-MiniLM-L6-v2",
    top_k=3,
):
    """Build, evaluate and log a complete semantic search system.

    Inside a single MLflow run named ``semantic_search_system`` this:

    1. loads the SentenceTransformer ``model_name`` and logs its parameters,
    2. encodes ``corpus`` and logs it as the ``corpus.csv`` artifact,
    3. retrieves the ``top_k`` best corpus hits for every query and logs them
       as the ``search_results.csv`` artifact,
    4. logs the mean similarity of the rank-1 hits and of all retrieved hits,
    5. logs the model itself with an inferred signature.

    Args:
        corpus: Iterable of document strings to search over. Defaults to the
            notebook-level ``documents`` list.
        queries: Iterable of query strings used to exercise the system.
            Defaults to four built-in sample questions.
        model_name: Name passed to ``SentenceTransformer``.
        top_k: Number of hits requested per query.

    Returns:
        The value returned by ``mlflow.sentence_transformers.log_model``.

    Raises:
        ValueError: If ``corpus`` or ``queries`` is empty, or ``top_k`` < 1.
    """
    corpus = list(documents if corpus is None else corpus)
    if queries is None:
        queries = [
            "What is artificial intelligence?",
            "How do neural networks work?",
            "Tell me about text processing",
            "What tools help with ML development?",
        ]
    queries = list(queries)

    # Validate before opening a run so bad input does not leave an empty run behind.
    if not corpus:
        raise ValueError("corpus must contain at least one document")
    if not queries:
        raise ValueError("queries must contain at least one query")
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    with mlflow.start_run(run_name="semantic_search_system"):
        # Load the sentence transformer
        model = SentenceTransformer(model_name)

        # Log model parameters
        mlflow.log_params(
            {
                "model_name": model_name,
                "embedding_dimension": model.get_sentence_embedding_dimension(),
                "max_seq_length": model.max_seq_length,
                "corpus_size": len(corpus),
            }
        )

        # Encode the document corpus
        print("Encoding document corpus...")
        corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

        # Save the corpus text as an artifact (the embeddings are not persisted)
        _log_dataframe_as_csv(pd.DataFrame({"documents": corpus}), "corpus.csv")

        # Perform semantic search for each query and log the ranked hits
        search_results = _run_test_queries(
            model, corpus, corpus_embeddings, queries, top_k
        )
        results_df = pd.DataFrame(search_results)
        _log_dataframe_as_csv(results_df, "search_results.csv")

        # Calculate evaluation metrics
        avg_top1_score = results_df.loc[
            results_df["rank"] == 1, "similarity_score"
        ].mean()
        avg_topk_score = results_df["similarity_score"].mean()

        mlflow.log_metrics(
            {
                "avg_top1_similarity": avg_top1_score,
                # With the default top_k=3 this is "avg_top3_similarity".
                f"avg_top{top_k}_similarity": avg_topk_score,
                "total_queries_tested": len(queries),
            }
        )

        # Log the model with inference signature
        signature = infer_signature(queries, model.encode(queries))

        model_info = mlflow.sentence_transformers.log_model(
            model=model,
            name="semantic_search_model",
            signature=signature,
            input_example=queries[:2],
        )

        print("\nModel logged successfully!")
        print(f"Average top-1 similarity: {avg_top1_score:.4f}")
        print(f"Average top-{top_k} similarity: {avg_topk_score:.4f}")

        return model_info


def _run_test_queries(model, corpus, corpus_embeddings, queries, top_k):
    """Search the corpus for each query, print the hits, return one record per hit."""
    records = []
    for query in queries:
        print(f"\nSearching for: '{query}'")

        # Encode the query
        query_embedding = model.encode(query, convert_to_tensor=True)

        # A single query was passed, so only the first result list is relevant.
        hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

        # Rank comes from the hit's position in the returned list, so a repeated
        # query string still gets ranks starting at 1.
        for rank, hit in enumerate(hits, start=1):
            document = corpus[hit["corpus_id"]]
            records.append(
                {
                    "query": query,
                    "document": document,
                    "similarity_score": hit["score"],
                    "rank": rank,
                }
            )
            print(f"  Score: {hit['score']:.4f} - {document}")

    return records


def _log_dataframe_as_csv(frame, filename):
    """Log ``frame`` to the active MLflow run as a CSV artifact named ``filename``."""
    # Stage the file in a temporary directory so nothing is left in the
    # working directory after the artifact has been logged.
    with tempfile.TemporaryDirectory() as staging_dir:
        csv_path = Path(staging_dir) / filename
        frame.to_csv(csv_path, index=False)
        mlflow.log_artifact(str(csv_path))


# Run the semantic search system. The function returns the result of
# mlflow.sentence_transformers.log_model, kept here as `model_info`.
model_info = build_semantic_search_system()

Encoding document corpus...

Searching for: 'What is artificial intelligence?'
  Score: 0.6593 - Machine learning is a subset of artificial intelligence.
  Score: 0.3906 - Natural language processing helps computers understand text.
  Score: 0.3812 - Computer vision enables machines to interpret visual information.

Searching for: 'How do neural networks work?'
  Score: 0.6108 - Deep learning uses neural networks with multiple layers.
  Score: 0.3995 - Computer vision enables machines to interpret visual information.
  Score: 0.3719 - Machine learning is a subset of artificial intelligence.

Searching for: 'Tell me about text processing'
  Score: 0.6680 - Natural language processing helps computers understand text.
  Score: 0.2696 - Computer vision enables machines to interpret visual information.
  Score: 0.2294 - Data science combines statistics and programming for insights.

Searching for: 'What tools help with ML development?'
  Score: 0.5608 - MLflow helps manage the machine learn