## src/main.py

**- src/routers/search.py**

**- src/services/opensearch/factory.py**

```python
from src.config import get_settings
from src.db.factory import make_database

from src.services.opensearch.factory import make_opensearch_client
from src.routers import search # papers, ping
# from src.services.arxiv.factory import make_arxiv_client
# from src.services.pdf_parser.factory import make_pdf_parser_service

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared services at startup and release them at shutdown.

    Code before ``yield`` runs once when the application starts and stores the
    shared objects on ``app.state``; the ``finally`` clause runs at shutdown.

    :param app: Application whose ``state`` receives the shared services
    """
    # load settings - app.state.settings
    database = make_database()
    app.state.database = database
    try:
        opensearch_client = make_opensearch_client()
        app.state.opensearch_client = opensearch_client
        if opensearch_client.health_check():
            # Ensure index exists - opensearch_client.create_index(force=False)
            # Get index statistics
            stats = opensearch_client.get_index_stats()

        # initialize arxiv client - app.state.arxiv_client
        # initialize pdf-parser client - app.state.pdf_parser
        yield
    finally:
        # Release the database even when a later startup step or the running
        # application raised; without try/finally the teardown was skipped.
        database.teardown()

# Application instance. The version is read from the APP_VERSION environment
# variable (falling back to "0.1.0") and startup/shutdown is handled by `lifespan`.
app = FastAPI(
    title="arXiv Paper Curator API",
    description="Personal arXiv CS.AI paper curator with RAG capabilities",
    version=os.getenv("APP_VERSION", "0.1.0"),
    lifespan=lifespan,
)

# Include routers: each one is mounted under the /api/v1 prefix.
# Only the search router is active; ping and papers are commented out.
# app.include_router(ping.router, prefix="/api/v1")
# app.include_router(papers.router, prefix="/api/v1")
app.include_router(search.router, prefix="/api/v1")

```

## src/routers

### src/routers/search.py

``` routes -> model from schemas.api.search```

``` routes -> get client by calling src.services.opensearch.factory```

#### Imports

```python
from src.schemas.api.search import SearchHit, SearchRequest, SearchResponse
"""
schemas/api/search.py : contains Model

class SearchRequest(BaseModel):
    # Search request model
class SearchHit(BaseModel):
    # Individual search result
class SearchResponse(BaseModel):
    # Search response model
"""

from src.dependencies import OpenSearchDep

# src/dependencies.py (OpenSearchClient itself is defined in src/services/opensearch/client.py)

def get_opensearch_client(request: Request) -> OpenSearchClient:
    """Return the OpenSearch client kept on the application state.

    :param request: Incoming request; only ``request.app.state`` is read
    :returns: The object stored as ``app.state.opensearch_client``
    """
    app_state = request.app.state
    return app_state.opensearch_client

# Dependency alias: a route parameter annotated with OpenSearchDep is typed as
# OpenSearchClient and resolved through Depends(get_opensearch_client).
OpenSearchDep = Annotated[OpenSearchClient, Depends(get_opensearch_client)]

```

#### Original

```python
router = APIRouter(prefix="/search", tags=["search"])


@router.post("/", response_model=SearchResponse)
async def search_papers(request: SearchRequest, opensearch_client: OpenSearchDep) -> SearchResponse:
    """Search papers with BM25 relevance scoring over titles, abstracts and authors.

    :param request: Search request body
    :param opensearch_client: OpenSearch client supplied by the dependency
    :returns: Response carrying the query, total count, hits and any backend error
    :raises HTTPException: 503 when OpenSearch is unhealthy, 500 for any other failure
    """
    try:
        # Check if OpenSearch is healthy - status_code=503 (HTTPException)
        # Perform search with filters
        # Convert results to response model
        return SearchResponse(query=request.query, total=results.get("total", 0), hits=hits, error=results.get("error"))
    except HTTPException:
        # Deliberate HTTP errors (such as the 503 raised by the health check
        # above) must pass through unchanged; the generic handler below would
        # otherwise re-wrap them as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}") from e
```

## src/services/opensearch

``` routers -> factory method from services/opensearch -> client -> index_config & query_builder```

### src/services/opensearch/factory.py

**- src/services/opensearch/client.py**


```python

# from .client import OpenSearchClient

@lru_cache(maxsize=1)
def make_opensearch_client() -> OpenSearchClient:
    """Build the OpenSearch client once and reuse it on later calls.

    ``lru_cache`` memoizes the result, so every call after the first returns
    the same client instance instead of constructing a new one.

    :returns: Cached instance of the OpenSearch client
    :rtype: OpenSearchClient
    """
    app_settings = get_settings()
    opensearch_host = app_settings.opensearch.host
    return OpenSearchClient(host=opensearch_host, settings=app_settings)
```

### src/services/opensearch/index_config.py

```python

ARXIV_PAPERS_INDEX = "arxiv-papers"

# Index mapping configuration for arXiv papers.
# `{...}` and `# ...` mark parts that are abbreviated in these notes.
ARXIV_PAPERS_MAPPING = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "analyzer": {...},
        },
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "arxiv_id": {"type": "keyword"},
            "title": {...},
            # ... (further fields abbreviated)
            "abstract": {"type": "text"},  # remaining options abbreviated
            # ... (further fields abbreviated)
        },
    },
}

```

### src/services/opensearch/query_builder.py

```python

class PaperQueryBuilder:
    """Query builder for arXiv papers search.

    Builds OpenSearch request bodies that combine a boosted multi-field text
    query with optional category filters, highlighting, pagination and sorting.
    """

    def __init__(
        self,
        query: str,
        size: int = 10,
        from_: int = 0,
        fields: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        track_total_hits: bool = True,
        latest_papers: bool = False,
    ):
        """Store the search parameters read by the ``_build_*`` helpers.

        :param query: Free-text search string; a blank string matches everything
        :param size: Value emitted as ``size`` in the request body
        :param from_: Value emitted as ``from`` in the request body
        :param fields: Fields to search; defaults to boosted title/abstract/authors
        :param categories: When given, results are filtered to these categories
        :param track_total_hits: Value emitted as ``track_total_hits``
        :param latest_papers: Sort by publication date even for text queries
        """
        self.query = query
        self.size = size
        self.from_ = from_
        # Multi-field search with boosting: title (highest), abstract (medium), authors (lower)
        self.fields = fields or ["title^3", "abstract^2", "authors^1"]
        self.categories = categories
        self.track_total_hits = track_total_hits
        self.latest_papers = latest_papers

    def build(self) -> Dict[str, Any]:
        """Assemble the complete search request body.

        :returns: Dictionary with query, paging, source, highlight and optional sort
        """
        query_body = {
            "query": self._build_query(),
            "size": self.size,
            "from": self.from_,
            "track_total_hits": self.track_total_hits,
            # NOTE(review): _build_source_fields is not shown in this excerpt.
            "_source": self._build_source_fields(),
            "highlight": self._build_highlight(),
        }
        # Add sorting if needed
        sort = self._build_sort()
        if sort:
            query_body["sort"] = sort

        return query_body

    def _build_query(self) -> Dict[str, Any]:
        """Build the main bool query from the text clause and the filters.

        :returns: ``{"bool": {...}}`` with a ``must`` list and, if any, a ``filter`` list
        """
        # Only search text when there is something to search for
        must_clauses: List[Dict[str, Any]] = []
        if self.query.strip():
            must_clauses.append(self._build_text_query())

        # If no text query, match all documents
        bool_query: Dict[str, Any] = {"must": must_clauses or [{"match_all": {}}]}

        filter_clauses = self._build_filters()
        if filter_clauses:
            bool_query["filter"] = filter_clauses

        return {"bool": bool_query}

    def _build_text_query(self) -> Dict[str, Any]:
        """Build the main text search query (``multi_match`` body abbreviated here)."""
        return {
            "multi_match": {...}
        }

    def _build_filters(self) -> List[Dict[str, Any]]:
        """Build filter clauses for the query.

        :returns: A ``terms`` filter on ``categories`` when categories were
            given, otherwise an empty list
        """
        filters = []
        # Category filter
        if self.categories:
            filters.append({"terms": {"categories": self.categories}})
        return filters

    def _build_highlight(self) -> Dict[str, Any]:
        """Build highlighting configuration.

        :returns: Highlight configuration dictionary
        """
        # NOTE(review): body not shown in this excerpt.

    def _build_sort(self) -> Optional[List[Dict[str, Any]]]:
        """Build sorting configuration.

        :returns: Newest-first sort when ``latest_papers`` is set or the query
            is blank; ``None`` for text queries, which use relevance scoring
        """
        # latest_papers always forces date ordering, and a blank query is
        # also listed by publication date (newest first).
        if self.latest_papers or not self.query.strip():
            return [{"published_date": {"order": "desc"}}, "_score"]

        # For text queries, use relevance scoring (no explicit sort)
        return None

# Builder pattern
def build_search_query(
    query: str,
    size: int = 10,
    from_: int = 0,
    categories: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Build a search request body in a single call.

    Convenience wrapper that constructs a ``PaperQueryBuilder`` from the given
    arguments and returns the result of its ``build()``.

    :param query: Free-text search string
    :param size: Passed to the builder as ``size``
    :param from_: Passed to the builder as ``from_``
    :param categories: Optional category filter passed to the builder
    :returns: The request body produced by the builder
    """
    return PaperQueryBuilder(
        query=query,
        size=size,
        from_=from_,
        categories=categories,
    ).build()

```

### src/services/opensearch/client.py

**- src/services/opensearch/index_config.py**

**- src/services/opensearch/query_builder.py**

```python
from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError, RequestError

class OpenSearchClient:
    """Client for OpenSearch operations including index management and search.

    Provides methods for creating the index, indexing papers (one at a time or
    in bulk), searching with BM25 scoring, and inspecting index and cluster state.
    """

    def __init__(self, host: str = "http://localhost:9200", settings: Optional[Settings] = None):
        """Connect to OpenSearch and resolve the index name to operate on.

        :param host: URL of the OpenSearch node
        :param settings: Application settings; when omitted, the default index
            name ``ARXIV_PAPERS_INDEX`` is used
        """
        self.settings = settings
        # NOTE(review): only the host is passed here; the original call's
        # arguments were abbreviated - add further connection options as needed.
        self.client = OpenSearch(hosts=[host])
        # Use configured index name, fall back to constant if not set
        configured_index = settings.opensearch.index_name if settings is not None else None
        self.index_name = configured_index or ARXIV_PAPERS_INDEX

    def create_index(self, force: bool = False) -> bool:
        """Create the arxiv-papers index with proper mappings.

        :param force: Delete and recreate the index when it already exists
        :returns: True when creation was acknowledged; False when the index
            already exists and ``force`` is False, or when the request failed
        """
        try:
            # Check if index exists
            if self.client.indices.exists(index=self.index_name):
                if not force:
                    return False
                self.client.indices.delete(index=self.index_name)
            # Create index with mappings
            response = self.client.indices.create(index=self.index_name, body=ARXIV_PAPERS_MAPPING)
            return response.get("acknowledged") is True
        except Exception:
            # RequestError is an Exception subclass, so one handler covers both.
            # TODO: raise/log a proper error instead of only returning False
            return False

    def index_paper(self, paper_data: Dict[str, Any]) -> bool:
        """Index a single paper document.

        :param paper_data: Document body; its ``arxiv_id`` is used as document id
        :returns: True when the document was created or updated; False on any
            error, including a missing ``arxiv_id``
        """
        try:
            # Ensure required fields - if "arxiv_id" not in paper_data:
            # Add timestamps in paper_data if not present
            # Convert authors list to string if needed
            response = self.client.index(
                index=self.index_name,
                id=paper_data["arxiv_id"],
                body=paper_data,
                refresh=True,  # Make it immediately searchable
            )
            return response.get("result") in ("created", "updated")
        except Exception:
            # TODO: raise/log a proper error instead of only returning False
            return False

    def bulk_index_papers(self, papers: List[Dict[str, Any]]) -> Dict[str, int]:
        """Index several papers one by one and count the outcomes.

        :param papers: Documents to index via ``index_paper``
        :returns: ``{"success": n, "failed": m}``
        """
        results = {"success": 0, "failed": 0}
        for paper in papers:
            outcome = "success" if self.index_paper(paper) else "failed"
            results[outcome] += 1
        return results

    def search_papers(
        self,
        query: str,
        size: int = 10,
        from_: int = 0,
        fields: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        track_total_hits: bool = True,
        latest_papers: bool = False,
    ) -> Dict[str, Any]:
        """Search papers using BM25 scoring with query builder.

        All arguments are forwarded to ``PaperQueryBuilder``.

        :returns: ``{"total": ..., "hits": [...]}`` on success, or the same
            shape with zero results plus an ``"error"`` message on failure
        """
        try:
            query_builder = PaperQueryBuilder(
                query=query,
                size=size,
                from_=from_,
                fields=fields,
                categories=categories,
                track_total_hits=track_total_hits,
                latest_papers=latest_papers,
            )
            search_body = query_builder.build()
            response = self.client.search(index=self.index_name, body=search_body)
            results = {"total": response["hits"]["total"]["value"], "hits": []}
            # populates hits from processing each hit from response["hits"]["hits"]
            return results
        except Exception as e:
            # NotFoundError is an Exception subclass, so one handler covers both.
            # TODO: filter/normalize the error text taken from e as needed
            return {"total": 0, "hits": [], "error": str(e)}

    def get_index_mapping(self) -> Optional[Dict[str, Any]]:
        """Get the ``mappings`` section of this index.

        :returns: The mappings dict, ``{}`` when the index is absent from the
            response, or None when the request raised
        """
        try:
            mappings = self.client.indices.get_mapping(index=self.index_name)
            # Extract just the mappings from the nested per-index structure
            if mappings and self.index_name in mappings:
                return mappings[self.index_name].get("mappings", {})
            return {}
        except Exception:
            return None

    def get_index_settings(self) -> Optional[Dict[str, Any]]:
        """Get the ``settings`` section of this index.

        Mirrors ``get_index_mapping`` but calls ``indices.get_settings``.

        :returns: The settings dict, ``{}`` when the index is absent from the
            response, or None when the request raised
        """
        try:
            settings = self.client.indices.get_settings(index=self.index_name)
            if settings and self.index_name in settings:
                return settings[self.index_name].get("settings", {})
            return {}
        except Exception:
            return None

    def get_index_stats(self) -> Dict[str, Any]:
        """Get statistics about the index."""
        # NOTE(review): body not shown in this excerpt.

    def get_cluster_info(self) -> Optional[Dict[str, Any]]:
        """Get OpenSearch cluster information."""
        # NOTE(review): body not shown in this excerpt.

    def get_cluster_health(self) -> Optional[Dict[str, Any]]:
        """Get detailed cluster health information."""
        # NOTE(review): body not shown in this excerpt.
```