## src/main.py

```python
from src.config import get_settings
from src.db.factory import make_database

# from src.services.arxiv.factory import make_arxiv_client
# from src.services.pdf_parser.factory import make_pdf_parser_service

# from src.routers import papers

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the shared resources for the lifetime of the application.

    Startup (before ``yield``): stores the settings, the database, the arXiv
    client and the PDF parser service on ``app.state``.
    Shutdown (after ``yield``): tears the database down.

    Args:
        app: Application whose ``state`` receives the shared objects.
    """
    app.state.settings = get_settings()
    database = make_database()
    try:
        app.state.database = database
        app.state.arxiv_client = make_arxiv_client()
        app.state.pdf_parser = make_pdf_parser_service()

        yield
    finally:
        # Runs even when a later startup step fails or an exception is thrown
        # into the generator at ``yield``, so the database is always released.
        database.teardown()

# Include routers: mount each router on the application under the shared
# "/api/v1" prefix. `app` is the FastAPI application object of this module
# (its creation is not shown in this excerpt).
app.include_router(ping.router, prefix="/api/v1")
app.include_router(papers.router, prefix="/api/v1")

```

## src/routers/

**- src/routers/papers.py**

```routers -> schema & repositories```

### src/routers/papers.py

```routers -> model from schemas.arxiv.paper,repositories.paper,models.paper```

#### Imports

```python
from src.schemas.arxiv.paper import PaperResponse, PaperSearchResponse

"""
schemas/arxiv/paper.py : contains following Model(s)

class ArxivPaper(BaseModel):
    ...
class PaperBase(BaseModel):
    ...
class PaperCreate(PaperBase):
    ...
class PaperResponse(PaperBase):
    ...
class PaperSearchResponse(BaseModel):
    ...
"""
from src.dependencies import SessionDep
"""
dependencies -> call : request.app.state.database

Get settings from the request state.

"""

from src.repositories.paper import PaperRepository

```

##### src/repositories/paper.py

```python
from src.models.paper import Paper

class Paper(Base):
    """ORM model mapped to the ``papers`` table (columns partly elided with ``...``)."""
    __tablename__ = "papers"

    # Core arXiv metadata
    # Primary key; defaults to a fresh uuid.uuid4() value.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # arXiv identifier: required, unique and indexed.
    arxiv_id = Column(String, unique=True, nullable=False, index=True)
    ...
    # Parsed PDF content (added for comprehensive storage)
    # PDF processing metadata
    # Timestamps
    # Both default to the current UTC time; updated_at is also refreshed on update.
    # NOTE(review): the defaults are timezone-aware values but the columns are
    # declared as plain DateTime -- confirm whether DateTime(timezone=True) is intended.
    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), onupdate=lambda: datetime.now(timezone.utc))

```

```python
# from src.models.paper import Paper
# from src.schemas.arxiv.paper import PaperCreate

class PaperRepository:
    """Data-access layer for ``Paper`` rows on top of a SQLAlchemy ORM session.

    Outline only: bodies reduced to a placeholder comment are elided in these notes.
    """
    def __init__(self, session: Session):
        """Keep the session that every query of this repository runs on."""
        self.session = session # Session type comes from sqlalchemy.orm
    def create(self, paper: PaperCreate) -> Paper:
        """Build a ``Paper`` from the dumped fields of ``paper`` and return it."""
        db_paper = Paper(**paper.model_dump())
        # add to session -> commit -> refresh (elided)
        return db_paper
    def get_by_arxiv_id(self, arxiv_id: str) -> Optional[Paper]:
        """Return the single result of the arXiv-ID lookup, or None when there is none."""
        # query stmt (elided: builds `stmt`)
        return self.session.scalar(stmt)
    def get_by_id(self, paper_id: UUID) -> Optional[Paper]:
        """Look a paper up by ``paper_id``; body elided."""
        # similar to get_by_arxiv_id
    def get_all(self, limit: int = 100, offset: int = 0) -> List[Paper]:
        """Return one page of papers ordered by ``published_date``, newest first."""
        stmt = select(Paper).order_by(Paper.published_date.desc()).limit(limit).offset(offset)
        return list(self.session.scalars(stmt))
    def get_count(self) -> int:
        """Return the number of papers; a falsy query result is reported as 0."""
        stmt = select(func.count(Paper.id))
        return self.session.scalar(stmt) or 0

    def get_processed_papers(self, limit: int = 100, offset: int = 0) -> List[Paper]:
        """Get papers that have been successfully processed with PDF content."""
        # Most recently processed first.
        stmt = (
            select(Paper)
            .where(Paper.pdf_processed == True)
            .order_by(Paper.pdf_processing_date.desc())
            .limit(limit)
            .offset(offset)
        )
        return list(self.session.scalars(stmt))

    def get_unprocessed_papers(self, limit: int = 100, offset: int = 0) -> List[Paper]:
        """Get papers that haven't been processed for PDF content yet."""
        # Most recently published first.
        stmt = select(Paper).where(Paper.pdf_processed == False).order_by(Paper.published_date.desc()).limit(limit).offset(offset)
        return list(self.session.scalars(stmt))

    def get_papers_with_raw_text(self, limit: int = 100, offset: int = 0) -> List[Paper]:
        """Get papers that have raw text content stored.

        I.e. rows where Paper.raw_text is not None (query elided in these notes).
        """
    def get_processing_stats(self) -> dict:
        """Return the paper counts plus two percentage rates.

        ``processing_rate`` is processed / total * 100 and
        ``text_extraction_rate`` is with-text / processed * 100; each rate is 0
        when its denominator is not positive.
        """
        # Count processed papers (elided)
        # Count papers with text (elided)
        return {
            "total_papers": total_papers,
            "processed_papers": processed_papers,
            "papers_with_text": papers_with_text,
            "processing_rate": (processed_papers / total_papers * 100) if total_papers > 0 else 0,
            "text_extraction_rate": (papers_with_text / processed_papers * 100) if processed_papers > 0 else 0,
        }
    def update(self, paper: Paper) -> Paper:
        """Persist changes made to ``paper`` and return the same object."""
        # add to session -> commit -> refresh (elided)
        return paper

    def upsert(self, paper_create: PaperCreate) -> Paper:
        """Update the paper with the same arXiv ID if one exists, otherwise create it."""
        # Check if paper already exists
        existing_paper = self.get_by_arxiv_id(paper_create.arxiv_id)
        if existing_paper:
            # Update existing paper with new content; exclude_unset=True means only
            # fields explicitly set on paper_create overwrite stored values.
            for key, value in paper_create.model_dump(exclude_unset=True).items():
                setattr(existing_paper, key, value)
            return self.update(existing_paper)
        else:
            # Create new paper
            return self.create(paper_create)

```

#### Original

```python
# from src.dependencies import SessionDep
# from src.repositories.paper import PaperRepository
# from src.schemas.arxiv.paper import PaperResponse, PaperSearchResponse

# Router for the paper endpoints: every route registered on it is served under
# the "/papers" prefix and grouped under the "papers" tag.
router = APIRouter(prefix="/papers", tags=["papers"])

@router.get("/", response_model=PaperSearchResponse)
def list_papers(
    db: SessionDep,
    limit: int = Query(default=10, ge=1, le=100, description="Number of papers to return (1-100)"),
    offset: int = Query(default=0, ge=0, description="Number of papers to skip"),
) -> PaperSearchResponse:
    """Get a list of papers with pagination."""
    repository = PaperRepository(db)

    # One page of rows, selected by limit/offset.
    page_rows = repository.get_all(limit=limit, offset=offset)

    # The overall count does not depend on limit/offset; it is returned next to
    # the page as pagination info.
    total_count = repository.get_count()

    # Convert every ORM row into its response schema before building the payload.
    serialized = [PaperResponse.model_validate(row) for row in page_rows]
    return PaperSearchResponse(papers=serialized, total=total_count)

@router.get("/{arxiv_id}", response_model=PaperResponse)
def get_paper_details(
    db: SessionDep,
    # `pattern` replaces the deprecated `regex` keyword; the expression itself is
    # unchanged: four digits, a dot, four or five digits, optional "v<number>".
    arxiv_id: str = Path(
        ..., description="arXiv paper ID (e.g., '2401.00001' or '2401.00001v1')", pattern=r"^\d{4}\.\d{4,5}(v\d+)?$"
    ),
) -> PaperResponse:
    """Get details of a specific paper by arXiv ID."""
    paper_repo = PaperRepository(db)
    paper = paper_repo.get_by_arxiv_id(arxiv_id)
    # paper not found (lookup returned None) -> HTTPException(status_code=404);
    # this branch is elided in these notes and must run before the validation below.
    return PaperResponse.model_validate(paper)
```

## src/services/

**- src/services/arxiv/factory.py**

**- src/services/pdf_parser/factory.py**

```arxiv service -> own factory methods```

```pdf_parser service -> own factory methods```

## src/services/arxiv/

**- src/services/arxiv/factory.py**

**- src/services/arxiv/client.py**

```arxiv service -> factory method -> client```

### src/services/arxiv/factory.py

```python

# from .client import ArxivClient

def make_arxiv_client() -> ArxivClient:
    """Factory function to create an arXiv client instance.

    Returns:
        A new ``ArxivClient`` configured with the ``arxiv`` section of the
        application settings.
    """
    # Imported locally so this excerpt is self-contained; same accessor that
    # src/main.py and the PDF parser factory use.
    from src.config import get_settings

    # `settings` was previously read without ever being bound in this function.
    settings = get_settings()
    return ArxivClient(settings=settings.arxiv)
```

### src/services/arxiv/client.py

```arxiv service -> model from schemas.arxiv.paper```

#### Imports

```python

from src.schemas.arxiv.paper import ArxivPaper

"""
schemas/arxiv/paper.py : contains following Model(s)

class ArxivPaper(BaseModel):
    ...
class PaperBase(BaseModel):
    ...
class PaperCreate(PaperBase):
    ...
class PaperResponse(PaperBase):
    ...
class PaperSearchResponse(BaseModel):
    ...

"""
```

#### Original

```python
# from src.schemas.arxiv.paper import ArxivPaper

class ArxivClient:
    """Client for fetching papers from arXiv API."""
    def __init__(self, settings: ArxivSettings):
        self._settings = settings
        self._last_request_time: Optional[float] = None

    @cached_property
    def pdf_cache_dir(self) -> Path:
         """PDF cache directory."""
    # getter methods as @property
    async def fetch_papers(
        self,
        max_results: Optional[int] = None,
        start: int = 0,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
        from_date: Optional[str] = None,
        to_date: Optional[str] = None,
    ) -> List[ArxivPaper]:
        """
        Fetch papers from arXiv for the configured category."""
        # Build basic search query
        # Add date filtering if provided - arXiv format
        # add additional params in basic search query
        try:
            # Add rate limiting delay between all requests (arXiv recommends 3 seconds)
            if self._last_request_time is not None:
                time_since_last = time.time() - self._last_request_time
                if time_since_last < self.rate_limit_delay:
                    sleep_time = self.rate_limit_delay - time_since_last
                    await asyncio.sleep(sleep_time)
            self._last_request_time = time.time()
            async with httpx.AsyncClient(timeout=self.timeout_seconds) as client:
                response = await client.get(url)
                # get response_code,xml_data
            papers = self._parse_response(xml_data)
            return papers
        except (httpx.TimeoutException,httpx.HTTPStatusError) as e:
            # raise proper exception

    def _parse_response(self, xml_data: str) -> List[ArxivPaper]:
        """
        Parse arXiv API XML response into ArxivPaper objects."""
        try:
            root = ET.fromstring(xml_data)
            entries = root.findall("atom:entry", self.namespaces)

            papers = []
            for entry in entries:
                paper = self._parse_single_entry(entry)
                if paper:
                    papers.append(paper)

            return papers
        except (ET.ParseError,Exception) as e:
            # raise proper exception
    
     def _parse_single_entry(self, entry: ET.Element) -> Optional[ArxivPaper]:
        """Parse a single entry from arXiv XML response."""
        try:
            # get the values
            return ArxivPaper(
                arxiv_id=arxiv_id,
                title=title,
                authors=authors,
                abstract=abstract,
                published_date=published,
                categories=categories,
                pdf_url=pdf_url,
            )
        except Exception as e:
            return None

    def _get_pdf_url(self, entry: ET.Element) -> str:
        """Extract PDF URL from entry links."""
        for link in entry.findall("atom:link", self.namespaces):
            if link.get("type") == "application/pdf":
                url = link.get("href", "")
                # Convert HTTP to HTTPS for arXiv URLs
                if url.startswith("http://arxiv.org/"):
                    url = url.replace("http://arxiv.org/", "https://arxiv.org/")
                return url
        return ""
    
    async def download_pdf(self, paper: ArxivPaper, force_download: bool = False) -> Optional[Path]:
        """Download PDF for a given paper to local cache."""
        # null check -> paper.pdf_url
        pdf_path = self._get_pdf_path(paper.arxiv_id)
        # Return cached PDF -> pdf_path.exists()
        if await self._download_with_retry(paper.pdf_url, pdf_path):
            return pdf_path
        else:
            return None

    async def _download_with_retry(self, url: str, path: Path, max_retries: Optional[int] = None) -> bool:
        """Download a file with retry logic."""
        # Respect rate limits
        await asyncio.sleep(self.rate_limit_delay)
        for attempt in range(max_retries):
            try:
                async with httpx.AsyncClient(timeout=float(self.timeout_seconds)) as client:
                    async with client.stream("GET", url) as response:
                        response.raise_for_status()
                        with open(path, "wb") as f:
                            async for chunk in response.aiter_bytes():
                                f.write(chunk)
                return True
            except httpx.TimeoutException as e:
                if attempt < max_retries - 1:
                    wait_time = self._settings.download_retry_delay_base * (attempt + 1)
                    await asyncio.sleep(wait_time)
                else:
                    # raise Timeout
            except httpx.HTTPError as e:
                if attempt < max_retries - 1:
                    wait_time = self._settings.download_retry_delay_base * (attempt + 1)  # Exponential backoff
                    await asyncio.sleep(wait_time)
                else:
                    # raise DownloadError
            except Exception as e:
                # raise DownloadError

        # Clean up partial download
        if path.exists():
            path.unlink()

        return False
    
    
    async def fetch_papers_with_query(
        self,
        search_query: str,
        max_results: Optional[int] = None,
        start: int = 0,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
    ) -> List[ArxivPaper]:
        """same as fetch_papers() but with different query""" 


    async def fetch_paper_by_id(self, arxiv_id: str) -> Optional[ArxivPaper]:
        """same as fetch_papers() but less query & no ae limiting"""

```

## src/services/pdf_parser/

**- src/services/pdf_parser/factory.py**

**- src/services/pdf_parser/parser.py**

**- src/services/pdf_parser/docling.py**

```pdf_parser service -> factory method -> parser & docling```

### src/services/pdf_parser/factory.py

```python
# from .parser import PDFParserService


@lru_cache(maxsize=1)
def make_pdf_parser_service() -> PDFParserService:
    """Build the Docling-backed PDF parser service, memoized by ``lru_cache``.

    The four parser options are read from the ``pdf_parser`` section of the
    application settings.

    Returns:
        The cached ``PDFParserService`` instance.
    """
    parser_settings = get_settings().pdf_parser
    service_options = {
        "max_pages": parser_settings.max_pages,
        "max_file_size_mb": parser_settings.max_file_size_mb,
        "do_ocr": parser_settings.do_ocr,
        "do_table_structure": parser_settings.do_table_structure,
    }
    return PDFParserService(**service_options)

def reset_pdf_parser() -> None:
    """Drop the memoized PDF parser service.

    Clears the ``lru_cache`` wrapped around ``make_pdf_parser_service`` so the
    next factory call constructs a new service. Handy in tests or after the
    configuration has changed.
    """
    make_pdf_parser_service.cache_clear()
```

### src/services/pdf_parser/parser.py

``` pdf_parser service -> schemas.pdf_parser model & DoclingParser from docling```

#### Imports

```python

from src.schemas.pdf_parser.models import PdfContent

"""
class ParserType(str, Enum):
    DOCLING = "docling"

class PaperSection(BaseModel):
    ...
class PaperFigure(BaseModel):
    ...
class PaperTable(BaseModel):
    ...
class PdfContent(BaseModel):
    ...
class ArxivMetadata(BaseModel):
    ...
class ParsedPaper(BaseModel):
    ...
"""

from .docling import DoclingParser

class DoclingParser:
    """Reference excerpt of the parser imported from ``.docling`` above (rest elided)."""
    def __init__(self, max_pages: int, max_file_size_mb: int, do_ocr: bool = False, do_table_structure: bool = True):
        # Configure pipeline options (arguments elided)
        pipeline_options = PdfPipelineOptions(...)
        # Converter set up for PDF input with the pipeline options above.
        self._converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)})
        ...

```

#### Original

```python
# from src.schemas.pdf_parser.models import PdfContent
# from .docling import DoclingParser

class PDFParserService:
    """Service wrapper that delegates PDF parsing to a ``DoclingParser``.

    Outline only: branches reduced to a placeholder comment are elided in these notes.
    """
    def __init__(self, max_pages: int = 20, max_file_size_mb: int = 20,do_ocr: bool = False, do_table_structure: bool = True):
        """Create the underlying ``DoclingParser`` with the given limits and options."""
        self.docling_parser = DoclingParser(
            max_pages=max_pages, max_file_size_mb=max_file_size_mb, do_ocr=do_ocr, do_table_structure=do_table_structure
        )

    async def parse_pdf(self, pdf_path: Path) -> Optional[PdfContent]:
        """Parse ``pdf_path`` with the Docling parser and return its truthy result."""
        # raise error if not pdf_path.exists()
        try:
            result = await self.docling_parser.parse_pdf(pdf_path)
            if result:
                return result
            else:
               # raise error
        # Validation and parsing errors propagate to the caller unchanged.
        except (PDFValidationError, PDFParsingException):
            raise
        except Exception as e:
            # raise error


```

### src/services/pdf_parser/docling.py

```pdf_parser service -> model from schemas.pdf_parser.models```

#### Imports

```python
# docling.py -> schemas/pdf_parser/models

from src.schemas.pdf_parser.models import PaperFigure, PaperSection, PaperTable, ParserType, PdfContent

"""
class ParserType(str, Enum):
    DOCLING = "docling"

class PaperSection(BaseModel):
    ...
class PaperFigure(BaseModel):
    ...
class PaperTable(BaseModel):
    ...
class PdfContent(BaseModel):
    ...
class ArxivMetadata(BaseModel):
    ...
class ParsedPaper(BaseModel):
    ...
"""

from .parser import PDFParserService

class PDFParserService:
    """Reference excerpt of the service imported from ``.parser`` above (bodies elided)."""
    def __init__(self, max_pages: int, max_file_size_mb: int, do_ocr: bool = False, do_table_structure: bool = True):
        # Constructor arguments of DoclingParser are elided here.
        self.docling_parser = DoclingParser(...)

    async def parse_pdf(self, pdf_path: Path) -> Optional[PdfContent]:
        """Parse PDF using Docling parser only."""
        # result = await self.docling_parser.parse_pdf(pdf_path)
```

#### Original

```python
# from src.schemas.pdf_parser.models import PaperFigure, PaperSection, PaperTable, ParserType, PdfContent
import pypdfium2 as pdfium
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

class DoclingParser:
    def __init__(self, max_pages: int, max_file_size_mb: int, do_ocr: bool = False, do_table_structure: bool = True):
        # Configure pipeline options
        pipeline_options = PdfPipelineOptions(...)
        self._converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)})
        self._warmed_up = False
        self.max_pages = max_pages
        self.max_file_size_bytes = max_file_size_mb * 1024 * 1024

    def _warm_up_models(self):
        """Pre-warm the models with a small dummy document to avoid cold start."""
        if not self._warmed_up:
            # This happens only once per DoclingParser instance
            self._warmed_up = True

    def _validate_pdf(self, pdf_path: Path) -> bool:
        """Comprehensive PDF validation including size and page limits."""
        try:
            # Check file exists and is not empty
            file_size = pdf_path.stat().st_size
            # if file_size > self.max_file_size_bytes: log warning,raiser error
            # Check if file starts with PDF header
            with open(pdf_path, "rb") as f:
                header = f.read(8)
                if not header.startswith(b"%PDF-"):
                   # rasie error
            # Check page count limit
            pdf_doc = pdfium.PdfDocument(str(pdf_path))
            actual_pages = len(pdf_doc)
            pdf_doc.close()
            # if actual_pages > self.max_pages: log warning,raiser error
            return True

        except (PDFValidationError,Exception) as e:
            # raise proper error

    async def parse_pdf(self, pdf_path: Path) -> Optional[PdfContent]:
        """Parse PDF using Docling parser.
        Limited to 20 pages to avoid memory issues with large papers."""
        self._validate_pdf(pdf_path)
        self._warm_up_models()
        # Limit processing to avoid memory issues with large papers
        result = self._converter.convert(str(pdf_path), max_num_pages=self.max_pages, max_file_size=self.max_file_size_bytes)
        doc = result.document
        sections = []
        current_section = {"title": "Content", "content": ""}
        for element in doc.texts:
            if hasattr(element, "label") and element.label in ["title", "section_header"]:
                # Save previous section if it has content
                if current_section["content"].strip():
                    sections.append(PaperSection(title=current_section["title"], content=current_section["content"].strip()))
                # Start new section
                current_section = {"title": element.text.strip(), "content": ""}
            else:
                # Add content to current section
                if hasattr(element, "text") and element.text:
                    current_section["content"] += element.text + "\n"
        # Add final section
        if current_section["content"].strip():
            sections.append(PaperSection(title=current_section["title"], content=current_section["content"].strip()))

        return PdfContent(
            sections=sections,
            figures=[],  # Removed: basic metadata not useful
            tables=[],  # Removed: basic metadata not useful
            raw_text=doc.export_to_text(),
            references=[],
            parser_used=ParserType.DOCLING,
            metadata={"source": "docling", "note": "Content extracted from PDF, metadata comes from arXiv API"},
        )
    except PDFValidationError as error_msg:
        # filter & display error_msg - "too large","too many pages"
    except Exception as error_msg:
        # filter & dispaly error_msg - "not valid","timeout","memory","max_num_pages","other"
```