# This notebook contains functional tests for the backend API of the document ingestion and retrieval system.

In [1]:
import os

import requests


In [None]:
# Base directory for the local test files used by this notebook. The path is
# relative, so it is resolved against the kernel's current working directory.
DATA_DIR = "../data/"

In [None]:
# Upload a sample PDF to the /ingest endpoint as a multipart form and report
# whether the backend answered with HTTP 200.
url = "http://localhost:8000/ingest"
# The sub-path must be relative: os.path.join throws away every earlier
# component as soon as a later one is absolute, so a leading "/" here would
# silently drop DATA_DIR and point at the filesystem root instead.
file_path = os.path.join(DATA_DIR, "testPDFs", "BERT.pdf")

# We open the file in binary mode. The request is issued inside the `with`
# block because the handle has to stay open while the upload is performed.
with open(file_path, "rb") as f:
    files = {"file": f}
    # We send the model choice as form data
    data = {"model_name": "bert"}

    print(f"üì§ Uploading {file_path}...")
    response = requests.post(url, files=files, data=data)

# Anything other than 200 is reported together with the raw response body.
if response.status_code == 200:
    print("‚úÖ Ingestion Success!")
    print(response.json())
else:
    print(f"‚ùå Error {response.status_code}: {response.text}")

In [None]:
# Send a free-text query to the /query endpoint and print a short preview of
# every match that comes back.
url = "http://localhost:8000/query"
payload = dict(
    query_text="What are the architecture details of the transformer?",
    n_results=2,
    model_name="bert",
)

print(f"üîç Searching for: '{payload['query_text']}'")
response = requests.post(url, json=payload)

if response.status_code != 200:
    # Failure path: show the status code and the raw body returned by the API.
    print(f"‚ùå Error {response.status_code}: {response.text}")
else:
    data = response.json()
    # A response without a "results" key is treated as zero matches.
    results = data.get("results", [])

    print(f"‚úÖ Found {len(results)} matches:\n")
    for hit in results:
        print(f"üìÑ Doc ID: {hit['doc_id']}")
        print(f"üìä Score:  {hit['score']:.4f}")
        # Only the first 100 characters of the content are shown.
        print(f"üìù Content: {hit['content'][:100]}...")
        print("-" * 30)

In [None]:
# Ask the /delete endpoint to remove two documents, addressed by ID, for the
# "bert" model.
# NOTE(review): the IDs printed by the /list-ids cell in this notebook start
# with "arXiv:" while these start with "arXiv_" -- confirm they match the IDs
# the backend actually stores.
url = "http://localhost:8000/delete"
payload = dict(
    model_name="bert",
    doc_ids=[
        "arXiv_1706.03762v7#Introduction",
        "arXiv_1706.03762v7#Conclusion",
    ],
)
# Left as the last expression so the notebook displays the Response object.
requests.post(url, json=payload)

In [32]:
# Call the /reset endpoint for the "bert" model to reset its entire collection.
# NOTE(review): presumably destructive -- when the notebook is run top to
# bottom, later cells may see an empty collection; confirm the intended order.
url = "http://localhost:8000/reset"
payload = dict(model_name="bert")
# Left as the last expression so the notebook displays the Response object.
requests.post(url, json=payload)

<Response [200]>

In [31]:
# Query the /list-ids endpoint for items in the "bert" collection, passing a
# limit of 5 as a URL query parameter, and print the decoded JSON response.
url = "http://localhost:8000/list-ids"
params = dict(model_name="bert", limit=5)

response = requests.get(url, params=params)
print(response.json())

{'model_name': 'bert', 'ids': ['arXiv:1810.04805v2#Preamble', 'arXiv:1810.04805v2#BERT:_Pre-training_of_Deep_Bidirectional_Transform', 'arXiv:1810.04805v2#Abstract', 'arXiv:1810.04805v2#1_Introduction', 'arXiv:1810.04805v2#2_Related_Work'], 'total_in_batch': 5}


In [33]:
# Send one sample sentence to the /debug/embed endpoint and print the
# "dimension" field of the JSON response.
url = "http://localhost:8000/debug/embed"
payload = dict(
    text="The transformer architecture allows for parallelization.",
    model_name="bert",
)
response = requests.post(url, json=payload)
embed_info = response.json()
print(embed_info["dimension"])

768


In [34]:
# Upload a PDF to the /debug/parse-pdf endpoint and print the detected title
# and the names of the sections found in the returned JSON.
url = "http://localhost:8000/debug/parse-pdf"
# Built from DATA_DIR so the fixture location is configured in one place;
# with the notebook's DATA_DIR this resolves to ../data/testPDFs/BERT.pdf.
pdf_path = os.path.join(DATA_DIR, "testPDFs", "BERT.pdf")

# The context manager guarantees the file handle is closed once the upload
# has finished, instead of leaving it open for the lifetime of the kernel.
with open(pdf_path, "rb") as pdf_file:
    files = {'file': pdf_file}
    response = requests.post(url, files=files)

data = response.json()

print("Title detected:", data['metadata_extracted']['title'])
print("Sections found:", list(data['sections'].keys()))

Title detected: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Sections found: ['Preamble', 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding', 'Abstract', '1 Introduction', '2 Related Work', '2.1 Unsupervised Feature-based Approaches', '2.2 Unsupervised Fine-tuning Approaches', '2.3 Transfer Learning from Supervised Data', '3 BERT', '3.1 Pre-training BERT', '3.2 Fine-tuning BERT', '4 Experiments', '4.1 GLUE', '4.2 SQuAD v1.1', '4.3 SQuAD v2.0', '4.4 SWAG', '5 Ablation Studies', '5.1 Effect of Pre-training Tasks', '5.2 Effect of Model Size', '5.3 Feature-based Approach with BERT', '6 Conclusion', 'References', "Appendix for 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'", 'A.1 Illustration of the Pre-training Tasks', 'A.2 Pre-training Procedure', 'A.3 Fine-tuning Procedure', 'A.4 Comparison of BERT, ELMo ,and OpenAI GPT', 'A.5 Illustrations of Fine-tuning on Different Tasks', 'B.1 Deta

In [None]:
# Request paper recommendations from the /recommend endpoint and print the
# title, year, authors and link of each returned paper.
url = "http://localhost:8000/recommend"

# "Attention Is All You Need" (S2 ID) and "BERT" (ArXiv ID) as positive examples
positive_ids = [
    "649def34f8be52c8b66281af98ae884c09aef38b",
    "arXiv:1810.04805",
]
payload = {
    "positive_paper_ids": positive_ids,
    "negative_paper_ids": [],
    "limit": 5,
}

try:
    print("üß† Requesting recommendations...")
    response = requests.post(url, json=payload)
    # Turns any 4xx/5xx status into an HTTPError handled below.
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    print(f"‚ùå API Error: {e}")
    print(response.text)
else:
    data = response.json()

    print(f"\n‚úÖ Received {len(data['recommendations'])} recommendations:\n")

    for paper in data['recommendations']:
        print(f"üìÑ {paper['title']} ({paper['year']})")
        author_names = [author['name'] for author in paper['authors']]
        print(f"   Authors: {', '.join(author_names)}")
        print(f"   Link: {paper['url']}")
        print("-" * 40)

In [None]:
# Look a paper up by title through the /paper/search endpoint and report the
# "paperId" field of the JSON response, if there is one.
url = "http://localhost:8000/paper/search"
title_query = "Attention Is All You Need"
# Sent as URL query parameters
params = dict(query=title_query)

try:
    print(f"üîé Searching for: '{title_query}'...")
    response = requests.get(url, params=params)
    # Turns any 4xx/5xx status into an exception handled below.
    response.raise_for_status()

    data = response.json()
    paper_id = data.get("paperId")

    # A missing or empty "paperId" is reported as "not found".
    if not paper_id:
        print("‚ö†Ô∏è No paper found with that title.")
    else:
        print(f"‚úÖ Found Paper ID: {paper_id}")
except requests.exceptions.RequestException as e:
    print(f"‚ùå API Request Failed: {e}")
