In [None]:
# !pip install pymongo
# !pip install pymupdf
# !pip install sentence-transformers
# !pip install torch --upgrade
# !pip uninstall transformers sentence-transformers -y
# !pip install transformers sentence-transformers
# %pip install torch==1.9.0
# %pip install transformers --upgrade

In [None]:
from urllib.parse import urljoin

import fitz  # PyMuPDF
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [None]:
# Create a client for the MongoDB server at localhost:27017 and print the
# database names it reports, as a quick check that the connection works.
client = MongoClient("mongodb://localhost:27017/")
print(client.list_database_names())

In [None]:
# CEUR-WS volume number to process; change this value to target another volume.
VOLUME_ID = "3500"
# Index page of the selected volume (note the trailing slash).
BASE_URL = f"https://ceur-ws.org/Vol-{VOLUME_ID}/"

In [None]:
def get_pdf_links(volume_url, timeout=30):
    """Collect the URLs of all PDF files linked from a volume index page.

    Args:
        volume_url: URL of the index page to scan (for example ``BASE_URL``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute PDF URLs, in the order the links appear on the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(volume_url, timeout=timeout)
    # Fail loudly instead of parsing an error page and silently returning [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Resolve every href against the URL that was actually fetched rather than
    # the global BASE_URL, so the function honours its argument and absolute or
    # root-relative links are not mangled by plain string concatenation.
    return [
        urljoin(response.url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].endswith(".pdf")
    ]

# Scrape the volume index page for PDF links and report how many were found.
pdf_urls = get_pdf_links(BASE_URL)
print(f"{len(pdf_urls)} PDF found!")

In [None]:
def extract_text_from_pdf(pdf_url, timeout=30):
    """Download a PDF and return its plain text.

    The document is parsed directly from the downloaded bytes, so no temporary
    file is written to the working directory.

    Args:
        pdf_url: URL of the PDF to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of all pages, with consecutive pages joined by a newline.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(pdf_url, timeout=timeout)
    # Without this check an HTML error page would be handed to the PDF parser.
    response.raise_for_status()

    # Open from memory and close the document deterministically when done.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Sample test: download a single paper and preview the extracted text.
# NOTE(review): this URL is hard-coded to Vol-3500 and does not follow VOLUME_ID.
pdf_url = "https://ceur-ws.org/Vol-3500/paper1.pdf"  # A PDF URL from CEUR
sample_text = extract_text_from_pdf(pdf_url)
print(sample_text[:500])  # Show first 500 characters

In [None]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding model into the module-level
# name `model`, which convert_text_to_embeddings reads when encoding text.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("Model uploaded successfully!")



In [None]:
def convert_text_to_embeddings(text):
    """Encode every line of ``text`` with the module-level ``model``.

    The text is split on newline characters and each resulting line (blank
    lines included) is treated as one sentence. The return value is whatever
    ``model.encode`` produces for that list of lines.
    """
    lines = text.split("\n")
    return model.encode(lines)

In [None]:
# Function to save data to MongoDB
def save_embeddings_to_mongodb(embeddings, sentences, db_name="ceur_papers", collection_name="embeddings"):
    """Store each sentence together with its embedding vector in MongoDB.

    One document of the form ``{"text": ..., "embedding": [...]}`` is written
    per embedding; ``sentences[i]`` is paired with ``embeddings[i]``.

    Args:
        embeddings: Sequence of vectors exposing ``.tolist()`` (e.g. numpy rows).
        sentences: Sequence of texts, indexable by the positions of ``embeddings``.
        db_name: Name of the target database.
        collection_name: Name of the target collection.

    Raises:
        IndexError: If ``sentences`` has fewer entries than ``embeddings``.
    """
    # Build all documents first so a mismatch fails before anything is written.
    documents = [
        {
            "text": sentences[i],
            "embedding": embedding.tolist(),  # BSON cannot encode numpy arrays
        }
        for i, embedding in enumerate(embeddings)
    ]
    if not documents:
        # insert_many rejects an empty list; there is nothing to store anyway.
        return

    # The context manager closes the connection opened for this call, and a
    # single insert_many replaces one network round trip per document.
    with MongoClient("mongodb://localhost:27017/") as client:
        client[db_name][collection_name].insert_many(documents)

In [None]:
def process_pdf_and_store(pdf_url):
    """Download a PDF, embed each of its lines and store the results in MongoDB.

    Encoding is done by ``convert_text_to_embeddings``, which uses the
    module-level ``model``; the model is therefore not reloaded here (the
    previous per-call load created an unused local and only cost time).

    Args:
        pdf_url: URL of the PDF to process.
    """
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_url)

    # Convert the text to one embedding per line
    embeddings = convert_text_to_embeddings(text)

    # Split exactly as convert_text_to_embeddings does, so that sentences[i]
    # lines up with embeddings[i] when they are stored.
    save_embeddings_to_mongodb(embeddings, text.split("\n"))

# Run the full pipeline (download, embed, store) for a single paper.
pdf_url = "https://ceur-ws.org/Vol-3500/paper1.pdf"  # A PDF URL from CEUR
process_pdf_and_store(pdf_url)