In [95]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from langchain.docstore.document import Document

In [2]:
import advertools as adv
sitemap = adv.sitemap_to_df("https://www.brandeis.edu/sitemap.xml")
urls = sitemap['loc'].tolist()

2025-02-23 15:24:28,440 | INFO | sitemaps.py:623 | sitemap_to_df | Getting https://www.brandeis.edu/sitemap.xml


In [60]:
def fetch(url):
    try:
        response = requests.get(url, timeout=10)
        
        if response.status_code == 404:
            print(f"404 Not Found: {url}")
            return None
        
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find(id="skip-content")
        return Document(page_content=content.get_text(strip=True) if content else None, metadata={"source": url, 'title': soup.title.get_text()})
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

In [None]:
def scrape_in_batches(urls, batch_size=50):
    documents = []
    for i in range(0, len(urls), batch_size):
        print(f"Scraping batch {i // batch_size + 1} / {len(urls) // batch_size + 1}...")
        batch = urls[i:i + batch_size]
        batch_results = [fetch(url) for url in batch]
        documents.extend([doc for doc in batch_results if doc is not None])
    
    print("Scraping complete.")
    return documents

In [None]:
documents = scrape_in_batches(urls, batch_size=20)

Scraping batch 1 / 810...
Scraping batch 2 / 810...
Scraping batch 3 / 810...
Scraping batch 4 / 810...
Error fetching https://www.brandeis.edu/officesdir/internal.html: 1 validation error for Document
page_content
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error fetching https://www.brandeis.edu/rose/search/index.html: 1 validation error for Document
page_content
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Scraping batch 5 / 810...
Scraping batch 6 / 810...
Scraping batch 7 / 810...
Scraping batch 8 / 810...
Scraping batch 9 / 810...
Scraping batch 10 / 810...
Scraping batch 11 / 810...
Error fetching https://www.brandeis.edu/rose/programs/2022/secure/alok-encore-presentation.html: 1 validation error for Document
page_content
  Inpu

In [117]:
def remove_non_utf8_from_docs(docs):
    clean_docs = [
        Document(
            page_content=doc.page_content.encode("utf-8", "ignore").decode("utf-8"),
            metadata=doc.metadata
        )
        for doc in docs
    ]
    return clean_docs

In [118]:
documents = remove_non_utf8_from_docs(documents)

In [74]:
import json

In [121]:
def write_to_file(docs, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for doc in docs:
            json_record = {
                "page_content": doc.page_content,
                "metadata": doc.metadata
            }
            f.write(json.dumps(json_record, ensure_ascii=False) + "\n")

In [122]:
import json
from langchain.schema import Document

def load_from_file(filename):
    documents = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            json_record = json.loads(line.strip())
            documents.append(Document(page_content=json_record["page_content"], metadata=json_record["metadata"]))
    return documents


In [123]:
write_to_file(documents, 'documents.jsonl')

In [None]:
import re
def format_numeric_data(text: str) -> str:
    return re.sub(r"\d+(\.\d+)?", lambda m: f" {m.group()} ", text)

In [88]:
for doc in documents:
    doc.page_content = format_numeric_data(doc.page_content)

In [124]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)
splits = text_splitter.split_documents(documents)

In [126]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key=os.getenv("OPENAI_API_KEY"))
url = "9cb98d40-7c3b-441b-9b6d-4f1ac13eb1fa.europe-west3-0.gcp.cloud.qdrant.io"

In [128]:
from langchain_qdrant import QdrantVectorStore

In [131]:
qdrant = QdrantVectorStore.from_documents(
    splits,
    embeddings,
    url=url,
    prefer_grpc=True,
    api_key=os.getenv("QDRANT_CLUSTER_KEY"),
    collection_name="brandeis.edu",
)

2025-02-23 19:23:26,752 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-23 19:23:28,294 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-23 19:23:29,889 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-23 19:23:31,360 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-23 19:23:32,757 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-23 19:23:33,516 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-23 19:23:34,651 | INFO | _client.py:1038 | _send_single_request | HTTP Request: POST https://api.openai.