In [5]:
import xml.etree.ElementTree as ET

# Load local XML file
tree = ET.parse("sitemap.xml")
root = tree.getroot()

# Extract all URLs.
# A <loc> element can be empty (its .text is None) or, in a pretty-printed
# sitemap, padded with whitespace/newlines. Normalise each value and drop
# blanks so only usable URL strings reach the later cells.
urls = []
for url in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
    loc_text = (url.text or "").strip()
    if loc_text:
        urls.append(loc_text)

print(f"✅ Total URLs extracted: {len(urls)}")


✅ Total URLs extracted: 500


In [6]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pickle

# Change this if needed
headers = {"User-Agent": "Mozilla/5.0"}

def extract_article_data(url):
    """Download one page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys "url", "title" and "content", or None when the
        page could not be fetched or parsed. "title" is an empty string when
        the page has no <title> element. "content" is the space-joined text
        of the <p> tags inside <article> when that element exists, otherwise
        of every <p> tag on the page.

    This function is best-effort: it never raises. Each failure is reported
    through tqdm.write (so the progress bar is not corrupted) and the page is
    skipped by returning None.
    """
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")

        # A missing <title> must not discard an otherwise usable page.
        title_tag = soup.find("title")
        title = title_tag.text.strip() if title_tag is not None else ""

        # Try common content selectors — adapt as needed
        article = soup.find("article")
        if article:
            content = " ".join(p.text for p in article.find_all("p"))
        else:
            content = " ".join(p.text for p in soup.find_all("p"))

        return {
            "url": url,
            "title": title,
            "content": content.strip()
        }
    except Exception as exc:
        # Keep the scrape going, but say why this page was dropped instead of
        # hiding the failure.
        tqdm.write(f"Skipped {url}: {exc!r}")
        return None

# Scrape every address in `urls` (built from the sitemap XML earlier in the
# notebook) and keep only the pages that yielded non-empty text.
scraped_data = []

for url in tqdm(urls):
    data = extract_article_data(url)
    # Failed fetches come back as None; pages without text have empty content.
    if not data or not data.get("content"):
        continue
    scraped_data.append(data)

# Persist the collected records so later cells can reload them without re-scraping
with open("scraped_data.pkl", "wb") as f:
    pickle.dump(scraped_data, f)

print(f"✅ Done: {len(scraped_data)} articles saved to scraped_data.pkl")


100%|█████████████████████████████████████████| 500/500 [13:53<00:00,  1.67s/it]

✅ Done: 472 articles saved to scraped_data.pkl





In [7]:
import pickle
from llama_index.core import Document

# Read back the records written to scraped_data.pkl by the scraping step.
# pickle can execute code on load, so only open files produced by this notebook.
with open("scraped_data.pkl", "rb") as f:
    raw_data = pickle.load(f)

# Wrap each record that has text in a LlamaIndex Document, carrying the
# title and source URL along as metadata.
documents = []
for item in raw_data:
    if not item.get("content"):
        continue
    item_metadata = {"title": item["title"], "url": item["url"]}
    documents.append(Document(text=item["content"], metadata=item_metadata))

print(f"Loaded {len(documents)} documents")


Loaded 472 documents
