<a href="https://colab.research.google.com/github/MahsaSetoode/Chatbot/blob/main/webscraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Part 1: Mount Google Drive
import os

from google.colab import drive

# Attach Google Drive to the Colab runtime; everything written below
# /content/drive ends up in the user's Drive.
drive.mount('/content/drive')

# Folder inside Drive that receives the extracted text files.
# It is created on first run and reused (exist_ok=True) afterwards.
output_dir = "/content/drive/MyDrive/extracted_texts"
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory: {output_dir}")


Mounted at /content/drive
Output directory: /content/drive/MyDrive/extracted_texts


In [None]:
# Part 2: Install Required Libraries
!pip install PyPDF2
!pip install beautifulsoup4
!pip install pdfplumber



In [None]:
# Part 3: Scraper Functions
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from io import BytesIO
import PyPDF2

# Root URL of the site being scraped.
# NOTE(review): not referenced by any of the visible cells -- the scraper
# resolves links with urljoin() against each page's own URL. Confirm it is
# unused elsewhere before removing.
BASE_URL = "https://www.bahai.org"

def get_internal_pages(main_url, timeout=30):
    """Collect the sub-page URLs listed in a page's expandable list.

    Downloads ``main_url``, looks up the ``<div>`` with
    ``class_="indent expandablelist"`` and resolves every ``<a href>`` found
    inside it against ``main_url``.

    Args:
        main_url: URL of the index page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        Absolute URLs in document order with repeated links removed; an empty
        list when the container is not present on the page.

    Raises:
        requests.RequestException: on network failure, timeout, or when
            ``raise_for_status()`` rejects the HTTP status code.
    """
    response = requests.get(main_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): a multi-word class_ string is presumably compared with the
    # whole class attribute, so a different class order in the markup would
    # not match -- confirm against the live pages.
    container = soup.find("div", class_="indent expandablelist")
    if container is None:
        return []

    links = [urljoin(main_url, a["href"]) for a in container.find_all("a", href=True)]
    # dict.fromkeys() drops repeated links while keeping first-seen order, so
    # the same sub-page is not visited (and downloaded) twice.
    return list(dict.fromkeys(links))

def get_pdf_links_from_subpage(page_url, timeout=30):
    """Find the 'PDF' download link on a sub-page.

    Downloads ``page_url`` and returns the first ``<a href>`` whose visible
    text contains "pdf" (case-insensitive), resolved against ``page_url``.

    Args:
        page_url: URL of the sub-page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A one-element list with the absolute URL of the first matching link,
        or an empty list when no link text mentions "pdf".

    Raises:
        requests.RequestException: on network failure, timeout, or when
            ``raise_for_status()`` rejects the HTTP status code.
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # An earlier, stricter strategy (walking tr.document-row rows and their
    # li[title="PDF"] entries) was disabled; only this text-based scan over
    # all anchors is active.
    for a in soup.find_all("a", href=True):
        if "pdf" in a.text.lower():
            # Resolve against the page that was actually fetched. The previous
            # code used `subpage_url`, which is not a parameter of this
            # function and only existed as a global leaked by the calling loop.
            return [urljoin(page_url, a["href"])]

    return []

#def extract_text_from_pdf_bytes(pdf_bytes):
 #   text = ""
  #  with BytesIO(pdf_bytes) as f:
   #     reader = PyPDF2.PdfReader(f)
    #    for page in reader.pages:
#            text += page.extract_text() or ""
#    return text

import pdfplumber

def extract_text_from_pdf_bytes(pdf_bytes):
    """Return the text of an in-memory PDF, one trailing newline per page.

    The bytes are wrapped in a BytesIO stream and opened with pdfplumber.
    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are left out of the result.

    Args:
        pdf_bytes: Raw content of a PDF file.

    Returns:
        The concatenated page texts, each followed by "\\n"; an empty string
        when no page produced any text.
    """
    with BytesIO(pdf_bytes) as stream, pdfplumber.open(stream) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Joined while the document is still open, since the generator reads
        # the pages lazily.
        return "".join(chunk + "\n" for chunk in page_texts if chunk)


In [None]:
# 🟦 English writings: index pages handed to get_internal_pages() as main_url
english_urls = [
    "https://www.bahai.org/library/authoritative-texts/bahaullah/",
    "https://www.bahai.org/library/authoritative-texts/the-bab/",
    "https://www.bahai.org/library/authoritative-texts/abdul-baha/",
    "https://www.bahai.org/library/authoritative-texts/shoghi-effendi/",
    # "https://www.bahai.org/library/authoritative-texts/the-universal-house-of-justice/"
]

# 🟨 Arabic writings (currently disabled)
#arabic_urls = [
#   "https://www.bahai.org/ar/library/authoritative-texts/downloads",
#]

# 🟪 Persian writings: same sections as the English list, under the /fa/ path
persian_urls = [
    "https://www.bahai.org/fa/library/authoritative-texts/bahaullah/",
    "https://www.bahai.org/fa/library/authoritative-texts/the-bab/",
    "https://www.bahai.org/fa/library/authoritative-texts/abdul-baha/",
    "https://www.bahai.org/fa/library/authoritative-texts/shoghi-effendi/",

]

# 🗂️ All lists combined in a dict: label -> list of index-page URLs.
# The main script turns each label into an output folder name by replacing
# spaces with underscores, so renaming a key (including fixing the
# "writtings" spelling) changes where the files are written.
# Only the Persian list is active at the moment; english_urls is defined
# above but its entry here is commented out.
language_lists = {
    #"English writtings": english_urls,
    # "Arabic": arabic_urls,
    "Persian writtings": persian_urls
}

In [None]:
# Part 4: Run Main Script and Save Files to Drive
#
# For every language list: visit each index page, follow its sub-pages, find
# the PDF link on each sub-page, extract the PDF text and store it as a .txt
# file in a per-language folder below output_dir. Files already present in
# the folder are skipped, so the cell can be re-run to resume a partial scrape.

# Upper bound (seconds) for a single PDF download, so a stalled connection
# cannot hang the whole run.
PDF_TIMEOUT_SECONDS = 60

for language, url_list in language_lists.items():
    print(f"\n🟢 Starting list: {language}")
    lang_dir = os.path.join(output_dir, language.replace(" ", "_"))
    os.makedirs(lang_dir, exist_ok=True)

    for main_url in url_list:
        print(f"\n🌐 URL: {main_url}")

        # Snapshot of the folder before this index page is processed; used
        # both to skip known files and to compute the summary at the end.
        existing_files = set(os.listdir(lang_dir))

        try:
            subpages = get_internal_pages(main_url)
        except Exception as e:
            print(f"❌ Failed to load page: {e}")
            continue
        print(f"Found {len(subpages)} subpages.\n")

        for i, subpage_url in enumerate(subpages, 1):
            print(f"{i}. Visiting {subpage_url} ...")
            # One unreachable sub-page must not abort the remaining ones.
            try:
                pdf_links = get_pdf_links_from_subpage(subpage_url)
            except Exception as e:
                print(f"❌ Failed to load page: {e}")
                continue

            if not pdf_links:
                print("⚠️ PDF link not found.")
                continue

            print(f"   ↳ Found PDF: {len(pdf_links)} PDF link(s) found.")
            for pdf in pdf_links:
                # Derive the target name before any network access so that the
                # error message below can always refer to it. splitext() swaps
                # only the final extension (and also handles ".PDF").
                pdf_name = os.path.basename(urlparse(pdf).path)
                filename = os.path.splitext(pdf_name)[0] + ".txt"
                path = os.path.join(lang_dir, filename)

                # Check first: no point downloading and parsing a PDF whose
                # text file is already there.
                if filename in existing_files:
                    print(f"         – {filename} already exists, skipping")
                    continue

                try:
                    pdf_resp = requests.get(pdf, timeout=PDF_TIMEOUT_SECONDS)
                    pdf_resp.raise_for_status()
                    text = extract_text_from_pdf_bytes(pdf_resp.content)
                    with open(path, "w", encoding="utf-8") as f:
                        f.write(text)
                    print(f"         ✓ saved {filename}")
                except Exception as e:
                    print(f"         ✗ failed to download {filename}: {e}")

        # Compare the folder content with the snapshot taken above
        final_files = set(os.listdir(lang_dir))
        new_files = final_files - existing_files

        # Print summary: list the files that were added, not the characters
        # of the folder path.
        print(f"\n📊 Done! {len(new_files)} new files added:")
        for name in sorted(new_files):
            print(f"  - {name}")


🟢 Starting list: Persian writtings

🌐 URL: https://www.bahai.org/fa/library/authoritative-texts/bahaullah/
Found 15 subpages.

1. Visiting https://www.bahai.org/fa/library/authoritative-texts/bahaullah/days-remembrance/ ...
   ↳ Found PDF: 1 PDF link(s) found.
         ✓ saved days-remembrance.txt
2. Visiting https://www.bahai.org/fa/library/authoritative-texts/bahaullah/gems-divine-mysteries/ ...
   ↳ Found PDF: 1 PDF link(s) found.
         ✓ saved gems-divine-mysteries.txt
3. Visiting https://www.bahai.org/fa/library/authoritative-texts/bahaullah/tabernacle-unity/ ...
   ↳ Found PDF: 1 PDF link(s) found.
         ✓ saved tabernacle-unity.txt
4. Visiting https://www.bahai.org/fa/library/authoritative-texts/bahaullah/call-divine-beloved/ ...
   ↳ Found PDF: 1 PDF link(s) found.
         ✓ saved call-divine-beloved.txt
5. Visiting https://www.bahai.org/fa/library/authoritative-texts/bahaullah/kitab-i-aqdas/ ...
   ↳ Found PDF: 1 PDF link(s) found.
         ✓ saved kitab-i-aqdas.txt
6.