# In [None]:
import requests
import re
from bs4 import BeautifulSoup
import time

# Function to get all ghazal category pages from a poet's main page
def get_ghazal_category_links(poet_url):
    """Collect the ghazal-related links found on a poet's page.

    Fetches ``poet_url`` and gathers every anchor whose ``href`` contains
    ``"/ghazals"``. Relative hrefs are resolved against ``poet_url`` so that
    every returned entry can be requested directly.

    Args:
        poet_url: URL of the page to scan for ghazal category links.

    Returns:
        A sorted list of unique URLs, or an empty list when the request
        fails (the error is printed rather than raised).
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import urljoin

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    try:
        # Without a timeout, requests may wait forever on a stalled server.
        response = requests.get(poet_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching poet's page: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # A set drops anchors that point at the same page more than once.
    category_links = {
        urljoin(poet_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if "/ghazals" in anchor["href"]  # Filtering ghazal-related links
    }

    # Sorted so the result order is reproducible between runs.
    return sorted(category_links)

# Function to fetch ghazal links from a category page
def get_ghazal_links(category_url):
    """Collect links to individual ghazals from a category page.

    Fetches ``category_url`` and gathers every anchor whose ``href`` contains
    ``"/ghazals/"``. Relative hrefs are resolved against ``category_url`` so
    that every returned entry can be requested directly.

    Args:
        category_url: URL of the category page to scan.

    Returns:
        A sorted list of unique URLs, or an empty list when the request
        fails (the error is printed rather than raised).
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import urljoin

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    try:
        # Without a timeout, requests may wait forever on a stalled server.
        response = requests.get(category_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching ghazal links: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # A set drops anchors that point at the same ghazal more than once.
    ghazal_links = {
        urljoin(category_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if "/ghazals/" in anchor["href"]
    }

    # Sorted so the result order is reproducible between runs.
    return sorted(ghazal_links)

# Function to check if a word contains English letters
def contains_english(word):
    """Return True when ``word`` has at least one ASCII letter (a-z or A-Z)."""
    latin_letter = re.search(r"[a-zA-Z]", word)
    return latin_letter is not None

# Function to scrape ghazal text from a given URL
def scrape_ghazal(url):
    """Download one ghazal page and return its Latin-script lines.

    For every ``<p>`` element, the text of its ``<span>`` children is joined
    with single spaces to form one line. Only lines containing at least one
    ASCII letter are kept, and the first two of those are dropped.

    Progress messages and the extracted text are printed as a side effect.

    Args:
        url: Address of the ghazal page to scrape.

    Returns:
        The remaining lines joined with newlines, or ``""`` when the request
        fails, the page has no ``<p>`` elements, or no lines remain after
        filtering.
    """
    print(f"📌 Scraping: {url}\n")

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        # Without a timeout, requests may wait forever on a stalled server,
        # which would freeze the whole scraping loop.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching {url}: {e}\n")
        return ""

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")

    if not paragraphs:
        print("⚠ No ghazal found.\n")
        return ""

    # One candidate line per paragraph, built from its span texts.
    candidate_lines = (
        " ".join(span.get_text(strip=True) for span in p.find_all("span"))
        for p in paragraphs
    )
    english_lines = [line for line in candidate_lines if contains_english(line)]

    # NOTE(review): the first two matching lines are discarded — presumably
    # page header text rather than poetry; confirm against the site markup.
    # Slicing already yields [] when there are two or fewer lines.
    poetry_text = english_lines[2:]

    if not poetry_text:
        print("⚠ No English-transliterated text found.\n")
        return ""

    ghazal = "\n".join(poetry_text)
    print(ghazal + "\n" + "=" * 50 + "\n")
    return ghazal

# Function to write the scraped data to a text file
def write_to_text(data, filename="updated_ghazals.txt"):
    """Append the text of every scraped entry to ``filename``.

    Args:
        data: Iterable of indexable entries; only index 1 (the ghazal text)
            is written. Index 0 (the URL) is not written.
        filename: Path of the UTF-8 text file, opened in append mode.
    """
    with open(filename, mode="a", encoding="utf-8") as out:
        # Each ghazal is framed by a leading and a trailing newline, so
        # consecutive entries end up separated by a blank line.
        out.writelines(f"\n{entry[1]}\n" for entry in data)

# Main execution
def main():
    """Run the full scrape: category pages -> ghazal links -> text file.

    Collects the ghazal category pages linked from the base URL, gathers the
    ghazal links on each of them, scrapes every unique ghazal with a pause
    between requests, and appends the results to the output file.
    """
    base_url = "https://www.rekhta.org/poets?wref=rweb"  # Replace with the actual base URL
    output_file = "updated_ghazals.txt"

    category_links = get_ghazal_category_links(base_url)
    print(f"✅ Found {len(category_links)} ghazal category pages.\n")

    all_ghazal_links = []
    for category_link in category_links:
        all_ghazal_links.extend(get_ghazal_links(category_link))

    # The same ghazal can be linked from several category pages; drop repeats
    # (keeping first-seen order) so nothing is fetched or written twice.
    all_ghazal_links = list(dict.fromkeys(all_ghazal_links))

    print(f"✅ Found {len(all_ghazal_links)} ghazals.\n")

    scraped_data = []
    for i, link in enumerate(all_ghazal_links, start=1):
        print("Ghazal no:", i)
        ghazal_text = scrape_ghazal(link)
        if ghazal_text:
            scraped_data.append([link, ghazal_text])
        time.sleep(2)  # Delay to prevent blocking

    if scraped_data:
        write_to_text(scraped_data, output_file)
        # Report the file that was actually written.
        print(f"✅ Data has been written to {output_file}")


# The dunder names need double underscores; `_name_` is undefined and
# raises NameError as soon as the script runs.
if __name__ == "__main__":
    main()