# Sitemap Extraction

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from queue import Queue
import concurrent.futures

base_url = "https://fa.wiki.khomeini.ir"
visited_links = set()
unvisited_links = Queue()
unvisited_links.put(base_url)

all_links = set()
max_pages = 100000
num_threads = 16

lock = threading.Lock()

def extract_links(url):
    try:
        page_response = requests.get(url, timeout=16)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, "html.parser")
        links = page_soup.find_all("a")
        new_links = set()

        for link in links:
            href = link.get("href")
            if href and href.startswith("/wiki/"):
                full_url = urljoin(base_url, href)
                if full_url not in visited_links:
                    new_links.add(full_url)

        return new_links
    except (requests.Timeout, requests.RequestException) as e:
        print(f"Request error for URL {url}: {e}")
        return set()
    except Exception as e:
        print(f"General error for URL {url}: {e}")
        return set()

def crawl_links():
    global all_links
    pages_extracted = 0
    while pages_extracted < max_pages and not unvisited_links.empty():
        current_url = unvisited_links.get()
        if current_url in visited_links:
            continue

        with lock:
            visited_links.add(current_url)

        new_links = extract_links(current_url)
        with lock:
            all_links.update(new_links)
            pages_extracted += 1

        for link in new_links:
            if link not in visited_links:
                unvisited_links.put(link)

        print(f"Links extracted: {len(all_links)}")

def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(crawl_links) for _ in range(num_threads)]
        concurrent.futures.wait(futures)

    print("Link extraction finished.")

    # ذخیره لینک‌ها در فایل
    with open("links.txt", "w", encoding="utf-8") as f:
        for link in all_links:
            f.write(link + "\n")

if __name__ == "__main__":
    main()

Links extracted: 52
Links extracted: 183
Links extracted: 195
Links extracted: 204
Links extracted: 204
Links extracted: 208
Links extracted: 229
Links extracted: 230
Links extracted: 270
Links extracted: 279
Links extracted: 282
Links extracted: 285
Links extracted: 327
Links extracted: 337
Request error for URL https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%A9%DB%8C_%D8%A7%D9%85%D8%A7%D9%85_%D8%AE%D9%85%DB%8C%D9%86%DB%8C:%D8%AA%DA%A9%D8%B0%DB%8C%D8%A8%E2%80%8C%D9%86%D8%A7%D9%85%D9%87%D9%94_%D8%B9%D9%85%D9%88%D9%85%DB%8C: 404 Client Error: Not Found for url: https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%A9%DB%8C_%D8%A7%D9%85%D8%A7%D9%85_%D8%AE%D9%85%DB%8C%D9%86%DB%8C:%D8%AA%DA%A9%D8%B0%DB%8C%D8%A8%E2%80%8C%D9%86%D8%A7%D9%85%D9%87%D9%94_%D8%B9%D9%85%D9%88%D9%85%DB%8C
Links extracted: 337
Links extracted: 381
Links extracted: 384
Links extracted: 470
Links extracted: 517
Links extracted: 525
Links extracted: 596
Links extracted: 613
Links extracted: 1424
Request error for URL https

# Crawl Information From Site

In [8]:
f = open("C:/Users/AsusIran/Desktop\Persian_QA/02-QA-Dataset-Generator/Wiki-Khomeini/wiki-khomein-sitemap.txt", "r")
urls = set(f.read().split('\n'))
urls

{'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D8%A7%D8%AA_%D9%85%D8%B1%D8%AA%D8%A8%D8%B7/%DA%A9%D8%A7%D9%BE%DB%8C%D8%AA%D9%88%D9%84%D8%A7%D8%B3%DB%8C%D9%88%D9%86',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D8%A7%D8%AA_%D9%85%D8%B1%D8%AA%D8%A8%D8%B7/%D8%A8%D8%A7%D9%86%DA%A9',
 'https://fa.wiki.khomeini.ir/wiki/%D8%B3%D8%AE%D9%86%D8%A7%D9%86_%D8%A7%D9%85%D8%A7%D9%85%E2%80%8C%D8%AE%D9%85%DB%8C%D9%86%DB%8C',
 '',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D9%BE%DB%8C%D9%88%D9%86%D8%AF_%D8%A8%D9%87_%D8%A7%DB%8C%D9%86_%D8%B5%D9%81%D8%AD%D9%87/%D8%A8%D8%B3%DB%8C%D8%AC_%D8%B9%D9%85%D9%88%D9%85%DB%8C',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D8%A7%D8%AA_%D9%85%D8%B1%D8%AA%D8%A8%D8%B7/%DB%B1%DB%B3_%D8%AE%D8%B1%D8%AF%D8%A7%D8%AF',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D9%BE%DB%8C%D9%88%D9%86%D8%AF_%D8%A8%D9%87_%D8

In [9]:
urls = list(urls)
urls = urls[5000:]
urls

['https://fa.wiki.khomeini.ir/wiki/%D8%A7%D9%85%DA%A9%D8%A7%D9%86_%D8%A7%D8%B4%D8%B1%D9%81',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D9%BE%DB%8C%D9%88%D9%86%D8%AF_%D8%A8%D9%87_%D8%A7%DB%8C%D9%86_%D8%B5%D9%81%D8%AD%D9%87/%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1_%D8%AA%D8%A7%D8%B1%DB%8C%D8%AE_%D9%87%D8%AC%D8%B1%DB%8C_%D8%A8%D9%87_%D8%B4%D8%A7%D9%87%D9%86%D8%B4%D8%A7%D9%87%DB%8C',
 'https://fa.wiki.khomeini.ir/wiki/%D8%AC%D8%B9%D9%81%D8%B1_%D8%B4%D8%AC%D9%88%D9%86%DB%8C',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%A7%DB%8C%D8%AC%D8%A7%D8%AF_%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D9%85%D8%B3%DB%8C%D8%B1/%D8%AA%D9%84%D9%82%DB%8C%D8%AD_%D9%85%D8%B5%D9%86%D9%88%D8%B9%DB%8C',
 'https://fa.wiki.khomeini.ir/wiki/%D8%B1%D8%AF%D9%87:%D8%B4%D8%A7%DA%AF%D8%B1%D8%AF%D8%A7%D9%86_%D8%A7%D9%85%D8%A7%D9%85%E2%80%8C%D8%AE%D9%85%DB%8C%D9%86%DB%8C',
 'https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%A7%DB%8C%D8%AC%D8%A7%D8%AF_%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D9%85%D8%B3%DB

In [10]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# مجموعه‌ای برای ذخیره لینک‌های بازدیدشده
visited_links = set()

# لیست لینک‌ها برای بازدید
links_list = urls

scraped_dataset = {"url": [], "title": [], "text": []}

# تنظیمات ماکسیمم صفحات برای استخراج
max_pages = len(urls)

def extract_info(url):
    try:
        page_response = requests.get(url, timeout=16)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, "html.parser")
        title = page_soup.find("title").text.strip()
        paragraphs = page_soup.find_all("p")

        text = "\n".join([p.get_text().strip() for p in paragraphs])
        if "انتقال به نوار کناری" in text:
            text = clean_text(text)
            return title, text
        else:
            return None, None
    except (requests.Timeout, requests.RequestException) as e:
        print(f"Request error for URL {url}: {e}")
        return None, None
    except Exception as e:
        print(f"General error for URL {url}: {e}")
        return None, None

def clean_text(text):
    pattern = re.compile(r".*?نهفتن", re.DOTALL)
    cleaned_text = re.sub(pattern, "", text).strip()
    sentences = cleaned_text.split(".")[:8]
    return ".".join(sentences).strip() + "."

def process_url(current_url):
    if current_url in visited_links:
        return None

    visited_links.add(current_url)
    title, text = extract_info(current_url)
    if title and text:
        return {"url": current_url, "title": title, "text": text}
    return None

pages_extracted = 0

with ThreadPoolExecutor(max_workers=16) as executor:
    future_to_url = {executor.submit(process_url, url): url for url in links_list}
    for future in as_completed(future_to_url):
        result = future.result()
        if result:
            scraped_dataset["url"].append(result["url"])
            scraped_dataset["title"].append(result["title"])
            scraped_dataset["text"].append(result["text"])
            pages_extracted += 1
            print(f"Pages extracted: {pages_extracted} => {result['title']}")
        if pages_extracted >= max_pages:
            break

Pages extracted: 1 => مرزنشینان - ویکی امام خمینی
Pages extracted: 2 => قرآن صاعد (کتاب) - ویکی امام خمینی
Pages extracted: 3 => سیر مبارزات‌ امام‌خمینی‌ در آینه اسناد به روایت ساواک (کتاب) - ویکی امام خمینی
Pages extracted: 4 => جعفر جوادشجونی - ویکی امام خمینی
Pages extracted: 5 => پیامبر اکرم(ص) - ویکی امام خمینی
Request error for URL https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D8%A7%D8%AA_%D9%85%D8%B1%D8%AA%D8%A8%D8%B7/%D8%B3%DB%8C%D8%AF%D8%A7%D8%B3%D8%AF%D8%A7%D9%84%D9%84%D9%87_%D9%85%D8%AF%D9%86%DB%8C: 404 Client Error: Not Found for url: https://fa.wiki.khomeini.ir/wiki/%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D8%BA%DB%8C%DB%8C%D8%B1%D8%A7%D8%AA_%D9%85%D8%B1%D8%AA%D8%A8%D8%B7/%D8%B3%DB%8C%D8%AF%D8%A7%D8%B3%D8%AF%D8%A7%D9%84%D9%84%D9%87_%D9%85%D8%AF%D9%86%DB%8C
Pages extracted: 6 => تعهد و تخصص - ویکی امام خمینی
Pages extracted: 7 => امکان اشرف - ویکی امام خمینی
Pages extracted: 8 => پرتو آفتاب (کتاب) - ویکی امام خمینی
Pages extracted: 9 => الا

# Save Crawled Dataset

In [11]:
import pandas as pd

df = pd.DataFrame()
df = df.from_dict(scraped_dataset)
df.to_csv("scraped_dataset_khomeini_wiki.csv")