# Sitemap Extraction

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from queue import Queue
import concurrent.futures

base_url = "https://wiki.ahlolbait.com/"
visited_links = set()
unvisited_links = Queue()
unvisited_links.put(base_url)

all_links = set()
max_pages = 15000
num_threads = 16

lock = threading.Lock()

def extract_links(url):
    try:
        page_response = requests.get(url, timeout=16)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, "html.parser")
        links = page_soup.find_all("a")
        new_links = set()

        for link in links:
            href = link.get("href")
            if href and href.startswith("/"):
                full_url = urljoin(base_url, href)
                if full_url not in visited_links:
                    new_links.add(full_url)

        return new_links
    except (requests.Timeout, requests.RequestException) as e:
        print(f"Request error for URL {url}: {e}")
        return set()
    except Exception as e:
        print(f"General error for URL {url}: {e}")
        return set()

def crawl_links():
    global all_links
    pages_extracted = 0
    while pages_extracted < max_pages and not unvisited_links.empty():
        current_url = unvisited_links.get()
        if current_url in visited_links:
            continue

        with lock:
            visited_links.add(current_url)

        new_links = extract_links(current_url)
        with lock:
            all_links.update(new_links)
            pages_extracted += 1

        for link in new_links:
            if link not in visited_links:
                unvisited_links.put(link)
        if pages_extracted%1000==0: print(pages_extracted)
        print(f"Links extracted: {len(all_links)}")

def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(crawl_links) for _ in range(num_threads)]
        concurrent.futures.wait(futures)

    print("Link extraction finished.")

    # ذخیره لینک‌ها در فایل
    with open("links.txt", "w", encoding="utf-8") as f:
        for link in all_links:
            f.write(link + "\n")

if __name__ == "__main__":
    main()

Links extracted: 214
Links extracted: 562
Links extracted: 911
Links extracted: 938
Links extracted: 960
Links extracted: 1069
Links extracted: 1205
Links extracted: 1239
Links extracted: 1286
Links extracted: 1449
Links extracted: 1573
Links extracted: 1727
Links extracted: 1740
Links extracted: 1775
Links extracted: 1795
Links extracted: 1860
Links extracted: 1881
Links extracted: 1900
Links extracted: 2235
Links extracted: 2268
Links extracted: 2600
Links extracted: 2610
Links extracted: 2638
Links extracted: 2696
Links extracted: 2711
Links extracted: 3056
Links extracted: 3214
Links extracted: 3236
Links extracted: 3237
Links extracted: 3251
Links extracted: 3340
Links extracted: 3358
Links extracted: 3384
Links extracted: 3397
Links extracted: 3420
Links extracted: 3760
Links extracted: 3778
Links extracted: 3790
Links extracted: 3851
Links extracted: 3858
Links extracted: 4086
Links extracted: 4133
Links extracted: 4469
Links extracted: 4483
Links extracted: 4497
Links extracted



Links extracted: 79883
Links extracted: 79885
Links extracted: 79887
Links extracted: 79893
Links extracted: 79897
Links extracted: 79900
Links extracted: 79906
Links extracted: 79908
Links extracted: 79914
Links extracted: 79915
Links extracted: 79920
Links extracted: 79924
Links extracted: 79927
Links extracted: 79929
Links extracted: 79931
Links extracted: 79935
Links extracted: 79937
Links extracted: 79943
Links extracted: 79948
Links extracted: 79951
Links extracted: 79956
Links extracted: 79959
Request error for URL https://wiki.ahlolbait.com/index.php?title=%D8%B5%D9%81%D8%AD%D9%87%D9%94_%D8%A7%D8%B5%D9%84%DB%8C&offset=&limit=50&action=history: 403 Client Error: Forbidden for url: https://wiki.ahlolbait.com/index.php?title=%D8%B5%D9%81%D8%AD%D9%87%D9%94_%D8%A7%D8%B5%D9%84%DB%8C&offset=&limit=50&action=history
Links extracted: 79959
Links extracted: 79961
Links extracted: 79963
Links extracted: 79966
Links extracted: 79972
Links extracted: 79977
Links extracted: 79982
Links extra

# Crawl Information From Site

In [121]:
f = open("filtered_links.txt", "r", encoding='utf-8')
urls = set(f.read().split('\n'))
urls

{'',
 'https://wiki.ahlolbait.com/فریب',
 'https://wiki.ahlolbait.com/حاج_ملااسدالله_بروجردی',
 'https://wiki.ahlolbait.com/محمد_اقبال_لاهوری',
 'https://wiki.ahlolbait.com/اسماعیل_حسن',
 'https://wiki.ahlolbait.com/حکم_بن_ابی_العاص',
 'https://wiki.ahlolbait.com/آداب_ماه_ربیع_الاول',
 'https://wiki.ahlolbait.com/کتابخانه_مجازی_ایران',
 'https://wiki.ahlolbait.com/زبیر_بن_عوام',
 'https://wiki.ahlolbait.com/کل_یوم_عاشورا',
 'https://wiki.ahlolbait.com/امامزاده_سلطان_ابراهیم',
 'https://wiki.ahlolbait.com/ابن_ابی_زینب_محمد_بن_ابراهیم_نعمانی',
 'https://wiki.ahlolbait.com/بازارهای_عرب_پیش_از_اسلام',
 'https://wiki.ahlolbait.com/سال_سوم_هجرت',
 'https://wiki.ahlolbait.com/آخوند_ملا_محمد_کاظم_خراسانی',
 'https://wiki.ahlolbait.com/شیخ_محمدتقی_بافقی',
 'https://wiki.ahlolbait.com/علی_بن_محمد_سمری',
 'https://wiki.ahlolbait.com/پژوهشکده_تفسیر_اهل_البیت_علیهم_السلام',
 'https://wiki.ahlolbait.com/مستحب',
 'https://wiki.ahlolbait.com/فهرست_اماکن_مقدسه_گیلان',
 'https://wiki.ahlolbait.com/شفیعا

In [123]:
import re
from urllib.parse import urlparse, parse_qs

# فایل ورودی و خروجی
input_file = 'links.txt'
output_file = 'filtered_links.txt'

# لیست لینک‌های ورودی
with open(input_file, 'r', encoding='utf-8-sig') as file:
    links = file.readlines()

# الگوی regex برای لینک‌های مورد نظر
pattern = re.compile(r'^https://wiki\.ahlolbait\.com/[اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی_\-%]+$')

# استخراج لینک‌های مورد نظر و تبدیل لینک‌های با ساختار مشخص
filtered_links = []
for link in links:
    link = link.strip()
    new_link = link

    if "index.php" in link:
        parsed_url = urlparse(link)
        query_params = parse_qs(parsed_url.query)
        if 'title' in query_params:
            new_link = f"https://wiki.ahlolbait.com/{query_params['title'][0]}"

    # حذف هر چیزی بعد از ":"
    if len(new_link.split(":"))>2:
        base_link, tail = new_link.split(':')[0]+new_link.split(':')[1], new_link.split(':')[-1]
        base_link = 'https://wiki.ahlolbait.com/' + base_link.split('/')[-1]
        if '/' in tail:
            tail = tail.split('/')[1]
        new_link = f"{base_link}/{tail}"

    # بررسی مطابقت با الگوی regex و حذف پسوندهای نامناسب
    if pattern.match(new_link) and not (new_link.endswith('.jpg') or new_link.endswith('.png') or new_link.endswith('.php')):
        filtered_links.append(new_link)

# حذف لینک‌های تکراری
filtered_links = set(filtered_links)

print(f"لینک‌های فیلتر شده: {len(filtered_links)}")

# نوشتن لینک‌های فیلتر شده در فایل خروجی
with open(output_file, 'w', encoding='utf-8-sig') as file:
    for link in filtered_links:
        file.write(link + '\n')

print("لینک‌های فیلتر شده در فایل 'filtered_links.txt' ذخیره شدند.")

لینک‌های فیلتر شده: 7244
لینک‌های فیلتر شده در فایل 'filtered_links.txt' ذخیره شدند.


In [124]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# مجموعه‌ای برای ذخیره لینک‌های بازدیدشده
visited_links = set()

# لیست لینک‌ها برای بازدید
links_list = urls

scraped_dataset = {"url": [], "title": [], "text": []}

# تنظیمات ماکسیمم صفحات برای استخراج
max_pages = len(urls)

def clean_text(text):
    # حذف عباراتی که در براکت هستند مانند [1], [2], ...
    text = re.sub(r'\[[0-9]+\]', '', text)
    text = re.sub(r'^این\s+صفحه\s+مدخلی\s+از.*?است\s*', '', text)
    text = re.sub(r'^این\s+مدخل.*?است\s*\.', '', text)
    text = re.sub(r'محتوای\s+فعلی\s+بخشی\s+از\s+یک\s+کتاب\s+متناسب\s+با\s+عنوان\s+است\.', '', text)
    return text.strip()

def extract_info(url):
    try:
        page_response = requests.get(url, timeout=16)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, "html.parser")
        title = page_soup.find("title").text.strip()
        
        # یافتن تمام تگ‌های p قبل از اولین h2
        paragraphs = []
        flag_end = False
        for element in page_soup.find_all(['p', 'h2']):
            if element.name == 'p':
                paragraphs.append(element.get_text())
            if flag_end: 
                if element.get_text()[:-1]=='.': break
            if element.name == 'h2':
                flag_end = True
        
        text = ""
        for p in paragraphs: 
            if p.strip()!="": text += p.strip()+" "
        
        # اعمال تمیزکاری روی متن
        text = clean_text(text)
        
        # برش تا 8 جمله
        text_list = text.split('.')[:8]
        text = ""
        for t in text_list: text += t + '. '
        
        return title, text
    
    except (requests.Timeout, requests.RequestException) as e:
        print(f"Request error for URL {url}: {e}")
        return None, None
    except Exception as e:
        print(f"General error for URL {url}: {e}")
        return None, None

def process_url(current_url):
    if current_url in visited_links:
        return None

    visited_links.add(current_url)
    title, text = extract_info(current_url)
    if title and text:
        return {"url": current_url, "title": title, "text": text}
    return None

pages_extracted = 0

with ThreadPoolExecutor(max_workers=16) as executor:
    future_to_url = {executor.submit(process_url, url): url for url in links_list}
    for future in as_completed(future_to_url):
        result = future.result()
        if result:
            scraped_dataset["url"].append(result["url"])
            scraped_dataset["title"].append(result["title"])
            scraped_dataset["text"].append(result["text"])
            pages_extracted += 1
            print(f"Pages extracted: {pages_extracted} => {result['title']}")
        if pages_extracted >= max_pages:
            break

Request error for URL : Invalid URL '': No scheme supplied. Perhaps you meant https://?
Request error for URL https://wiki.ahlolbait.com/حکم_بن_ابی_العاص: 404 Client Error: Not Found for url: https://wiki.ahlolbait.com/%D8%AD%DA%A9%D9%85_%D8%A8%D9%86_%D8%A7%D8%A8%DB%8C_%D8%A7%D9%84%D8%B9%D8%A7%D8%B5
Request error for URL https://wiki.ahlolbait.com/امامزاده_سلطان_ابراهیم: 404 Client Error: Not Found for url: https://wiki.ahlolbait.com/%D8%A7%D9%85%D8%A7%D9%85%D8%B2%D8%A7%D8%AF%D9%87_%D8%B3%D9%84%D8%B7%D8%A7%D9%86_%D8%A7%D8%A8%D8%B1%D8%A7%D9%87%DB%8C%D9%85
Request error for URL https://wiki.ahlolbait.com/اسماعیل_حسن: 404 Client Error: Not Found for url: https://wiki.ahlolbait.com/%D8%A7%D8%B3%D9%85%D8%A7%D8%B9%DB%8C%D9%84_%D8%AD%D8%B3%D9%86
Pages extracted: 1 => زبیر بن عوام - دانشنامه‌ی اسلامی
Pages extracted: 2 => بازارهای عرب پیش از اسلام - دانشنامه‌ی اسلامی
Pages extracted: 3 => کتابخانه مجازی ایران - دانشنامه‌ی اسلامی
Request error for URL https://wiki.ahlolbait.com/محمد_اقبال_لاهور

# Save Crawled Dataset

In [125]:
import pandas as pd

df = pd.DataFrame()
df = df.from_dict(scraped_dataset)
df.to_csv("scraped_dataset_ahlolbait.csv", encoding='utf-8-sig')

In [133]:
import pandas as pd
import re 

df = pd.read_csv("scraped_dataset_ahlolbait.csv")

df['context'] = df['text']
df['context'] = df['context'].apply(lambda x: re.sub(r'\[.*?\]', '', x))
df['answer'] = df['context']

# حذف عبارت - ویکی خمینی از ستون title
df['title'] = df['title'].str.replace(' - دانشنامه‌ی اسلامی', '')
df['question'] = df['title']

# انتخاب فقط ستون‌های مورد نیاز
df = df[['title', 'question', 'answer', 'context']]

# ذخیره فایل اصلاح شده
df.to_csv("crawl_wiki_ahlolbait.csv", index=False, encoding='utf-8-sig')


In [135]:
import pandas as pd
import re

# خواندن دیتاست
df = pd.read_csv('crawl_wiki_ahlolbait.csv')

def clean_text(text):
    # حذف جملاتی که شامل "دانشنامه هنوز" هستند
    text = re.sub(r'\b[^.]*\bدانشنامه هنوز\b[^.]*\.', '', text)
    return text.strip()

# اعمال تمیزکاری روی ستون‌های 'answer' و 'context'
df['answer'] = df['answer'].apply(clean_text)
df['context'] = df['context'].apply(clean_text)

# ذخیره دیتاست تمیز شده
df.to_csv('cleaned_wiki_ahlolbait.csv', index=False, encoding='utf-8-sig')