In [None]:
# notebooks/crawling_documentation.ipynb

import sys
import os
import pandas as pd
import sqlite3

# Set the working directory to the project root, so that relative paths used
# later in this notebook (e.g. DB_PATH under "data/") resolve against it.
# Presumably this is also what makes the `services` package importable below;
# verify against how the kernel sets up sys.path.
# NOTE(review): this is a hard-coded, machine-specific absolute Windows path;
# os.chdir raises if the directory does not exist on the current machine.
os.chdir(r"C:\Users\muhammad\Desktop\ir")

from services.crawling.crawler import crawl_website, merge_crawled_data_to_db


# --- Step 1: crawl configuration ---
# Seed page handed to crawl_website() as its first argument.
START_URL = "https://en.wikipedia.org/wiki/Information_retrieval" 
# Second argument to crawl_website(); presumably an upper bound on the number
# of pages fetched -- confirm against the crawler implementation.
MAX_PAGES_TO_CRAWL = 20
# Dataset identifier; only used here to build the SQLite file name.
DATASET_KEY = "antique"
# Relative path (resolved against the current working directory) of the SQLite
# file that is queried and merged into further down.
DB_PATH = f"data/{DATASET_KEY}_docs.db"

# --- Step 2: run the crawl ---
# The result is consumed below via len() and pd.DataFrame(), and is expected
# to expose a 'text' field per record.
crawled_articles = crawl_website(START_URL, MAX_PAGES_TO_CRAWL)


def _count_docs(db_path):
    """Return the number of rows in the ``docs`` table of the SQLite file.

    The connection is opened only for the duration of the query and is always
    closed, even when the query raises (e.g. the ``docs`` table is missing).

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        The integer result of ``SELECT COUNT(*) FROM docs``.

    Raises:
        sqlite3.Error: Propagated unchanged from connecting or querying.
    """
    from contextlib import closing

    # sqlite3's own context manager only commits / rolls back the transaction;
    # it does not close the connection, hence closing().
    with closing(sqlite3.connect(db_path)) as conn:
        return conn.execute("SELECT COUNT(*) FROM docs").fetchone()[0]


print("\n" + "="*50)
print(" CRAWLING RESULTS & STATISTICS")
print("="*50)

num_records = len(crawled_articles)
print(f"Total records extracted: {num_records}")

if num_records > 0:
    # Build a DataFrame so the statistics are easy to compute and display.
    df = pd.DataFrame(crawled_articles)
    df['text_length'] = df['text'].str.len()

    print(f"\nAverage text length: {df['text_length'].mean():.2f} characters")

    print("\nSample of crawled data:")
    # display() is the rich-output helper injected by IPython/Jupyter.
    display(df.head())

    # --- Step 3: check the database before merging ---
    count_before = _count_docs(DB_PATH)
    print(f"\nNumber of documents in DB *before* merge: {count_before}")

    # --- Step 4: merge the crawled data into the database ---
    merge_crawled_data_to_db(crawled_articles, DB_PATH)

    # --- Step 5: check the database after merging ---
    count_after = _count_docs(DB_PATH)
    print(f"Number of documents in DB *after* merge: {count_after}")

    # The difference in row counts is reported as the number of new documents.
    print(f"\nVerification: {count_after - count_before} new documents were added.")
else:
    print("\nNo articles were crawled. Check your start URL and parsing logic.")

Crawling: https://en.wikipedia.org/wiki/Information_retrieval
Crawling: https://en.wikipedia.org/wiki/MEDLARS
Crawling: https://en.wikipedia.org/wiki/MEDLARS#cite_ref-16
Crawling: https://en.wikipedia.org/wiki/Atlantic_Monthly
Crawling: https://en.wikipedia.org/wiki/Information_retrieval#cite_note-13
Crawling: https://en.wikipedia.org/wiki/Subject_indexing
Crawling: https://en.wikipedia.org/wiki/Alvin_Weinberg
Crawling: https://en.wikipedia.org/wiki/Learning_to_rank
Crawling: https://en.wikipedia.org/wiki/Altbib
Crawling: https://en.wikipedia.org/wiki/Topic-based_vector_space_model
Crawling: https://en.wikipedia.org/wiki/Learning_to_rank#cite_ref-58
Crawling: https://en.wikipedia.org/wiki/Special:BookSources/1-85604-482-3
Crawling: https://en.wikipedia.org/wiki/Special:BookSources/1-85604-482-3#Taiwan,_Republic_of_China
Crawling: https://en.wikipedia.org/wiki/Learning_to_rank#cite_note-5
Crawling: https://en.wikipedia.org/w/index.php?title=Alvin_M._Weinberg&action=edit&section=11
Crawl