In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, urljoin

def clean_url(url, base_url):
    parsed_url = urlparse(url)
    clean_url = parsed_url._replace(fragment="").geturl()  # הסרת עוגנים (#)
    if not clean_url.startswith('http'):
        clean_url = urljoin(base_url, clean_url)  # הפיכת כתובת יחסית למוחלטת
    return clean_url

def web_crawler(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    data = []
    seen_urls = set()

    page_title = soup.title.string if soup.title else 'No Title'
    data.append({
        'שם דף': page_title,
        'כתובת עמוד': base_url,
        'תוכן עמוד': soup.get_text()
    })
    seen_urls.add(base_url)

    links = soup.find_all('a', href=True)
    
    for link in links:
        url = clean_url(link['href'], base_url)
        if url not in seen_urls:
            try:
                print(f"Processing: {url}")
                link_response = requests.get(url)
                link_soup = BeautifulSoup(link_response.text, 'html.parser')
                link_title = link_soup.title.string if link_soup.title else 'No Title'
                data.append({
                    'שם דף': link_title,
                    'כתובת עמוד': url,
                    'תוכן עמוד': link_soup.get_text()
                })
                seen_urls.add(url)
            except requests.exceptions.RequestException as e:
                print(f"Error accessing {url}: {e}")

    return data

def save_to_excel(data, filename='web_crawl_output_ynet.xlsx'):
    df = pd.DataFrame(data)
    df.to_excel(filename, index=False)

# כתובת האתר הראשי של Ynet
base_url = 'https://www.ynet.co.il/'

# ביצוע זחילה ושמירת התוצאות
data = web_crawler(base_url)
save_to_excel(data)

print(f"Data successfully saved to 'web_crawl_output_ynet.xlsx'")

Processing: https://www.ynet.co.il/news/category/184
Processing: https://www.ynet.co.il/redmail
Processing: https://www.ynet.co.il/home/0,7340,L-201,00.html
Processing: https://www.ynet.co.il/home/0,7340,L-8,00.html
Processing: javascript:YitPaywall.openLoginPopUp(false,{ redirect_to_plus : true })
Error accessing javascript:YitPaywall.openLoginPopUp(false,{ redirect_to_plus : true }): No connection adapters were found for 'javascript:YitPaywall.openLoginPopUp(false,{ redirect_to_plus : true })'
Processing: https://premium.ynet.co.il/Web/Register
Processing: https://www.ynet.co.il/plus
Processing: https://www.ynet.co.il/news
Processing: https://www.ynet.co.il/news/247
Processing: https://www.ynet.co.il/radio
Processing: https://www.ynet.co.il/economy
Processing: https://www.ynet.co.il/sport
Processing: https://www.ynet.co.il/entertainment
Processing: https://pplus.ynet.co.il/homepage
Processing: https://www.ynet.co.il/health
Processing: https://www.ynet.co.il/wheels
Processing: https:/

Processing: https://www.ynet.co.il/environment-science/article/syqtpi6s0
Processing: https://www.ynet.co.il/environment-science/article/hkske36sa
Processing: https://www.ynet.co.il/environment-science/article/s17xtipsc
Processing: https://www.ynet.co.il/environment-science/article/sybzjz6sc
Processing: https://www.ynet.co.il/environment-science/article/r1jd8bnja
Processing: https://www.ynet.co.il/economy/article/skuhho2ic
Processing: https://www.ynet.co.il/digital/technology/article/s135bd2ja
Processing: https://www.ynet.co.il/digital/technews/article/r1q0essj0
Processing: https://www.ynet.co.il/digital/technews/article/hk4m2f5jr
Processing: https://www.ynet.co.il/digital/article/bj00qimcs0
Processing: https://www.ynet.co.il/digital/technews/article/syjwjudi0
Processing: https://www.instagram.com/laishamag/?utm_source=ynet&utm_medium=referral&utm_campaign=social-inks&utm_content=laisha
Processing: https://www.facebook.com/LaishaMagazine?utm_source=ynet&utm_medium=referral&utm_campaign=