In [16]:
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama


# Colour setup: colorama supplies the colour codes used when printing links,
# so internal links (green) and external links (gray) are easy to tell apart.
colorama.init()
GREEN, GRAY, RESET = (
    colorama.Fore.GREEN,
    colorama.Fore.LIGHTBLACK_EX,
    colorama.Fore.RESET,
)


# Global registries of the unique links seen so far (sets, so no duplicates).
internal_urls, external_urls = set(), set()


def is_valid(url):
    """
    Checks whether `url` is a valid absolute URL.

    A URL is accepted when it parses and has both a scheme (e.g. "https")
    and a network location (e.g. "www.example.com").

    params:
        url (str): the URL to check.
    returns:
        bool: True if `url` has a scheme and a network location, else False.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects malformed authorities (e.g. unbalanced IPv6
        # brackets); for a validity check that simply means "not valid".
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url, timeout=10):
    """
    Collects the links found in the `<a>` tags of the page at `url`.

    Every link is made absolute and reduced to scheme + host + path (query
    string and fragment are dropped). Links whose host equals the host of
    `url` are added to the global `internal_urls` set, all others to the
    global `external_urls` set; each newly seen link is printed once.

    params:
        url (str): address of the page to download and scan.
        timeout (float): seconds to wait for the server, default is 10.
    returns:
        set: internal links found on this page that were not already in
        `internal_urls`. Empty if the page could not be downloaded.
    """
    # new internal links discovered on this page
    urls = set()
    # host of the page being scanned (without the protocol)
    domain_name = urlparse(url).netloc.lower()
    try:
        # the timeout keeps one unresponsive server from hanging the whole crawl
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # a page that cannot be fetched contributes no links instead of
        # aborting the crawl
        print(f"{GRAY}[!] Could not fetch {url}: {error}{RESET}")
        return urls
    soup = BeautifulSoup(response.content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        try:
            # join the URL if it's relative (not absolute link)
            parsed_href = urlparse(urljoin(url, href))
        except ValueError:
            # malformed href that the URL parser rejects
            continue
        if parsed_href.scheme not in ("http", "https"):
            # mailto:, tel:, javascript: ... are not crawlable pages; rebuilding
            # them below would yield a bogus "scheme://..." string that passes
            # `is_valid`
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        # compare hosts exactly: a substring test would also match other hosts
        # or paths that merely contain the domain name
        if parsed_href.netloc.lower() != domain_name:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls

# cumulative number of pages fetched by every `crawl` call so far
total_urls_visited = 0

def crawl(url, max_urls=100):
    """
    Crawls a website starting at `url` and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.

    Pages are visited depth-first. The `max_urls` budget applies to this call
    only, so calling `crawl` again starts with a fresh budget; the global
    `total_urls_visited` keeps the running total across all calls.

    params:
        url (str): address of the first page to crawl.
        max_urls (int): number of max urls to crawl, default is 100.
    """
    global total_urls_visited
    visited_this_call = 0
    # explicit stack instead of recursion, so the depth of the crawl is not
    # limited by Python's recursion limit
    pending = [url]
    while pending and visited_this_call < max_urls:
        current = pending.pop()
        visited_this_call += 1
        total_urls_visited += 1
        links = get_all_website_links(current)
        # push in reverse so the links of a page are popped (visited) in their
        # iteration order, before the remaining links of its parent page
        pending.extend(reversed(list(links)))

In [17]:
# Crawl from the English home page, then report how many unique links are known.
crawl('https://www.immoweb.be/en')
external_count, internal_count = len(external_urls), len(internal_urls)
print("[+] Total External links:", external_count)
print("[+] Total Internal links:", internal_count)
print("[+] Total:", external_count + internal_count)

[*] Internal link: https://www.immoweb.be/en
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-rent
[*] Internal link: https://www.immoweb.be/en/search/house/for-sale
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent
[*] Internal link: https://www.immoweb.be/en/search/apartment/for-sale
[*] Internal link: https://www.immoweb.be/en/search/apartment/for-rent
[*] Internal link: https://www.immoweb.be/en/page/about-immoweb
[*] Internal link: https://www.immoweb.be/en/page/the-immoweb-newsroom
[!] External link: https://www.linkedin.com/company/immoweb/jobs/
[!] External link: https://www.axelspringer.com/en/
[!] External link: https://valuation.immoweb.be/real-estate-estimate/seg1
[!] External link: https://www.constructr.be/en
[*] Internal link: https://www.immoweb.be/en/credit-application
[!] External link: https://protect.immoweb.be/tenant-home-insurance/landing
[*

[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/hainaut/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/liege/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/limburg/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/luxembourg/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/namur/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/walloon-brabant/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/west-flanders/province
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/brugge/district
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/diksmuide/district
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/ieper/district
[*] Internal link: https://www.immoweb.be/en/search/house/for-rent/kortrijk/district
[*] Internal link: https://www.immoweb.be/en/search/house/for-r

[*] Internal link: https://www.immoweb.be/en/search/castle/for-rent/damme/8340
[*] Internal link: https://www.immoweb.be/en/search/country-cottage/for-rent/damme/8340
[*] Internal link: https://www.immoweb.be/en/search/mixed-use-building/for-rent/damme/8340
[*] Internal link: https://www.immoweb.be/en/search/other-properties/for-rent/damme/8340
[*] Internal link: https://www.immoweb.be/en/search/country-house/for-rent/damme/8340
[*] Internal link: https://www.immoweb.be/en/search/pavilion/for-rent/damme/8340
[*] Internal link: https://www.immoweb.be/en/search/villa/for-rent/zedelgem/8210
[*] Internal link: https://www.immoweb.be/en/search/apartment-block/for-rent/zedelgem/8210
[*] Internal link: https://www.immoweb.be/en/search/town-house/for-rent/zedelgem/8210
[*] Internal link: https://www.immoweb.be/en/search/mansion/for-rent/zedelgem/8210
[*] Internal link: https://www.immoweb.be/en/search/exceptional-property/for-rent/zedelgem/8210
[*] Internal link: https://www.immoweb.be/en/sear

In [18]:
# Crawl from the "for sale" search page, then report the link totals again.
crawl('https://www.immoweb.be/en/search/house-and-apartment/for-sale')
external_count, internal_count = len(external_urls), len(internal_urls)
print("[+] Total External links:", external_count)
print("[+] Total Internal links:", internal_count)
print("[+] Total:", external_count + internal_count)

[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/antwerp/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/brussels/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/east-flanders/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/flemish-brabant/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/hainaut/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/liege/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/limburg/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/luxembourg/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/namur/province
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/walloon-brabant/provinc

In [19]:
# Crawl from the Antwerp province search page, then report the link totals again.
crawl('https://www.immoweb.be/en/search/house-and-apartment/for-sale/antwerp/province')
external_count, internal_count = len(external_urls), len(internal_urls)
print("[+] Total External links:", external_count)
print("[+] Total Internal links:", internal_count)
print("[+] Total:", external_count + internal_count)

[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/mechelen/district
[*] Internal link: https://www.immoweb.be/en/search/house-and-apartment/for-sale/turnhout/district
[*] Internal link: https://www.immoweb.be/en/search/villa/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/apartment-block/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/town-house/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/mansion/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/exceptional-property/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/farmhouse/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/bungalow/for-sale/antwerp/district
[*] Internal link: https://www.immoweb.be/en/search/chalet/for-sale/antwerp/d