# 🔍 Website URL Crawler in Google Colab
This notebook allows you to crawl internal URLs from a website and download them as an Excel file.

In [None]:
!pip install requests beautifulsoup4 pandas xlsxwriter

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd
from io import BytesIO
from IPython.display import FileLink

In [None]:
def get_website_urls(base_url):
    """Crawl a website and collect the internal URLs it links to.

    Starting from ``base_url``, each reachable page on the same network
    location is fetched and its ``<a href="...">`` links are gathered.
    Progress and errors are reported with ``print``.

    Args:
        base_url: Absolute URL where the crawl starts. Its ``netloc``
            decides which links count as internal.

    Returns:
        A sorted list of the unique internal URLs found in links, with
        ``#fragment`` parts removed. The start URL itself is included only
        if some crawled page links to it.
    """
    visited = set()
    to_visit = {base_url}
    unique_urls = set()
    domain = urlparse(base_url).netloc

    print(f"Starting to crawl the website: {base_url}")

    while to_visit:
        current_url = to_visit.pop()
        if current_url in visited:
            continue
        visited.add(current_url)
        print(f"Visiting URL: {current_url}")

        try:
            response = requests.get(current_url, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Error crawling {current_url}: {e}")
            continue

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code} for {current_url}")
            continue

        # Relative links must be resolved against the page that contains
        # them (after any redirect), not against the crawl's start URL.
        page_url = response.url or current_url

        # The parser receives raw bytes and detects the encoding itself.
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.find_all('a', href=True):
            try:
                full_url = urljoin(page_url, link['href'])
                # "page#a" and "page#b" are the same document; dropping the
                # fragment avoids fetching and listing it more than once.
                full_url = full_url.split('#', 1)[0]
                is_internal = urlparse(full_url).netloc == domain
            except ValueError:
                # A malformed href (e.g. a broken IPv6 literal) should not
                # abort the whole crawl.
                continue
            if is_internal and full_url not in visited:
                to_visit.add(full_url)
                unique_urls.add(full_url)

    print(f"Finished crawling. Found {len(unique_urls)} unique internal URLs.")
    return sorted(unique_urls)

In [None]:
def organize_urls_hierarchically(urls):
    """Expand each URL into one row of cumulative path levels.

    For a URL whose path is ``/a/b`` the levels are ``"/a"`` and ``"/a/b"``.
    Rows are padded with empty strings so that every row has as many level
    columns as the deepest path, followed by the URL itself.

    Args:
        urls: Iterable of absolute URL strings.

    Returns:
        A list of tuples ``(level_1, ..., level_n, url)`` ordered by the
        URLs split on ``/``. URLs with an empty path (the site root) are
        skipped. The list is empty when no URL has a non-empty path.
    """
    # Parse every URL once and keep only those with a non-empty path.
    entries = []
    for url in sorted(urls, key=lambda u: u.split('/')):
        path = urlparse(url).path.strip('/')
        if path:
            entries.append((url, path.split('/')))

    # default=0 keeps this from raising when only root URLs (or nothing)
    # were supplied.
    max_depth = max((len(parts) for _, parts in entries), default=0)

    hierarchy = []
    for url, parts in entries:
        levels = ["/" + "/".join(parts[:depth]) for depth in range(1, len(parts) + 1)]
        # Pad shallow paths so all rows share the same number of columns.
        levels.extend([""] * (max_depth - len(levels)))
        hierarchy.append((*levels, url))

    return hierarchy

In [None]:
def export_to_excel(hierarchy):
    """Write the URL hierarchy to ``website_urls.xlsx`` and return its path.

    Args:
        hierarchy: List of equal-length tuples whose last item is the URL
            and whose preceding items are its navigation levels.

    Returns:
        The file name of the workbook, written to the current working
        directory. An empty ``hierarchy`` produces a sheet that contains
        only the ``Current URL Address`` header.
    """
    output_path = "website_urls.xlsx"

    # default=0 lets an empty hierarchy produce a header-only sheet
    # instead of raising ValueError.
    max_levels = max((len(entry) - 1 for entry in hierarchy), default=0)
    columns = [f'Navigation Level {i+1}' for i in range(max_levels)] + ['Current URL Address']
    df = pd.DataFrame(hierarchy, columns=columns)

    # ExcelWriter accepts a path, so no in-memory buffer is needed.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='URLs')

    return output_path

In [None]:
# 🔧 Input URL and run everything
base_url = input("Enter the base URL (e.g., https://example.com): ").strip()

# Require a full scheme prefix: a bare "http" check would also accept input
# such as "httpfoo", which cannot be fetched.
if base_url.startswith(("http://", "https://")):
    urls = get_website_urls(base_url)
    # URLs with an empty path are dropped when the hierarchy is built, so the
    # hierarchy can be empty even though some URLs were found.
    hierarchy = organize_urls_hierarchically(urls) if urls else []
    if hierarchy:
        print(f"✅ Found {len(hierarchy)} URLs.")
        file_path = export_to_excel(hierarchy)
        print("📥 Download the Excel file below:")
        display(FileLink(file_path))
    else:
        print("⚠️ No URLs found.")
else:
    print("❌ Please enter a valid URL starting with 'http' or 'https'.")