In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

def is_valid_url(url):
    """Checks if the URL has a scheme (http/https) and netloc."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def crawl_websites(urls_list):
    """
    Iterates through a list of URLs and crawls them.
    """
    # Set a User-Agent to act like a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for url in urls_list:
        print(f"\n{'='*60}")
        print(f"[*] Starting Crawl: {url}")
        print(f"{'='*60}")

        try:
            # 1. Fetch the content
            response = requests.get(url, headers=headers, timeout=10)
            
            if response.status_code != 200:
                print(f"[!] Failed to retrieve {url} (Status: {response.status_code})")
                continue

            # 2. Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # 3. Extract links
            links = soup.find_all('a', href=True)
            unique_links = set()

            for link in links:
                href = link['href']
                # Convert relative paths (/about) to absolute (https://site.com/about)
                full_url = urljoin(url, href)
                
                if is_valid_url(full_url):
                    unique_links.add(full_url)

            # 4. Output results
            print(f"[*] Success! Found {len(unique_links)} unique links:\n")
            
            # Print only the first 20 links to keep Jupyter clean (remove slice to see all)
            for i, link in enumerate(list(unique_links)): 
                print(f" {i+1}. {link}")
            
            if len(unique_links) > 20:
                print(f"\n... and {len(unique_links) - 18} more links.")

        except requests.exceptions.RequestException as e:
            print(f"[!] Error connecting to {url}: {e}")
        
        # Be polite to servers
        time.sleep(1)

# ==========================================
# INPUT: Define your websites here
# ==========================================
websites_to_crawl = [
    "https://www.buroojinstitute.org",
    "https://azaan.com.pk/"
]

# Run the crawler
crawl_websites(websites_to_crawl)


[*] Starting Crawl: https://www.buroojinstitute.org
[*] Success! Found 37 unique links:

 1. https://forms.gle/yfAT8vX1TRvSzbRs8
 2. https://www.linkedin.com/company/burooj-institute/
 3. https://buroojinstitute.org
 4. https://www.buroojinstitute.org/about-us/
 5. https://buroojinstitute.org/donation/
 6. https://www.instagram.com/buroojinstituteofficial/
 7. https://docs.google.com/forms/d/e/1FAIpQLSdI8-NaeQWK59JggCpm92g8AAagZFXq5sBBrSFcuueEeHByzw/closedform
 8. https://forms.gle/LmidwGyk1E5gxM2s7
 9. https://buroojinstitute.org/burooj-instructors/
 10. https://buroojinstitute.org/basic-tajweed-course-female/
 11. https://buroojinstitute.org/workshops/
 12. https://www.buroojinstitute.org/upcoming-program/
 13. https://forms.gle/6ZDXzjjqeeQ6fayg8
 14. https://www.buroojinstitute.org/deen-foundations/
 15. https://buroojinstitute.org/alim-alimah-winter-online-2025/
 16. https://buroojinstitute.org/alim-alimah-winter-onsite-2025/
 17. https://forms.gle/xjWVHgsnnL2yBCa28
 18. https://b