In [1]:
import requests
from bs4 import BeautifulSoup

# Seed list of Pune engineering college home pages.
# fetch_college_data() iterates over this list in order and requests each URL.
college_urls = [
    "http://www.coep.org.in",  # College of Engineering Pune
    "https://www.viit.ac.in",  # Vishwakarma Institute of Information Technology
    "http://www.mitpune.edu.in",  # MIT Pune
    # Add more college websites here
]

# Function to fetch and parse the page using BeautifulSoup
def fetch_page_content(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to request.

    Returns:
        The parsed ``BeautifulSoup`` document when the server answers with
        HTTP 200; ``None`` for any other status code or when the request or
        parsing raises. In the ``None`` cases a message is printed instead of
        raising, so a single bad site does not abort the caller's loop.
    """
    try:
        # The timeout keeps one unresponsive server from stalling the crawl.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve {url} - Status Code: {response.status_code}")
            return None

        # Parse the raw bytes instead of ``response.text``: when the
        # Content-Type header carries no charset, requests falls back to
        # ISO-8859-1 for text/* bodies, which garbles UTF-8 pages before
        # BeautifulSoup gets a chance to honour the document's own
        # <meta charset> declaration.
        content_type = response.headers.get('content-type', '')
        # Only trust the header encoding when the server stated it explicitly.
        declared_encoding = response.encoding if 'charset' in content_type.lower() else None
        return BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)
    except Exception as e:
        # Deliberately broad: this is a best-effort crawl, so report and move on.
        print(f"Error occurred while fetching {url}: {e}")
        return None

# Crawl results: maps each college URL whose page was fetched successfully to
# the list of "<counter>: <href>" strings found on that page.
index = {}
# Next number to assign to a link; shared across all sites and advanced by
# fetch_college_data() so numbering continues from one website to the next.
global_link_counter = 1  # Global counter to index all links across websites

# Function to fetch and index data from each college website
def fetch_college_data():
    """Crawl every URL in ``college_urls`` and record the links found on it.

    For each site whose page could be fetched, every ``<a>`` tag whose
    ``href`` starts with ``'http'`` is numbered with the module-level
    ``global_link_counter``, printed, and stored as ``"<n>: <href>"`` in
    ``index[url]``. Sites that could not be fetched get no ``index`` entry;
    a notice is printed for them instead.
    """
    global global_link_counter
    for url in college_urls:
        print(f"Fetching data from: {url}")
        page = fetch_page_content(url)

        # Nothing to index for this site; report it and go on to the next one.
        if not page:
            print(f"Could not retrieve content from {url}")
            continue

        anchors = page.find_all('a')
        print(f"Links found on {url}:")

        numbered_links = []
        for anchor in anchors:
            href = anchor.get('href')
            # Skip anchors without an href and anything not starting with 'http'
            # (relative paths, mailto:, javascript:, fragments, ...).
            if not href or not href.startswith('http'):
                continue

            entry = f"{global_link_counter}: {href}"
            numbered_links.append(entry)
            print(entry)
            global_link_counter += 1

        # Store this site's numbered links under its URL.
        index[url] = numbered_links

# Crawl every configured college site; this fills `index` as a side effect.
fetch_college_data()

# Assemble the per-site summary first, then emit it in a single write.
report_lines = ["\nIndex of all links found:"]
for college, links in index.items():
    report_lines.append(f"{college}:")
    report_lines.extend(f"  {link}" for link in links)
print("\n".join(report_lines))


Fetching data from: http://www.coep.org.in
Links found on http://www.coep.org.in:
1: https://www.coep.org.in/departments/planning
2: http://www.coep.org.in/ccf/
3: http://www.coep.org.in/content/laboratories0
4: http://www.coep.org.in/departments/mechanical/notices
5: https://www.youtube.com/live/LAfI59jQn4A
6: https://aicte-qip-hpc-coeptech.netlify.app/
7: https://accounts.google.com/InteractiveLogin/signinchooser?continue=https%3A%2F%2Fdocs.google.com%2Fforms%2Fd%2Fe%2F1FAIpQLSd9q3IsTCxdJl9MC_hPK0AYUBELmGC65CYd9SwtFXDSEICIUA%2Fviewform%3Fusp%3Dsf_link&followup=https%3A%2F%2Fdocs.google.com%2Fforms%2Fd%2Fe%2F1FAIpQLSd9q3IsTCxdJl9MC_hPK0AYUBELmGC65CYd9SwtFXDSEICIUA%2Fviewform%3Fusp%3Dsf_link&ltmpl=forms&osid=1&passive=1209600&service=wise&ifkv=AdF4I76fD_sEDuxfGE77uspdSJ9fWpsCwVZmAMfUK7noRKneoO9NhgZqpkGP55j2A3soOOwiBahI&ddm=0&flowName=GlifWebSignIn&flowEntry=ServiceLogin
8: https://docs.google.com/forms/d/e/1FAIpQLSe2tueL93nrX-TfZcBIfDC3G2B7y64u9jnAb_jXBv2_DRk1UA/viewform?usp=sf_link
9: