<a href="https://colab.research.google.com/github/NOTGOD6000/Web-crawler-Email-Scraper-/blob/main/web%20crawler%20email%20scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import csv

# Seed URL: the first page handed to the crawler
start_url = "https://www.punjabiuniversity.ac.in/pages/Courses.aspx?Id=18"

# Crawl budget
max_depth = 3    # pages deeper than this (start page = depth 0) are not fetched
max_pages = 100  # upper bound on how many URLs may be recorded in `visited`

# State shared by the worker threads; hold `lock` while touching either set
emails_found = set()   # every e-mail address collected so far
visited = {start_url}  # URLs already claimed for crawling, seeded with the start page
lock = threading.Lock()

def extract_emails_and_links(url, base_url):
    """
    Download one page and pull out its e-mail addresses and same-site links.

    Args:
        url: Address of the page to fetch.
        base_url: Reference URL of the crawl. Only links whose host (netloc)
            equals the host of this URL are returned.

    Returns:
        A tuple ``(emails, links)`` of two sets of strings. If the request or
        the parsing fails, whatever was gathered up to that point (usually
        nothing) is returned; no ordinary exception propagates to the caller.
    """
    emails_local = set()
    links_local = set()
    try:
        site_host = urlparse(base_url).netloc  # same for every link, so compute once

        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract emails from the page text. The domain part is matched as
        # dot-separated labels so a sentence-ending period ("x@y.com.") is
        # not captured as part of the address.
        text = soup.get_text()
        email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
        emails_local.update(re.findall(email_pattern, text))

        # Relative hrefs are relative to the page they appear on (after any
        # redirect), not to the crawl's reference URL.
        page_url = response.url
        for link in soup.find_all("a", href=True):
            try:
                parsed = urlparse(urljoin(page_url, link["href"]))
            except ValueError:
                # One malformed href (e.g. a broken IPv6 literal) must not
                # discard the remaining links of the page.
                continue

            # Only follow internal links (same host as the reference URL)
            if parsed.netloc == site_host:
                # "#fragment" only names a position inside the same document,
                # so drop it to avoid queueing the same page repeatedly.
                links_local.add(parsed._replace(fragment="").geturl())
    except Exception:
        # Deliberate best-effort: a page that cannot be fetched or parsed
        # (timeout, connection error, unparseable markup, ...) is skipped so
        # that a single bad page never aborts the whole crawl.
        pass
    return emails_local, links_local

def crawl(url, base_url, depth):
    """
    Process one page: collect its e-mail addresses and choose the links to visit next.

    Args:
        url: Page to fetch.
        base_url: Reference URL of the crawl; forwarded to
            extract_emails_and_links and copied into every follow-up task.
        depth: Link distance of ``url`` from the start page (start page = 0).

    Returns:
        A list of ``(link, base_url, depth + 1)`` tuples, one per newly claimed
        link. The list is empty when ``depth`` exceeds ``max_depth``, when the
        next level would exceed it, or when the ``max_pages`` budget is used up.

    Side effects:
        Adds the page's addresses to ``emails_found`` and the claimed links to
        ``visited``, both under ``lock``.
    """
    if depth > max_depth:
        return []

    # Network and parsing work happens outside the lock
    emails_local, links_local = extract_emails_and_links(url, base_url)

    next_depth = depth + 1
    new_links = []
    with lock:
        emails_found.update(emails_local)

        # Links one level past max_depth would be rejected by the guard above
        # as soon as they were crawled, so do not mark them visited or
        # schedule no-op tasks for them.
        if next_depth <= max_depth:
            for link in links_local:
                if len(visited) >= max_pages:
                    break  # page budget exhausted; `visited` only grows
                if link not in visited:
                    visited.add(link)
                    new_links.append((link, base_url, next_depth))
    return new_links

def main():
    """
    Run the crawl level by level on a pool of ten worker threads.

    Each round submits every queued ``(url, base_url, depth)`` task to the
    pool, waits until all of them have finished, and uses the links they
    return as the queue for the next round. The crawl ends when a round
    produces no new links.
    """
    frontier = [(start_url, start_url, 0)]  # tasks waiting for the next round
    with ThreadPoolExecutor(max_workers=10) as pool:
        while frontier:
            # Hand the whole current level to the pool
            pending = [pool.submit(crawl, *task) for task in frontier]
            frontier = []

            # Gather follow-up tasks as the workers finish
            for finished in as_completed(pending):
                frontier.extend(finished.result())

if __name__ == "__main__":
    main()

    # Write the collected addresses to a one-column CSV, sorted alphabetically
    if not emails_found:
        print("⚠️ No emails found.")
    else:
        print(f"✅ Found {len(emails_found)} emails. Saving to emails.csv ...")
        with open("emails.csv", "w", newline="", encoding="utf-8") as out_file:
            rows = csv.writer(out_file)
            rows.writerow(["Email"])  # header row
            rows.writerows([address] for address in sorted(emails_found))
        print("✅ Saved emails to emails.csv")


✅ Found 207 emails. Saving to emails.csv ...
✅ Saved emails to emails.csv


In [5]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Seed URL handed to the crawler (the root URL of the site)
start_url = "https://www.punjabiuniversity.ac.in/"

# Crawl budget
max_depth = 2   # pages deeper than this (start page = depth 0) are not fetched
max_pages = 30  # upper bound on how many URLs may be recorded in `visited`

# Serialises access to the two shared sets below across worker threads
lock = threading.Lock()

# Shared crawl state
emails_found = set()   # every e-mail address collected so far
visited = {start_url}  # URLs already claimed for crawling, seeded with the start page

# Placeholder kept for a possible future extension; the code below never uses it
tasks = []

def extract_emails_and_links(url, base_url, depth):
    """
    Fetches a web page and extracts all emails and internal links.

    Args:
        url: Address of the page to fetch.
        base_url: Reference URL of the crawl. Only links whose host (netloc)
            equals the host of this URL count as internal.
        depth: Not used by this function; accepted so existing callers that
            pass it keep working.

    Returns:
        - emails_local: A set of email addresses found
        - links_local: A set of internal URLs found
        If the request or the parsing fails, whatever was gathered up to that
        point (usually nothing) is returned; no ordinary exception propagates.
    """
    emails_local = set()
    links_local = set()
    try:
        internal_host = urlparse(base_url).netloc  # identical for every link

        # Send HTTP GET request to fetch the page
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract text and search for emails using regex. The domain is
        # matched label by label so that a sentence-ending period
        # ("x@y.com.") does not become part of the address.
        text = soup.get_text()
        emails_local.update(re.findall(
            r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+", text))

        # Relative hrefs must be resolved against the page that contains
        # them (its final URL after redirects), not against the crawl's
        # reference URL.
        page_url = response.url

        # Extract all <a href=""> links
        for link in soup.find_all("a", href=True):
            try:
                # Convert to absolute URL and split into components
                parsed = urlparse(urljoin(page_url, link["href"]))
            except ValueError:
                # A single malformed href must not cost the page its
                # remaining links.
                continue

            # Only consider internal links (same host)
            if parsed.netloc == internal_host:
                # Strip "#fragment": it points inside the same document, so
                # keeping it would queue one page under several URLs.
                links_local.add(parsed._replace(fragment="").geturl())
    except Exception:
        # Deliberate best-effort: skip pages that raise while being fetched
        # or parsed (e.g. timeout, connection error). NOTE: the HTTP status
        # is not checked, so an error page such as a 404 is still parsed.
        pass
    return emails_local, links_local

def crawl(url, base_url, depth):
    """
    Crawls the specified URL:
    - Extracts emails and new internal links
    - Returns a list of new links to crawl (if within limits)

    Args:
        url: Page to fetch.
        base_url: Reference URL of the crawl; passed on to
            extract_emails_and_links and copied into each follow-up task.
        depth: Link distance of ``url`` from the start page (start page = 0).

    Returns:
        A list of ``(link, base_url, depth + 1)`` tuples for links claimed by
        this call. Empty when ``depth`` is beyond ``max_depth``, when the next
        level would be, or when ``max_pages`` URLs are already recorded.

    Side effects:
        Updates the shared ``emails_found`` and ``visited`` sets while holding
        ``lock``.
    """
    if depth > max_depth:
        return []  # Don't go beyond the allowed depth

    # Fetch and parse outside the lock so other workers are not blocked
    emails_local, links_local = extract_emails_and_links(url, base_url, depth)

    child_depth = depth + 1
    new_links = []
    with lock:
        # Safely update the global emails set
        emails_found.update(emails_local)

        # Children deeper than max_depth would hit the guard at the top and
        # return immediately, so neither record them as visited nor schedule
        # them.
        if child_depth <= max_depth:
            for link in links_local:
                if len(visited) >= max_pages:
                    break  # budget spent; `visited` never shrinks
                if link not in visited:
                    visited.add(link)
                    new_links.append((link, base_url, child_depth))
    return new_links

def main():
    """
    Coordinates the crawl, one link level per round, on a ten-thread pool.

    Every round submits all queued ``(url, base_url, depth)`` tasks, waits for
    the whole batch to complete, and collects the links returned by the
    workers as the next round's queue. Returns once a round yields no links.
    """
    queue = [(start_url, start_url, 0)]  # first round: just the start URL
    with ThreadPoolExecutor(max_workers=10) as pool:
        while queue:
            # Dispatch the entire current level
            batch = [pool.submit(crawl, *job) for job in queue]
            queue = []

            # Build the next level from the results, in completion order
            for done in as_completed(batch):
                queue.extend(done.result())

# Script entry point: run the crawl, then report what it collected
if __name__ == "__main__":
    main()

    # Print the addresses one per line in alphabetical order
    if not emails_found:
        print("⚠️ No emails found.")
    else:
        print("✅ Emails found:")
        print(*sorted(emails_found), sep="\n")


✅ Emails found:
adminwebsite@pbi.ac.in
admissions@pbi.ac.in
bhiminderpbi@gmail.com
brar_jas@yahoo.co.in
campus.dehla@gmail.com
campus.ralla@gmail.com
camspup7@gmail.com
ccsr.pup@gmail.com
cdeispbi@yahoo.com
ce2013pup@yahoo.com
cepwd.pup@gmail.com
coemrampuraphul@gmail.com
coordinator_nss@yahoo.com
daapup@pbi.ac.in
daljit_ahluwalia@yahoo.com
dbs@pbi.ac.in
dbsskdn@gmail.com
dcs@pbi.ac.in
dean.alumnirelations@gmail.com
deanalumni@pbi.ac.in
deanandheadlaw@gmail.com
deanartsandculture.pup@gmail.com
deanstudentswelfare6@gmail.com
deepskandhala@yahoo.co.in
departmenthistory2015@gmail.com
diapupadmission@pbi.ac.in
director@pbi.ac.in
director@wscpedia.org
director_dias@pbi.ac.in
directorthhmpup@gmail.com
directoryouthwelfaredept@gmail.com
dirsports@pbi.ac.in
dispup2016@gmail.com
dispup2016@pbi.ac.in
dpm@pbi.ac.in
edcellpup@gmail.com
edrcbti@gmail.com
eich@pbi.ac.in
emmrc.patiala@gmail.com
etranscript@pbi.ac.in
foreignlanguages@pbi.ac.in
gurmatsangeetchair@pbi.ac.in
gurmatsangeetonline@pbi.ac.in