# In [3]:  (Jupyter notebook cell prompt, kept as a comment so the file stays valid Python)
# Importing required libraries
import scrapy  # Scrapy framework for scraping
from scrapy.spiders import CrawlSpider  # CrawlSpider class for crawling multiple pages
import re  # Regular expressions library to identify patterns (emails in this case)
from scrapy_selenium import SeleniumRequest  # Scrapy-Selenium to handle JavaScript websites
from selenium.webdriver.common.by import By  # Used for finding elements on a page
from selenium.webdriver.support import expected_conditions as EC  # Used to wait until page loads

# Defining the class that will handle the email extraction
class EmailExtractor(CrawlSpider):
    # Give your spider a name so you can run it with this name later
    name = 'email_extractor'

    # This is the constructor, it gets called when you create an object from the class
    def __init__(self, *args, **kwargs):
        # Call the parent class's constructor
        super(EmailExtractor, self).__init__(*args, **kwargs)
        # List to store the found email addresses
        self.email_list = []
        # Add the websites you want to scrape for emails here
        self.urls = ["https://vitbhopal.ac.in/"
        ]

    # This function starts the scraping process
    def start_requests(self):
        # Loop through each URL in the list of websites
        for url in self.urls:
            # For each URL, create a request to visit the page using Selenium to handle JavaScript
            yield SeleniumRequest(
                url=url,  # Website URL
                callback=self.parse,  # Once the page loads, call the 'parse' function to process it
                wait_until=EC.presence_of_element_located((By.TAG_NAME, "html")),  # Wait until the page is fully loaded
                dont_filter=True  # Allow revisiting the same domain if needed
            )

    # This function processes the page and extracts the emails
    def parse(self, response):
        # Define a regular expression pattern to match email addresses
        EMAIL_REGEX = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
        
        # Use 're.finditer' to find all the email addresses on the page
        emails = re.finditer(EMAIL_REGEX, str(response.text))
        
        # Loop through the found emails and add them to the email list
        for email in emails:
            self.email_list.append(email.group())  # 'email.group()' gives the full email text

        # Yield (return) each unique email found, removing duplicates using 'set()'
        for email in set(self.email_list):
            yield {
                "email": email  # Return the email in dictionary format, where 'email' is the key
            }

        # Clear the email list after processing each page to avoid duplicating emails across pages
        self.email_list.clear()


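# Cell output: ModuleNotFoundError: No module named 'scrapy_selenium'
# The 'scrapy_selenium' module is provided by the 'scrapy-selenium' package; install it (e.g. `pip install scrapy-selenium`) before running this cell.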