In [1]:
import re
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
def scrape(domain, initial_url, max_depth, timeout=10):
    """Crawl a site and write every same-domain URL found to a text file.

    Starting from ``initial_url``, each successfully fetched page is searched
    for links in three places: ``<a href>`` anchors, ``url(...)`` references
    inside linked stylesheets, and quoted absolute (or protocol-relative)
    URLs inside external scripts. Every discovered URL whose network location
    equals ``domain`` is recorded and then crawled itself, one level deeper.

    Args:
        domain: Network location (e.g. ``'www.rit.edu'``) that a URL must
            have in order to be recorded and followed.
        initial_url: URL the crawl starts from. It is fetched, but it is only
            recorded if a crawled page links back to it.
        max_depth: Deepest recursion level that is still fetched; the initial
            URL is depth 0.
        timeout: Seconds to wait on each HTTP request before giving up on
            that URL.

    Returns:
        None. The recorded links are written, one per line and sorted, to
        ``recorded_links.txt`` in the current working directory.
    """
    links_recorded = set()   # same-domain URLs discovered so far
    links_followed = set()   # stylesheet/script URLs that were already fetched

    # Compiled once because they are applied to every stylesheet / script.
    css_reference_pattern = re.compile(r'url\((.*?)\)')
    javascript_url_pattern = re.compile(r'["\']((?:https?:)?\/\/[^"\']+)["\']')

    def fetch_text(url):
        """Return the body of ``url``, or None if the request did not succeed."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # An unreachable URL must not abort the crawl and lose the results.
            return None
        # 200 = successful GET request
        return response.text if response.status_code == 200 else None

    def resolve(base_url, reference):
        """Return ``reference`` made absolute against ``base_url``, or None if malformed."""
        try:
            absolute_url = urljoin(base_url, reference)
            # Strings scraped from CSS/JS are not always valid URLs; parsing
            # here surfaces the ValueError so callers can simply skip them.
            urlparse(absolute_url)
        except ValueError:
            return None
        return absolute_url

    def record_and_follow(base_url, reference, depth):
        """Record ``reference`` if it is a new same-domain URL, then crawl it."""
        absolute_url = resolve(base_url, reference)
        if absolute_url is None:
            return
        # netloc is the domain component of the parsed URL
        if urlparse(absolute_url).netloc == domain and absolute_url not in links_recorded:
            links_recorded.add(absolute_url)
            recursive_search(absolute_url, depth + 1)

    def recursive_search(current_url, depth):
        """Fetch ``current_url`` and follow the links found in it and its assets."""
        if depth > max_depth:
            return

        page_text = fetch_text(current_url)
        if page_text is None:
            return
        parsed_page = BeautifulSoup(page_text, 'html.parser')

        # Links in the HTML itself
        for tag in parsed_page.find_all('a'):
            href = tag.get('href')
            if href is not None:
                record_and_follow(current_url, href, depth)

        # Links referenced from the page's stylesheets
        for css_file in parsed_page.find_all('link', {'rel': 'stylesheet'}):
            href = css_file.get('href')
            if href is None:
                continue
            stylesheet_url = resolve(current_url, href)
            if stylesheet_url is None or stylesheet_url in links_followed:
                continue
            links_followed.add(stylesheet_url)
            css_content = fetch_text(stylesheet_url)
            if css_content is None:
                continue
            for raw_reference in css_reference_pattern.findall(css_content):
                # url(...) may wrap its argument in quotes and whitespace.
                reference = raw_reference.strip().strip('\'"')
                # Relative references in CSS are relative to the stylesheet.
                record_and_follow(stylesheet_url, reference, depth)

        # Links embedded in external scripts (those with a src attribute)
        for script_tag in parsed_page.find_all('script', {'src': True}):
            script_url = resolve(current_url, script_tag['src'])
            if script_url is None or script_url in links_followed:
                continue
            links_followed.add(script_url)
            script_content = fetch_text(script_url)
            if script_content is None:
                continue
            # No regex finds URLs in JS perfectly; this one was kept because
            # it produced the fewest false positives.
            for javascript_url in javascript_url_pattern.findall(script_content):
                record_and_follow(current_url, javascript_url, depth)

    recursive_search(initial_url, 0)

    with open('recorded_links.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(sorted(links_recorded)))

# Test: crawl www.rit.edu from its home page, following links up to depth 3
scrape(domain='www.rit.edu', initial_url='http://www.rit.edu', max_depth=3)

http://www.rit.edu#main-content
http://www.rit.edu/request-information
http://www.rit.edu/request-information#main-content
http://www.rit.edu/visit
http://www.rit.edu/admissions/apply
http://www.rit.edu/giving/
http://www.rit.edu/about-rit
http://www.rit.edu/uniquely-rit
http://www.rit.edu/history-rit
http://www.rit.edu/university-leadership
http://www.rit.edu/rochester-ny
http://www.rit.edu/careers
http://www.rit.edu/contact
http://www.rit.edu/campuses
http://www.rit.edu/rankings-and-recognition
http://www.rit.edu/discover-rit
http://www.rit.edu/faces
http://www.rit.edu/academics
http://www.rit.edu/study/undergraduate
http://www.rit.edu/study/graduate
http://www.rit.edu/study/combined-accelerated-bachelors-masters
http://www.rit.edu/colleges
http://www.rit.edu/new-economy
http://www.rit.edu/study/immersions-and-minors
http://www.rit.edu/individualized-study
http://www.rit.edu/undeclared-options
http://www.rit.edu/areas-of-study
http://www.rit.edu/calendar
http://www.rit.edu/experienti