# Got a website?
## Get a list of all the links
▶ First, go to the "Website url and output file" form below.\
▶ Set site = your website's url.\
▶ Remember to download the output file from "Files" in the menu to the left.

In [None]:
#@title Website url and output file
#@markdown Anything you can get to within the page you specify, so you probably want your home page

# date lets us append today's date to the file name;
# urlparse lets us pull the host name out of the site url
from datetime import date
from urllib.parse import urlparse
today = date.today()


def describe_site(url):
    """Return a short label for url's host, safe to embed in a file name.

    Only the host name is inspected, so schemes, ports, paths and query
    strings never leak into the label. For hosts with three or more
    dot-separated labels the second label is used
    ('https://www.google.com' -> 'google'); shorter hosts use the first
    label ('https://google.com' -> 'google', 'http://localhost' ->
    'localhost'). Falls back to 'site' when no host can be found.
    """
    # urlparse only recognises a host when the url has a scheme or starts with "//"
    host = urlparse(url if "://" in url else "//" + url).hostname or ""
    labels = [label for label in host.split(".") if label]
    if not labels:
        return "site"
    return labels[1] if len(labels) >= 3 else labels[0]


site = 'https://www.google.com' #@param{type: 'string'}

#@markdown overwrite will overwrite your output file if it exists.
overwrite = True #@param {type:"boolean"}
#@markdown ---
#@markdown The following you can leave as is or specify a name.\
#@markdown  \
#@markdown * **default_file_name** - will look like urls_2024-01-27.txt and have today's date
#@markdown * **descriptor_file_name** - will look like urls_google_2024-01-27.txt and include the date and your site's domain or subdomain
site_descriptor = describe_site(site)
default_file_name = 'urls_'+str(today)+".txt" #@param {type:"raw"}
# the "_" before the date keeps the name in the documented urls_google_2024-01-27.txt shape
descriptor_file_name = 'urls_'+site_descriptor+'_'+str(today)+".txt" #@param {type:"raw"}
output_file_name = descriptor_file_name # @param ["urls.txt", "default_file_name", "descriptor_file_name"] {type:"raw"}

print("The following will change to your selection when you run the notebook:")
print("selected file name:", output_file_name)

The following will change to your selection when you run the notebook:
selected file name: urls_google2024-01-27.txt


---
# Everything below is already set up. You don't need to change it, but you can look at the outputs for information.

In [None]:
#@title import packages
from bs4 import BeautifulSoup
import requests


In [None]:
#@title define methods
def response_message(response):
    """Print a short, plain-language explanation of a response's status code.

    Args:
        response: Any object with a ``status_code`` attribute, such as the
            value returned by ``requests.get``.

    Returns:
        None. The explanation is written to standard output.
    """
    # Status codes that have a dedicated explanation.
    explanations = {
        200: ": Response OK",
        403: ": Forbidden - crawling is likely not allowed on this domain.",
        301: ": Moved permanently - the page has been moved.",
        401: ": Unauthorized - you may need to log in or provide credentials for this page.",
        404: ": Not found - the server can't find this page, may be gone or 'unpublished'.",
        500: ": Internal server error - generic error message. Something unexpected may have happened. You should be able to try again soon.",
    }

    code = response.status_code
    explanation = explanations.get(code)
    if explanation is None:
        # Anything without a dedicated explanation is reported generically.
        print("Some other code:", code)
        return
    print(code, explanation)

def scrape(site, timeout=30):
    """Download a page and collect the targets of its links.

    The response's status is reported through response_message, and the
    body is parsed for links whatever that status is.

    Args:
        site: URL of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to requests.get.

    Returns:
        A (stubs, urls) tuple of lists. stubs holds every href that starts
        with "/"; urls holds every other href. Both keep page order and
        contain no duplicates.
    """
    # Without a timeout an unresponsive server would hang the notebook forever.
    r = requests.get(site, timeout=timeout)
    response_message(r)
    s = BeautifulSoup(r.text,"html.parser")
    urls=[]
    stubs=[]
    seen = set()
    for i in s.find_all("a"):
        try:
            href = i.attrs['href'].strip()
        except (KeyError, AttributeError) as e:
            print("Exception", e, ". Ignoring a tag that's not a link:", i)
            # Skip this tag: otherwise href would be unbound (first anchor)
            # or still hold the previous anchor's value.
            continue
        if href in seen:
            continue
        seen.add(href)
        if href.startswith("/"):
            stubs.append(href)
        else:
            urls.append(href)

    return stubs, urls


In [None]:
#@title get all the links
# Fetch the page at `site` and collect its links: `stubs` receives the hrefs
# that start with "/", `urls` receives every other href. The status line
# printed under this cell comes from response_message, which scrape calls.
stubs, urls = scrape(site)

200 : Response OK


In [None]:
#@title write links to a file
# Replace the file's contents when `overwrite` is set; otherwise add to the end of it.
if overwrite:
    mode = 'w'
else:
    mode = 'a'

# Stubs are written first, then the remaining urls, one link per line.
with open(output_file_name, mode) as f:
    f.writelines(str(link) + "\n" for link in stubs + urls)


In [None]:
#@title read file
#@markdown If you don't see a list of urls printed below, there may have been an error, check the code displayed in "get all the links."
# Load every line of the output file first, then echo them once the file is closed.
with open(output_file_name, 'r') as f:
    lines = list(f)

for line in lines:
    # strip() drops the trailing newline so print doesn't double-space the list
    print(line.strip())

/preferences?hl=en
/advanced_search?hl=en&authuser=0
/intl/en/ads/
/services/
/intl/en/about.html
/intl/en/policies/privacy/
/intl/en/policies/terms/
https://www.google.com/imghp?hl=en&tab=wi
https://maps.google.com/maps?hl=en&tab=wl
https://play.google.com/?hl=en&tab=w8
https://www.youtube.com/?tab=w1
https://news.google.com/?tab=wn
https://mail.google.com/mail/?tab=wm
https://drive.google.com/?tab=wo
https://www.google.com/intl/en/about/products?tab=wh
http://www.google.com/history/optout?hl=en
https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ
