# Task 1: Setting up Your Scraper

### Importing the packages

In [4]:
pip install BeautifulSoup4

Defaulting to user installation because normal site-packages is not writeable

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: /Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Load the packages
# The beautifulsoup4 distribution is imported as the `bs4` package;
# the BeautifulSoup class is the name the later cells call directly
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [7]:
# Defining the url of the site
url = 'https://kr1s7on.github.io/KRYJ-Hotel/about.html'

# Making a GET request and printing the status code (200 means the page was served).
# requests applies no timeout by default, so one is set explicitly to keep
# an unresponsive server from hanging this cell forever
response = requests.get(url, timeout=10)
print(f"Status Code: {response.status_code}")

Status Code: 200


In [8]:
# Extracting the HTML
# (.content is the raw response body; requests exposes it as bytes)
html = response.content

# Checking that the reply is indeed HTML by inspecting the first 100 bytes
html[:100]



### Making the soup

In [None]:
# Convert HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using the default parser as it is included in Python
# BeautifulSoup is a class, so it must be called with parentheses (not subscripted with [])
soup = BeautifulSoup(html, "html.parser")

### Exporting the HTML to a file

In [None]:
# It is extremely useful to be able to check this file when searching where some info is located
# or to see how the document was parsed

# Exporting the HTML to a file
# prettify() returns bytes when it is given an encoding, which is what the
# binary ('wb') file mode expects
with open('about.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))




# the 'with' statement is shorthand for a 'try-finally' block
# open is function for opening/creating a file to edit
# the 'wb' argument signifies the mode in which to edit the file - Writing in Bytes format
# .prettify() modifies the HTML code with additional indentations for better readability

# Task 2: Searching and navigating the HTML tree

## Task 2A: Searching - find() and find_all()

In [None]:
# The soup variable (BeautifulSoup object) we defined earlier can be seen as representing the whole document



In [None]:
# We can search by tag name
# This returns the element together with all its contents and nested elements



In [None]:
# If there is no result it returns None
# Note: None is not displayed in IPython unless print() or repr() is used



In [None]:
# Display the None value



In [None]:
# verify the type of output



In [None]:
# .find() returns only the first such result



In [None]:
# If we want all the results we use find_all() 




In [None]:
# find_all returns a list of all results



In [None]:
# We must be careful when using find_all()
# If no result is found it returns an empty list



In [None]:
# How many links are on the page?




In [None]:
# Usually, we prefer to store the result in a variable
# Let's store the body of a table in a table variable



In [None]:
# Inspect the value of the variable



In [None]:
# Inspect the type of the variable



In [None]:
# A tag can be searched in the same way we search the whole document



In [None]:
# Since we used find_all, the result is a list



## Task 2B: Navigating the tree

In [None]:
# A tag's children are stored in a list, accessed with .contents



In [None]:
# How big is the table content?



In [None]:
# Get the first element in the table content




In [None]:
# We can also go up the tree with .parent



In [None]:
# table.parent is also a tag
# Thus, we can use .parent on it as well



In [None]:
# We use .parent to go up the tree
# But what about .children?



In [None]:
# If we want a list of an element's children, we need to use table.contents as shown before
# .children is an iterator over that list, 
# which means we can use it in a for loop to iterate over all the children





# Task 3: Searching by attributes

In [None]:
# We can search for tags based on their attributes, in addition to their name



In [None]:
# There are two ways in which we can do that:

### Passing attributes as function parameters

In [None]:
# By writing them as function parameters
# Notice that since class is a reserved word, we write class_



In [None]:
# We can filter against multiple attributes at once



### Placing the attributes in a dictionary

In [None]:
# By writing the attributes in a dictionary



In [None]:
# Find the div whose id is 'footer'




# Task 4: Extracting data from the HTML tree

In [None]:
# Let's use some placeholder object to manipulate in the examples below



In [None]:
# We can obtain the name of the tag with the .name attribute



## Getting the attribute value

In [None]:
# We can access a tag’s attributes by treating the tag just like a dictionary

In [None]:
# First way



In [None]:
# Notice how multi-valued attributes, such as class, return a list



In [None]:
# Second way



In [None]:
# Again, class returns a list



#### Differences between these methods manifest when the key is missing

In [None]:
# tag['missing-key'] returns an error
# a['id'] will raise an error, if uncommented

In [None]:
# tag.get('missing-key') returns a default value None



In [None]:
# We can use repr() function to display all special characters and combinations (None, \n...)



In [None]:
# We can also get all attribute name-value pairs in a dictionary



## Extracting the text

### .string vs .text

In [None]:
# We can access the raw string of an element by using .string



In [None]:
# Alternatively we can use .text



#### They exhibit different behaviour when the element contains more than one distinct string

In [None]:
# This paragraph has many nested elements, with lots of different fragments of text



In [None]:
# .text returns everything inside the element



In [None]:
# .string returns None when there is more than 1 string



In [None]:
# We can stack different operations one after the other



In [None]:
# semi-properly displayed text



In [None]:
# We can also use .get_text() instead of .text



In [None]:
# We can also extract the whole text of the webpage
# CAUTION: This includes JavaScript text, CSS and other not directly displayed text


### .strings and .stripped_strings

In [None]:
# All strings inside an element can be accessed separately by using the .strings iterator

In [None]:
# The extra whitespace can be removed by using the .stripped_strings iterator instead



# Practical Example 1

## Links - absolute path URL

In [None]:
# Let's use the variable links we defined a couple of lectures ago for this example
# It contains all the 'a' tags on this page



In [None]:
# Let's choose one link to manipulate



In [None]:
# Get the link's text



In [None]:
# Extract the link's URL



In [None]:
# This is a relative URL
# To obtain the absolute URL address we will use urljoin




In [None]:
# Now we need the address of the current page + the relative URL to compute the full-path URL



## Processing multiple links at once

In [None]:
# We will work with:



In [None]:
# Examining the link's addresses
# Note that if anchor['href'] was written instead of anchor.get('href'), this would produce an error
[anchor.get('href') for anchor in links]

In [None]:
# Notice that some links don't have URL (None appears)

# Dropping the links without href attribute



In [None]:
# Obtaining the relative URLs



In [None]:
# Transforming to absolute path URLs



In [None]:
# Extracting only URLs pointing to Wikipedia (internal URLs)



# Task 5: Extracting data from nested tags

In [None]:
# Our objective now is to extract all links that can be found under a section heading
# Marked as 'Main article:' or 'See also:'
# By quick inspection, we see that these are contained in div tags with attribute 'role' set to 'note'




In [None]:
# We can apply find() and find_all() to a tag in the same way we do it to the whole document



In [None]:
# A naive approach to get all links would be to use find



In [None]:
# However, some divs have more than 1 link



In [None]:
# This div has 6 links in it



In [None]:
# Therefore we need to use find_all
# Let's use a for loop

# Define initially empty list of links


    
    # Need to add every link from anchors to div_links

    
    
    # Can use div_links.extend(anchors) instead of the for loop
    

In [None]:
# We now have a complete list



In [None]:
# Let's get the URLs



# Task 6: Scraping multiple pages automatically - Extracting all the text from the note URLs

In [None]:
# We will use the links we obtained above



In [None]:
# The objective is to get all the useful text from those wikipedia pages

# We will do that by extracting all text contained in a paragraph element,
# for all paragraphs on a page,
# for all pages (in note_urls)

In [None]:
# initialize list to store paragraph text for each webpage
# (kept index-aligned with note_urls: entry k always belongs to the k-th URL)
par_text = []

# Loop through each URL in note_urls.
# enumerate supplies the 1-based counter used in the progress messages, and the
# loop variable has its own name so the notebook-level `url` is not overwritten
for page_number, note_url in enumerate(note_urls, start=1):

    # connect to every webpage (the timeout keeps an unresponsive server from hanging the loop)
    note_resp = requests.get(note_url, timeout=10)

    # checking if the request is successful
    if note_resp.status_code != 200:            # Something is wrong!
        print('Status code {0}: Skipping URL #{1}: {2}'.format(note_resp.status_code, page_number, note_url))
        # Store an empty placeholder for the skipped page; without it every later
        # entry would shift and be paired with the wrong URL when zipped with note_urls
        par_text.append([])
        continue

    # Everything is OK! Print the iteration number and the URL to keep track of place in loop
    print('URL #{0}: {1}'.format(page_number, note_url))

    # get HTML from webpage
    note_html = note_resp.content

    # convert HTML to BeautifulSoup object
    # (same built-in parser as used for `soup`, so no extra parser package has to be installed)
    note_soup = BeautifulSoup(note_html, 'html.parser')

    # find all "p" tags on the webpage
    note_pars = note_soup.find_all("p")

    # Get the text from each "p" tag
    text = [p.text for p in note_pars]

    # Append text from each "p" tag to our list, par_text
    par_text.append(text)


In [None]:
# Inspecting the result for the first page
# (the first entry of par_text is the list of paragraph strings from the first page scraped)
par_text[0]

In [None]:
# We see that we have a list of all paragraph strings
# It would be more useful to have all the text as one string, not as a list of strings

# Merging all paragraphs of the first page into one long string.
# It gets its own name because the following cell binds page_text to a list
# (one string per page); reusing that name here for a single string would
# break the later cells whenever this cell is re-run after them
first_page_text = "".join(par_text[0])
first_page_text

In [None]:
# Let's do that for all pages

# Collapse each page's list of paragraph strings into one string per page
page_text = ["".join(paragraphs) for paragraphs in par_text]

# Inspect the result for some webpage
page_text[0]

In [None]:
# Inspect result
# (index 4 is the fifth entry of page_text; print() shows the line breaks instead of the escaped repr)
print(page_text[4])

In [None]:
# Creating a dictionary with the (key,value) pairs being (url,text)
# zip() silently stops at the shorter input, so first check that there is exactly
# one text per URL; otherwise texts would end up attached to the wrong URLs
assert len(note_urls) == len(page_text), "note_urls and page_text are out of step"
url_to_text = dict(zip(note_urls, page_text))  # You don't need to know the specifics of these functions

In [None]:
# Look up the scraped text of one specific page by its URL
# (raises KeyError if this URL is not one of the keys of url_to_text)
print(url_to_text['https://en.wikipedia.org/wiki/Music_theory'])

In [None]:
# A word of caution:
# We have not extracted all of the main content's text,
# as some text may be contained in lists and tables, outside of paragraphs we scraped