# 1. Text extraction basics

In [50]:
from IPython.core.display import display, HTML

In [51]:
html_contents = "<html>Hello</html>"

In [52]:
def render(html):
    display(HTML(html))

In [53]:
render(html_contents)

## Next...maybe title?

In [54]:
title = "<title>Data Science is Fun</title>"

In [55]:
html_contents = f"<html>{title}Hello</html>"
render(html_contents)

**Our output is identical to what we’ve seen before!** The title does not appear in the body of the rendered HTML. Instead, it only appears in the title-bar of the web browser.

In [56]:
header = "<h1>Data Science is Fun</h1>"
body = f"<body>{header}Hello</body>"
html_contents = f"<html> {title} {body}</html>"
render(html_contents)

In [64]:
paragraphs = ''
for i in range(2):
    paragraph_string = f"Paragraph {i} " * 40
    paragraphs += f"<p>{paragraph_string}</p>"

body = f"<body>{header}{paragraphs}</body>"
html_contents = f"<html> {title} {body}</html>"
render(html_contents)

In [68]:
paragraphs = ''
for i in range(2):
    paragraph_string = f"Paragraph {i} " * 40
    attribute = f"id='paragraph {i}'"
    paragraphs += f"<p {attribute}>{paragraph_string}</p>"

body = f"<body>{header}{paragraphs}</body>"
html_contents = f"<html> {title} {body}</html>"

In [69]:
link_text = "Data Science Bookcamp"
url = "https://www.manning.com/books/data-science-bookcamp"
hyperlink = f"<a href='{url}'>{link_text}</a>"
new_paragraph = f"<p id='paragraph 2'>Here is a link to {hyperlink}</p>"
paragraphs += new_paragraph
body = f"<body>{header}{paragraphs}</body>"
html_contents = f"<html> {title} {body}</html>"
render(html_contents)

#### Add some libraries...

In [70]:
libraries = ['NumPy', 'Scipy', 'Pandas', 'Scikit-Learn']

In [71]:
items = ''
for library in libraries:
    items += f"<li>{library}</li>"

In [72]:
unstructured_list = f"<ul>{items}</ul>"
header2 = '<h2>Common Data Science Libraries</h2>'
body = f"<body>{header}{paragraphs}{header2}{unstructured_list}</body>"
html_contents = f"<html> {title} {body}</html>"
render(html_contents)

In [74]:
div1 = f"<div id='paragraphs' class='text'>{paragraphs}</div>"
div2 = f"<div id='list' class='text'>{header2}{unstructured_list}</div>"
div3 = "<div id='empty' class='empty'></div>"
body = f"<body>{header}{div1}{div2}{div3}</body>"
html_contents = f"<html> {title}{body}</html>"
print(html_contents)

<html> <title>Data Science is Fun</title><body><h1>Data Science is Fun</h1><div id='paragraphs' class='text'><p id='paragraph 0'>Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 </p><p id='paragraph 1'>Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragra

In [75]:
split_contents = html_contents.split('>')
for i, substring in enumerate(split_contents):
    if substring.endswith('<title'):
        next_string = split_contents[i + 1]
        title = next_string.split('<')[0]
        print(title)
        break

Data Science is Fun


# Parsing HTML using Beautiful Soup

In [76]:
from bs4 import BeautifulSoup as bs

In [77]:
# Parsowanie html
soup = bs(html_contents)

In [78]:
print(soup.prettify())

<html>
 <head>
  <title>
   Data Science is Fun
  </title>
 </head>
 <body>
  <h1>
   Data Science is Fun
  </h1>
  <div class="text" id="paragraphs">
   <p id="paragraph 0">
    Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0
   </p>
   <p id="paragraph 1">
    Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Parag

## BS data access

What if you need to extract title from html? The soup object provide that acces throught its fint method

In [84]:
title = soup.find('title')
title

<title>Data Science is Fun</title>

The outputted title appears to be an HTML string that’s demarcated by the title tags. **However, our title variable is not a string.** Rather, it’s an initialized Beautiful Soup Tag class. We can verify by printing type(title)

In [85]:
print(type(title))

<class 'bs4.element.Tag'>


In [90]:
h1 = soup.find("h1")

In [93]:
h2 = soup.find("h2")

Each Tag object contains a text attribute, which maps to the text within the tag. Thus, printing title.text will return Data Science is Fun.

In [94]:
title.text

'Data Science is Fun'

In [95]:
h1.text

'Data Science is Fun'

In [96]:
h2.text

'Common Data Science Libraries'

We’ve accessed our title tag by running soup.find('title'). Additionally, we can access that same tag simply by running soup.title. Therefore, running soup.title.text will return a string that’s equal to title.text.

This listing Accessing the title’s text attribute from soup

In [99]:
assert soup.title.text == title.text

In [104]:
body = soup.body
body

<body><h1>Data Science is Fun</h1><div class="text" id="paragraphs"><p id="paragraph 0">Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 </p><p id="paragraph 1">Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 

In [106]:
body.p.text

'Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 '

In [107]:
soup.p.text

'Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 '

## Accesing bullet points in body

In [110]:
print([bullet.text for bullet in body.find_all("li")])

['NumPy', 'Scipy', 'Pandas', 'Scikit-Learn']


The find and find_all methods allow us to search the elements by tag-type, and also by attribute. Suppose we wish to access an element with a unique id of x. In order to search on that attribute id, we simply need to execute find(id=x). With this in mind, lets output the text of the final paragraph, whose assigned id is paragraph 2.

## Accesing links in paragraph

In [116]:
paragraph_2 = soup.find(id='paragraph 2')
print(paragraph_2.text)

Here is a link to Data Science Bookcamp


The contents of paragraph_2 include a web link to Data Science Bookcamp. The actual url is stored within the href attribute. Beautiful Soup permits us to access any attribute using the get method. Thus, running paragraph_2.get(id) will return paragraph 2. Subsequently, running paragraph_2.a.get(href) will return the url. Below, we’ll print that url.

In [118]:
assert paragraph_2.get('id') == 'paragraph 2'
print(paragraph_2.a.get('href'))

https://www.manning.com/books/data-science-bookcamp


Common Beautiful Soup Methods
soup = bs(html_contents): Initializes a BeautifulSoup object by from the HTML elements within the parsed html_contents).

* soup.prettify(): Returns the parsed HTML document in a clean, easily-readable format.

* title = soup.title: Returns a Tag object associated with the title element of a parsed document.

* title = soup.find('title'): Returns a Tag object associated with the title element of a parsed document.

* tag_object = soup.find('element_tag'): Returns a Tag object associated with the first HTML element that is demarcated by the specified element_tag tag.

* tag_objects = soup.find_all('element_tag'): Returns a list of all Tag objects that are demarcated by the specified element_tag tag.

* tag_object = soup.find(id='unique_id'): Returns a Tag object that contains the specified unique id attribute.

* tag_objects = soup.find_all('element_tag', class_='category_class'): Returns a list of Tag objects that are demarcated by the specified element_tag tag, and also contain the specified class attribute.

* tag_object = soup.new_tag('element_tag'): Creates a new Tag object, whose HTML element type is specified by element tag.

* tag_object.decompose(): Deletes the Tag object from soup.

* tab_object.apped(tag_object2): Given two Tag objects, tag_object and `tag_object2, this method inserts tag_object2 into tag_object.

tag_object.text: Returns all visible text within a Tag object.

tag_object.get('attribute'): Returns an HTML attribute that has been assigned to the Tag object.

## Downloading and Parsing Online Data

The Beautiful Soup library allows to easily parse, analyze, and edit HTML documents. In most cases, these documents must be downloaded directly from the web. Lets briefly review the procedure for downloading HTML files, using Python’s built-in urllib module. We’ll start by importing the urlopen function from urllib.request.

In [144]:
from urllib.request import urlopen,Request

**Note
The urlopen function is sufficient when downloading a single HTML document from a single, unsecured online page. However, for more complicated downloads, you should consider using the external Requests library. requests.readthedocs.io**

#### Add fake user agent

In [146]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

In [160]:
reg_url = "https://www.manning.com/books/data-science-bookcamp"
req = Request(url=reg_url, headers=headers)
html_contents = urlopen(req).read()

In [161]:
print(html_contents[:1000])

b'\n<!DOCTYPE html>\n<!--[if lt IE 7 ]> <html lang="en" class="no-js ie6 ie"> <![endif]-->\n<!--[if IE 7 ]>    <html lang="en" class="no-js ie7 ie"> <![endif]-->\n<!--[if IE 8 ]>    <html lang="en" class="no-js ie8 ie"> <![endif]-->\n<!--[if IE 9 ]>    <html lang="en" class="no-js ie9 ie"> <![endif]-->\n<!--[if (gt IE 9)|!(IE)]><!--> <html lang="en" class="no-js"><!--<![endif]-->\n<head>\n    <title>Manning | Data Science Bookcamp</title>\n    \n\n\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=0">\n<meta name="application-name" content="Data Science Bookcamp"/>\n<meta name="apple-mobile-web-app-title" content="Data Science Bookcamp"/>\n\n<meta property="og:title" content="Data Science Bookcamp"/>\n<meta name="twitter:title" content="Data Science Bookcamp"/>\n\n<meta name="twitter:site" content="&#64;manningbook

## Extracting messy data by Beutiful Soup

#### Extracting title

In [162]:
soup = bs(html_contents)

In [164]:
print(soup.title.text)

Manning | Data Science Bookcamp


#### Extracting table of contents

In [186]:
for division in soup.find_all('div', class_='sect1 available'):
    result = (division.text.replace('\n\n\n', '\n'))
print(result)


17  Case Study 4 Solution
17.1 Overview
17.2 Extracting Skill Requirements from Job Posting Data

17.2.1 Exploring the HTML for Skill Descriptions

17.3 Filtering Jobs by Relevance
17.4 Clustering Skills in Relevant Job Postings

17.4.1 Grouping the Job Skills into 15 Clusters
17.4.2 Investigating the Technical Skill Clusters
17.4.3 Investing the Soft-Skill Clusters
17.4.4 Exploring Clusters at Alternative Values of K
17.4.5 Analyzing the 700 Most-Relevant Postings

17.5 Conclusion
17.6 Key Takeaways

