**HTML and XML Parsing with Beautiful Soup**

Jo, Eun Seo for ESUDH2018

In [None]:
!pip install bs4
!pip install requests

In [None]:
from bs4 import BeautifulSoup as BS
import requests

In [None]:
# URL of the article to download (query string kept exactly as in the original link).
address = 'https://www.nytimes.com/2018/07/17/world/europe/trump-putin-summit.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region&region=top-news&WT.nav=top-news'
# requests has no default timeout: without one, an unresponsive server blocks this cell forever.
nyt_article = requests.get(address, timeout=30)
# Displaying the Response shows its status code, e.g. <Response [200]>.
nyt_article

In [None]:
#.text is the response body decoded to a str, i.e. the page's raw HTML (expect a very long output)
nyt_article.text

In [None]:
#This transforms your data into a tree structure of several objects. You only really need to know 3 of them.
#1. BS Object
soup = BS(nyt_article.text, 'html.parser') #for XML, pass "xml" (or "lxml-xml") here; plain "lxml" selects lxml's HTML parser. Both require the lxml package.
#The BeautifulSoup object represents the whole document; its .name is the special value '[document]'
soup.name

In [None]:
#prettify() returns the parse tree as one indented string, with each tag and each string on its own line
print(soup.prettify())

In [None]:
#2. tags
#A Tag object corresponds to one HTML/XML element of the parsed document.

#The two strings below are sample markup kept for illustration only;
#they are not referenced anywhere else in the visible notebook.
tag_example ="""
<script>
   window.NYT_ENVIRONMENT = 'prd';
      window.NYT_RELEASE = '74e1ecdde8d6e7ded6c2dcf316c97e426c9f7f32';
      window.JKIDD_PATH = 'https://a.nytimes.com/svc/nyt/data-layer';
      window.WEDDINGS_PATH = 'https://content.api.nytimes.com';
      window.GDPR_PATH = 'https://us-central1-nyt-wfvi-prd.cloudfunctions.net/gdpr-email-form';
      window.SENTRY_SAMPLE_RATE = 10;
</script>
"""

tag_example2 ="""
<title data-rh="true">
   A Besieged Trump Says He Misspoke on Russian Election Meddling - The New York Times
  </title>
"""


#Using a tag name as an attribute returns the FIRST tag with that name (same as soup.find('script'))
soup.script

In [None]:
#tags have attributes
#A tag can be indexed like a dict of its attributes; a missing attribute raises KeyError
#(use soup.title.get('data-rh') to get None instead).
soup.title['data-rh']

In [None]:
#.attrs is the complete attribute dict of a tag -- here the first <link> tag in the document
soup.link.attrs

In [None]:
#A small standalone HTML snippet: a single <p> element whose class attribute holds two class names
s = """
<p class="css-1i0edl6 e2kc3sl0">At one point during his remarks, the TV lights in the Cabinet Room, where Mr. Trump was meeting with lawmakers, switched off, plunging the room into gloomy shadows. “Whoops, they just turned off the lights,” Mr. Trump joked. “That must be the intelligence agencies.”</p>
"""

In [None]:
#Parse the snippet on its own; this tree is independent of the article's `soup`
example = BS(s, "html.parser")

In [None]:
#prettify makes it easier to see the tree structure: every tag and string gets its own indented line
print(example.prettify())

In [None]:
#class is a multi-valued attribute, so Beautiful Soup returns a list of class names rather than one string
example.p['class']

In [None]:
#3. NavigableString: the text inside a tag

In [None]:
#A second single-<p> snippet; note that this rebinds `s` from the earlier cell
s = """
<p class="css-1i0edl6 e2kc3sl0">Even as he walked back his remarks, Mr. Trump repeated his assertion that there was no evidence of collusion between his campaign and the Russians. That line was scribbled in black marker onto a typewritten sheet of remarks on the table before him.</p>
"""

In [None]:
#Re-parse: `example` now refers to the tree built from the new snippet
example = BS(s, "html.parser")
print(example.prettify())

In [None]:
#.string is the tag's text as a NavigableString when the tag has exactly one child;
#it is None when the tag has several children (e.g. text mixed with nested tags)
example.p.string

In [None]:
#It's a TREE!

In [None]:
#find_all searches all descendants and returns the matches as a ResultSet, which behaves like a list
len(soup.find_all('div')) #find all tags: this returns a list

In [None]:
#Calling find_all on soup.body limits the search to the descendants of the <body> tag
len(soup.body.find_all('div')) #what does this mean?

In [None]:
#Index 10 is the eleventh <div> in document order; this raises IndexError if the page has fewer
soup.body.find_all('div')[10] #because it's a list, you can access by index

In [None]:
#You can access children with .contents
#.contents is a list of the tag's DIRECT children only (both tags and strings)
soup.body.find_all('div')[10].contents

In [None]:
#Displaying a tag shows its full markup, including everything nested inside it
soup.body.find_all('div')[7] #What does this tree look like?

In [None]:
#Number of direct children; the "5" below was observed when the notebook was written and may differ on the live page
len(soup.body.find_all('div')[7].contents) #This guy had 5 children

In [None]:
#Children of the first child, i.e. grandchildren; assumes that first child is a tag (a string child has no .contents)
soup.body.find_all('div')[7].contents[0].contents

In [None]:
#all of your children's children's children... Use the .descendants generator
#list() consumes the generator so that every nested tag and string is displayed
list(soup.body.find_all('div')[10].descendants)

In [None]:
#compare with the one-generational difference
#.contents is already a list, so list() here only copies it (kept to mirror the cell above)
list(soup.body.find_all('div')[10].contents)

There are other ways to navigate the tree, such as going sideways (to your siblings) or upwards (to your parents and ancestors), but these are used less frequently. See the Beautiful Soup documentation online for details.

In [None]:
#find_all filters: Here are just a few most useful filters you can use for the find_all function

In [None]:
#First, let's get a set of all tag names in our soup
#find_all(True) matches every tag (but no text strings); the set comprehension de-duplicates the names
set_tags = {tag.name for tag in soup.body.find_all(True)}
set_tags

In [None]:
#list of tag names (OR): a tag matches if its name equals ANY item in the list
soup.body.find_all(['rect', 'button'])

In [None]:
#you can search by attribute
#A keyword argument that is not one of find_all's own parameters is treated as a filter on that attribute
soup.body.find_all(id="Canvas")

In [None]:
#True matches every tag that HAS an id attribute, whatever its value
soup.body.find_all(id=True)

In [None]:
#by CSS class; the keyword is class_ (trailing underscore) because `class` is a reserved word in Python
soup.body.find_all(class_="css-vz7hjd")

In [None]:
#by string: returns the matching text strings themselves (not tags); the text must equal "Europe" exactly
soup.body.find_all(string="Europe")

In [None]:
#by regex!
import re
#Beautiful Soup applies a compiled pattern with its search() method, so any string that
#contains "Putin" matches; the surrounding .* are not strictly required for that
putin = re.compile('.*Putin.*')

In [None]:
#With only string= given, the result is the list of matching text strings
soup.body.find_all(string=putin)

In [None]:
#Combined with a tag name, the result is the <p> TAGS whose .string matches the pattern
soup.body.find_all("p", string=putin)

In [None]:
#For this NYT article, let's grab just the body text

In [None]:
#A multi-class string passed to class_ only matches when the class attribute is exactly that
#string (same classes, same order). A CSS selector matches every <p> carrying both classes,
#regardless of their order or of any additional classes.
text_tags = soup.body.select("p.css-1i0edl6.e2kc3sl0")

In [None]:
#.string is None for any paragraph that contains nested tags (links, emphasis, ...), which
#would silently drop that paragraph's text. get_text() joins all text inside the tag instead.
strings_only = [paragraph.get_text() for paragraph in text_tags]
strings_only

Web-crawling with BS

In [None]:
#Start page whose outgoing links are listed in the following cells
our_website = "https://www.modernhoney.com/"

In [None]:
#requests has no default timeout: without one, an unresponsive server blocks this cell forever
r = requests.get(our_website, timeout=30)

In [None]:
#Parse the downloaded page with Python's built-in HTML parser
html = BS(r.text, "html.parser")

In [None]:
#href=True keeps only the <a> tags that actually have an href attribute; an anchor
#without one (e.g. <a name="top">) would make link['href'] raise KeyError.
for link in html.find_all('a', href=True):
    print(link['href'])

In [None]:
#requests has no default timeout: without one, an unresponsive server blocks this cell forever
huffington = requests.get("https://www.huffingtonpost.com/entry/best-movies-of-2018-so-far_us_5b3ba0e0e4b07b827cbb64c0?guccounter=1", timeout=30)

In [None]:
#Parse the downloaded article page into a Beautiful Soup tree
huffington_bs = BS(huffington.text, "html.parser")

In [None]:
#Print the whole page as an indented tree to look for the tags that hold the article (long output)
print(huffington_bs.prettify())

In [None]:
#List-like ResultSet of every <article> tag on the page (empty if there is none)
article = huffington_bs.find_all("article")

In [None]:
#Inspect the first <article> only; raises IndexError if the page contained no <article> tag
print(article[0].prettify())

In [None]:
#A multi-class string passed to class_ matches only when the class attribute is exactly this
#string (same classes in the same order), so divs with reordered or extra classes are not found
intro = article[0].find_all("div", class_="content-list-component yr-content-list-text text")

In [None]:
#Collect the text of the first child of every intro <div>.
#The loop variable has a real name instead of `_`: in IPython `_` holds the last cell output,
#and by convention `_` marks a value that is NOT used.
string_intro = []
for block in intro:
    print(block)
    children = block.contents
    if not children:
        #an empty <div> has no first child; skip it rather than raise IndexError
        continue
    #.string may be None when the first child itself contains several children
    string_intro.append(children[0].string)

In [None]:
#Display the collected strings (an entry is None where .string could not resolve to a single string)
string_intro

In [None]:
#Every <div> with the CSS class listicle__slide-caption inside the first <article>
listicle = article[0].find_all("div", class_="listicle__slide-caption")

In [None]:
#Collect the full text of each slide caption and show the text of every link inside it.
string_only = []
for item in listicle:
    #get_text() instead of .string: a caption that contains links has several children,
    #and .string is None in that case
    caption = item.get_text()
    string_only.append(caption)
    print(caption)
    for hyperlink in item.find_all("a"):
        print(hyperlink.get_text())