## Webpage used here: https://keithgalli.github.io/web-scraping/example.html

In [1]:
import requests
from bs4 import BeautifulSoup as bs

## Load our first page

In [2]:
# Fetch the example page. The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
# Fail fast on an HTTP error instead of silently parsing an error page.
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly avoids
# GuessedAtParserWarning and keeps the parse tree the same on every machine.
soup = bs(r.content, "html.parser")

# Print out our HTML, indented one tag per line.
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# find() and find_all()

In [3]:
# find() returns only the first matching tag: the opening <h2>, its contents and the closing tag.
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [4]:
# find_all() returns a list with every <h2> tag in the document.
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [5]:
# Passing a list matches any of the tag names: this gives the first header that is
# either an h1 or an h2. The order of the names inside the list does not matter.
header = soup.find(["h1", "h2"])
print(header)

<h1>HTML Webpage</h1>


In [6]:
# With find_all() the same list returns every header, be it h1 or h2.
header = soup.find_all(["h1", "h2"])
print(header)

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


### find() and find_all() using attributes

In [7]:
# Keep only the paragraphs whose id attribute is "paragraph-id".
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [8]:
# find() calls can be chained to walk down the tree: body -> div -> h1.
# Note that `body` ends up holding the <h1> tag, not the <body> tag.
body = soup.find("body").find("div").find("h1")
body

<h1>HTML Webpage</h1>

In [9]:
# Print the document again so its structure is visible next to the queries that follow.
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Paragraphs containing some substrings

In [10]:
# Exact-text variant, kept for reference:
# print(soup.find_all("p", string="Some bold text"))

# find()/find_all() also accept a compiled regular expression for ``string``,
# which lets us match on a substring of the tag's text.
import re

some_pattern = re.compile("Some")
header_pattern = re.compile("(H|h)eader")

# All the paragraphs whose text contains the substring "Some".
paragraphs = soup.find_all("p", string=some_pattern)
print(paragraphs)
print("\n")
# All the h2 headers whose text contains either "Header" or "header".
headers = soup.find_all("h2", string=header_pattern)
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


[<h2>A Header</h2>, <h2>Another header</h2>]


# Select CSS

In [11]:
# CSS descendant selector: every <a> inside a <p> inside a <div> inside <body>.
content = soup.select("body div p a")
content

[<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]

In [12]:
# Select every <b> nested inside the paragraph whose id is "paragraph-id".
bold_text = soup.select("p#paragraph-id b")
print(bold_text)

[<b>Some bold text</b>]


## `select_one()` behaves like `find()` (first match only); `select()` returns all the matches. See https://stackoverflow.com/questions/38028384/beautifulsoup-difference-between-find-and-select

In [13]:
# "body > p" matches only the <p> tags that are direct children of <body>.
# The first paragraph on the page is nested inside the <div>, so it is NOT selected.

paragraphs = soup.select("body > p")
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [14]:
# For each selected paragraph, print the list of <i> tags it contains
# (an empty list when the paragraph has none).
for paragraph in paragraphs:
    italics = paragraph.select("i")
    print(italics)

[<i>Some italicized text</i>]
[]


In [15]:
# Attribute selector: grab every element whose align attribute equals "middle".
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# Get different properties of HTML

In [16]:
# Get the text inside the first h2 tag.
# NOTE: .string is only reliable when the tag holds a single piece of text;
# when there are several child tags inside the tag, use get_text() instead.
header = soup.find("h2")
header.string

'A Header'

### Get text recursively with `get_text()` (for when `.string` does not work)

In [17]:
div = soup.find("div")
# If the tag has multiple child elements, use get_text() to collect all of their text.
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



### GET Specific properties from an element.

In [18]:
# Attributes of a tag are read with dictionary-style indexing.
link = soup.find("a")
link["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [19]:
# Use [] to read any attribute of a tag: href, align, id, etc.
test = soup.find("div", attrs={"align": "middle"})
test["align"]

'middle'

## Path Syntax 

In [20]:
# Attribute-style navigation: walk from the soup to its div, then to the h1 inside it.
soup.div.h1.get_text()

'HTML Webpage'

In [21]:
# The same path written out from <body>; .string works here because the h1 holds only text.
soup.body.div.h1.string

'HTML Webpage'

# parent, child and siblings

In [22]:
# Print just the <body> subtree, which is the part navigated in the cells below.
print(soup.body.prettify()) 

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [23]:
# NOTE: Here the next sibling of the div is the first h2, the parent of the div is body, and the children of body are the div, the h2 headers and the paragraphs.

In [24]:
# Siblings: every element that follows the div at the same level of the tree.
print(soup.body.find("div").find_next_siblings())

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [25]:
# The singular forms return one element. The div is the first child of <body>,
# so it has no previous sibling and the second call prints None.
print(soup.body.find("div").find_next_sibling())
print(soup.body.find("div").find_previous_sibling())

<h2>A Header</h2>
None


In [26]:
# find_parent() returns the element that encloses the div, which here is <body>.
print(soup.body.find("div").find_parent())

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>


In [27]:
# find_all_next() returns every tag that appears after the div's opening tag in the
# document, so the div's own descendants (h1, p, a) are included as well.
print(soup.body.find("div").find_all_next())

[<h1>HTML Webpage</h1>, <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>, <h2>A Header</h2>, <p><i>Some italicized text</i></p>, <i>Some italicized text</i>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>, <b>Some bold text</b>]


# Another website

In [28]:
# Load the webpage content. The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
# Fail fast on an HTTP error instead of silently parsing an error page.
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly avoids
# GuessedAtParserWarning and keeps the parse tree the same on every machine.
webpage = bs(r.content, "html.parser")


### Grab all social links from this webpage

In [29]:
# Approach 1: one CSS selector for every <a> inside the <ul class="socials"> list.
links = webpage.select("ul.socials a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [30]:
# Approach 2: locate the <ul class="socials"> with find(), then collect the anchors inside it.
ulist = webpage.find("ul", attrs={"class": "socials"})
links = ulist.find_all("a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [31]:
# Approach 3: select the anchors through their <li class="social"> list items.
links = webpage.select("li.social a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Grab the table

In [35]:
# Locate the stats table by its class.
# Alternate way: table = webpage.select("table.hockey-stats")[0]
table = webpage.find("table", attrs={"class": "hockey-stats"})
print(table)

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"

In [None]:
import pandas as pd

# select_one() returns the first match or None, so a missing table is reported
# clearly instead of surfacing as an IndexError from select(...)[0].
table = webpage.select_one("table.hockey-stats")
assert table is not None, "table.hockey-stats was not found on the page"

# Column labels come from the <th> cells of the header row. get_text() yields plain
# str values, whereas .string gives NavigableString objects (or None for a <th>
# that contains nested tags).
# NOTE: several labels (GP, G, A, TP, PIM, +/-) occur twice, once for the regular
# season and once for the postseason, so the resulting frame has duplicate column names.
columns = table.find("thead").find_all("th")
column_names = [th.get_text() for th in columns]

# One list of whitespace-stripped cell texts per <tr> of the table body.
table_rows = table.find("tbody").find_all("tr")
rows = [
    [cell.get_text().strip() for cell in tr.find_all("td")]
    for tr in table_rows
]

df = pd.DataFrame(rows, columns=column_names)
df.head()
