<a href="https://colab.research.google.com/github/MajkelStaniszewski/leetcode-and-mini-projects/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
from bs4 import BeautifulSoup as bs

In [None]:
# Load the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")
# Convert to BeautifulSoup object
soup = bs(r.content)
# Print html
print(soup)

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>



In [None]:
# find and find_all()
first_header = soup.find("h2")
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# Pass in a list of elements to look for
headers = soup.find_all(["h2","h1"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# You can pass in attributes to find/find_all function
paragraph = soup.find_all("p",attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# You can nest find/find_all calls
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [None]:
# We can search for specific strings in our find/find_all calls
import re
paragraphs = soup.find_all("p", string=re.compile("Some"))
headers = soup.find_all("h2",string = re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

##select (CSS selector)

In [None]:
content = soup.select("div p")
#content
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [None]:
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
paragraphs = soup.select("body > p")
print(paragraphs)

for paragraph in paragraphs:
  print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [None]:
soup.select("[align = middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of the HTML


In [None]:
header = soup.find("h2")
header.string

div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [None]:
# Get a specific property from an element
link = soup.find("a")
link["href"]

paragraphs = soup.select('p#paragraph-id')
paragraphs[0]['id']


'paragraph-id'

## Code navigation


In [None]:
# Path syntax
print(soup.body.div.h1.string)
print('\n Body: \n' + soup.body.prettify())

HTML Webpage

 Body: 
<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
# Know the terms: Parent, Sibling, Child

soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Exercises!


In [4]:
# Load the webpage
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

# Convert to a beautiful soup object
webpage = bs(r.content)

# Print out HTML
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

### Grab all of the social links from the webpage

In [None]:
links = webpage.select('ul.socials a')

all_links = [link['href'] for link in links]
all_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
links = webpage.find("ul", attrs = {"class":"socials"})

x = links.find_all("a")

jp = list(x)

for dobre_chlopaki in jp:
  print(dobre_chlopaki['href'])


https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [None]:
links = webpage.select("li.social a")
lista = [link['href'] for link in links]
#x = list(links)
#lista = []
#for i in x:
#  lista.append(i['href'])
lista

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape the webpage


In [24]:
import pandas as pd

table = webpage.select("table.hockey-stats")[0]
columns = table.find("thead").find_all("th")
columns_name = [c.string for c in columns]

table_rows = table.find("tbody").find_all("tr")
l =[]
for tr in table_rows:
  td = tr.find_all('td')
  row = [str(tr.text).strip() for tr in td]
  l.append(row)
l
df = pd.DataFrame(l, columns=columns_name)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [27]:
df.loc[df['Team'] != 'Did not play']

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


### Grab all fun facts that use word "is"

In [41]:
import re

facts = webpage.select("ul.fun-facts li")
facts = [fact.find(string = re.compile("is")) for fact in facts]
facts = [fact.find_parent().get_text() for fact in facts if fact] # NONE - is a false condition
facts

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Download an image

In [49]:
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(url +"webpage.html")

webpage_2 = bs(r.content)
images = webpage_2.select('div.row div.column img')

image_url = images[0]['src']
full_url = url + image_url
full_url

'https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg'

In [52]:
img_data = requests.get(full_url).content
with open('como.jpg','wb') as handler:
  handler.write(img_data)

# It does not work in Google Colab

### Mystery Message Challenge!

In [5]:
files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]
url = "https://keithgalli.github.io/web-scraping/"
count = 0
for f in relative_files:
  full_url = url + relative_files[count]
  print(f"Current URL: {full_url}")
  page = requests.get(full_url)
  bs_page = bs(page.content)
  secret_word_element = (bs_page.find('p',attrs = {'id':"secret-word"}))
  secret_word = secret_word_element.string
  print(secret_word)
  count += 1


Current URL: https://keithgalli.github.io/web-scraping/challenge/file_1.html
Make
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_2.html
sure
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_3.html
to
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_4.html
smash
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_5.html
that
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_6.html
like
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_7.html
button
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_8.html
and
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_9.html
subscribe
Current URL: https://keithgalli.github.io/web-scraping/challenge/file_10.html
!!!
