In [1]:
# If the webpage (HTML) has a table inside, we can easily extract it with Pandas and requests.
import pandas as pd
import requests

In [2]:
url = 'https://www.worldcoinindex.com/'
crypto_url = requests.get(url)
crypto_url

<Response [200]>

In [3]:
body = crypto_url.text

In [4]:
# Body now consists of full HTML source code of our webpage. Now if the HTML source has a table which is marked by the HTML tag <table></table> 
# (this tag is used for defining a table in HTML) Pandas uses read_html() to extract the table from the HTML document.

# *** Make sure has table to look nice in pandas

crypto_data = pd.read_html(body)
print(type(crypto_data))
print(len(crypto_data))

<class 'list'>
1


In [5]:
# now know list with one element in it - so grab it all
crypto_data = crypto_data[0]
crypto_data.head()

Unnamed: 0,#,Unnamed: 1,Name,Ticker,Last price,%,24 high,24 low,Price Charts 7d,24 volume,# Coins,Market cap
0,1,,Bitcoin,BTC,"$ 42,322",+4.07%,"$ 42,742","$ 40,589",,$ 21.51B,18.82M,$ 796.63B
1,2,,Ethereum,ETH,"$ 2,943.72",+6.66%,"$ 2,972.54","$ 2,739.42",,$ 16.12B,117.62M,$ 346.26B
2,3,,Solana,SOL,$ 130.20,+5.02%,$ 133.94,$ 122.34,,$ 3.89B,261.90M,$ 34.09B
3,4,,Cardano,ADA,$ 2.12,+6.78%,$ 2.16,$ 1.97,,$ 2.95B,31.94B,$ 67.70B
4,5,,Ripple,XRP,$ 0.929559,+6.31%,$ 0.948456,$ 0.868113,,$ 2.51B,46.71B,$ 43.42B


In [None]:
# if no table in html...
# use Beautiful Soup

# look at terms and condtions first before webscraping

# consider cacheing and build pauses eg time.sleep() to keep from overwhelming servers with too many requests


## web scraping

 tutorial found here: https://www.dataquest.io/blog/web-scraping-python-using-beautiful-soup/

In [6]:
import requests
page = requests.get("https://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [7]:
page.status_code

200

In [8]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [None]:
# We can use the BeautifulSoup library to parse this document, and extract the text from the p tag.

In [9]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [10]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [11]:
list(soup.children)  #.children is generator so need to call list on it

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [12]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [13]:
html = list(soup.children)[2]

In [14]:
list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [15]:
# for head
body = list(html.children)[3]

In [16]:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [17]:
# isolate p
p = list(body.children)[1]

In [18]:
p.get_text()

'Here is some simple content for this page.'

In [19]:
# Find all instances of tag at once
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [20]:
# note - find all returns list so will need to loop through and parse
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [21]:
# if only want first
soup.find('p')

<p>Here is some simple content for this page.</p>

In [23]:
# Search by tags and class and id
page = requests.get("https://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [24]:
# search for p tag outer text
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [25]:
# find all tags with class"outer-text"
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [26]:
# or search by element
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

#### css selectors

p a — finds all a tags inside of a p tag.

body p a — finds all a tags inside of a p tag inside of a body tag.

html body — finds all body tags inside of an html tag.

p.outer-text — finds all p tags with a class of outer-text.

p#first — finds all p tags with an id of first.

body p.outer-text — finds any p tags with a class of outer-text inside of a body tag.

In [27]:
# finds all p tags insie div
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>]

In [None]:
 On chrome
    View -> Developer -> Developer Tools
    ensure elements slected

In [28]:
page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Mostly sunny, with a high near 71. Breezy, with a west southwest wind 13 to 22 mph, with gusts as high as 29 mph. " class="forecast-icon" src="DualImage.php?i=sct&amp;j=wind_few" title="Today: Mostly sunny, with a high near 71. Breezy, with a west southwest wind 13 to 22 mph, with gusts as high as 29 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
  <br/>
  then Sunny
  <br/>
  and Breezy
 </p>
 <p class="temp temp-high">
  High: 71 °F
 </p>
</div>


In [29]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()
print(period)
print(short_desc)
print(temp)

Today
Mostly Sunnythen Sunnyand Breezy
High: 71 °F


In [30]:
img = tonight.find("img")
desc = img['title']
print(desc)

Today: Mostly sunny, with a high near 71. Breezy, with a west southwest wind 13 to 22 mph, with gusts as high as 29 mph. 



Select all items with the class period-name inside an item with the class tombstone-container in seven_day.
Use a list comprehension to call the get_text method on each BeautifulSoup objec


In [31]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Today',
 'Tonight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday']

In [32]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]print(short_descs)print(temps)print(descs)

SyntaxError: invalid syntax (<ipython-input-32-a6ade7bae453>, line 3)

In [33]:
import pandas as pd
weather = pd.DataFrame({
    "period": periods,
#     "short_desc": short_descs,
#     "temp": temps,
#     "desc":descs
})
weather

Unnamed: 0,period
0,Today
1,Tonight
2,Thursday
3,ThursdayNight
4,Friday
5,FridayNight
6,Saturday
7,SaturdayNight
8,Sunday


In [None]:
temp_nums = weather["temp"].str.extract("(?Pd+)", expand=False)
weather["temp_num"] = temp_nums.astype('int')
temp_nums

In [None]:
weather["temp_num"].mean()

In [None]:
is_night = weather["temp"].str.contains("Low")
weather["is_night"] = is_night
is_night

In [None]:
weather[is_night]