In [1]:
import pandas as pd
import requests

In [2]:
# Download the web page's source code to Python.

In [3]:
url = 'https://www.worldcoinindex.com/'
crypto_url = requests.get(url)
crypto_url

<Response [200]>

In [4]:
body = crypto_url.text

In [5]:
crypto_data = pd.read_html(body)
print(type(crypto_data))
print(len(crypto_data))

<class 'list'>
1


In [6]:
crypto_data = crypto_data[0]
crypto_data.head()

Unnamed: 0,#,Unnamed: 1,Name,Ticker,Last price,%,24 high,24 low,Price Charts 7d,24 volume,# Coins,Market cap
0,1,,Bitcoin,BTC,"$ 40,014",+1.41%,"$ 40,889","$ 38,854",,$ 16.49B,18.76M,$ 751.05B
1,2,,Ethereum,ETH,"$ 2,303.67",+0.14%,"$ 2,347.23","$ 2,249.92",,$ 10.90B,116.88M,$ 269.25B
2,3,,Ripple,XRP,$ 0.709218,+9.94%,$ 0.751880,$ 0.631915,,$ 3.90B,46.31B,$ 32.84B
3,4,,Axie Infinity,AXS,$ 46.00,-1.12%,$ 50.40,$ 44.34,,$ 3.78B,43.25M,$ 1.98B
4,5,,Dogecoin,DOGE,$ 0.205536,-0.21%,$ 0.213395,$ 0.201849,,$ 1.74B,129.40B,$ 26.59B


If we want to extract information from HTML, which doesn't have a table, we need to use a different approach: **Scraping**.

Fortunately, Python has a great package for this called `Beautiful Soup`.

## Web Scraping

Reference: https://www.dataquest.io/blog/web-scraping-python-using-beautiful-soup/

In [7]:
import requests
page = requests.get("https://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [8]:
page.status_code

200

In [9]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

#### Parsing a page with BeautifulSoup


In [10]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [11]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [12]:
list(soup.children)

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [13]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [14]:
html = list(soup.children)[2]
print(html)

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


In [15]:
list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [20]:
body = list(html.children)[3]
body

<body>
<p>Here is some simple content for this page.</p>
</body>

In [21]:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [22]:
p = list(body.children)[1]
p.get_text()

'Here is some simple content for this page.'

#### Finding all instances of a tag at once

In [23]:
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [24]:
# Find a list of all matching tags
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [25]:
# Find only the first instance of a tag
soup.find('p')

<p>Here is some simple content for this page.</p>

#### Searching for tags by class and id


In [27]:
page = requests.get("https://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


Use the `find_all` method to search for items by class or by id:

In [28]:
# search for any p tag that has the class outer-text:
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [29]:
# look for any tag that has the class outer-text:
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [30]:
# Search for elements by id:
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

#### Using CSS Selectors

Use CSS selectors to find all the `p` tags in our page that are inside of a `div` like this:

In [31]:
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>]

## Downloading weather data

In [33]:
page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Mostly cloudy, with a low around 57. Breezy, with a west southwest wind 19 to 24 mph decreasing to 11 to 16 mph after midnight. Winds could gust as high as 31 mph. " class="forecast-icon" src="DualImage.php?i=nwind_bkn&amp;j=nbkn" title="Tonight: Mostly cloudy, with a low around 57. Breezy, with a west southwest wind 19 to 24 mph decreasing to 11 to 16 mph after midnight. Winds could gust as high as 31 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Cloudy
  <br/>
  and Breezy
  <br/>
  then Mostly
  <br/>
  Cloudy
 </p>
 <p class="temp temp-low">
  Low: 57 °F
 </p>
</div>


In [42]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()
print(period)
print(short_desc)
print(temp)

Tonight
Mostly Cloudyand Breezythen MostlyCloudy
Low: 57 °F


In [43]:
img = tonight.find("img")
desc = img['title']
print(desc)

Tonight: Mostly cloudy, with a low around 57. Breezy, with a west southwest wind 19 to 24 mph decreasing to 11 to 16 mph after midnight. Winds could gust as high as 31 mph. 


In [44]:
# Extracting all the information from the page

period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Tonight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight']

In [48]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]
print(short_descs)
print(temps)
print(descs)

['Mostly Cloudyand Breezythen MostlyCloudy', 'Partly Sunnythen MostlySunny andBreezy', 'Partly Cloudyand Breezythen MostlyCloudy', 'Mostly Sunnythen Sunnyand Breezy', 'Partly Cloudyand Breezythen MostlyCloudy', 'Partly Sunny', 'Mostly Cloudy', 'Mostly Sunny', 'Partly Cloudy']
['Low: 57 °F', 'High: 70 °F', 'Low: 56 °F', 'High: 69 °F', 'Low: 56 °F', 'High: 67 °F', 'Low: 56 °F', 'High: 69 °F', 'Low: 56 °F']
['Tonight: Mostly cloudy, with a low around 57. Breezy, with a west southwest wind 19 to 24 mph decreasing to 11 to 16 mph after midnight. Winds could gust as high as 31 mph. ', 'Thursday: Mostly sunny, with a high near 70. Breezy, with a west southwest wind 11 to 16 mph increasing to 20 to 25 mph in the afternoon. Winds could gust as high as 33 mph. ', 'Thursday Night: Mostly cloudy, with a low around 56. Breezy, with a west southwest wind 14 to 22 mph, with gusts as high as 29 mph. ', 'Friday: Mostly sunny, with a high near 69. Breezy, with a west southwest wind 14 to 23 mph, with gu

## Combining our data into a Pandas Dataframe

In [49]:
import pandas as pd
weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
    "desc":descs
})
weather

Unnamed: 0,period,short_desc,temp,desc
0,Tonight,Mostly Cloudyand Breezythen MostlyCloudy,Low: 57 °F,"Tonight: Mostly cloudy, with a low around 57. ..."
1,Thursday,Partly Sunnythen MostlySunny andBreezy,High: 70 °F,"Thursday: Mostly sunny, with a high near 70. B..."
2,ThursdayNight,Partly Cloudyand Breezythen MostlyCloudy,Low: 56 °F,"Thursday Night: Mostly cloudy, with a low arou..."
3,Friday,Mostly Sunnythen Sunnyand Breezy,High: 69 °F,"Friday: Mostly sunny, with a high near 69. Bre..."
4,FridayNight,Partly Cloudyand Breezythen MostlyCloudy,Low: 56 °F,"Friday Night: Mostly cloudy, with a low around..."
5,Saturday,Partly Sunny,High: 67 °F,"Saturday: Partly sunny, with a high near 67."
6,SaturdayNight,Mostly Cloudy,Low: 56 °F,"Saturday Night: Mostly cloudy, with a low arou..."
7,Sunday,Mostly Sunny,High: 69 °F,"Sunday: Mostly sunny, with a high near 69."
8,SundayNight,Partly Cloudy,Low: 56 °F,"Sunday Night: Partly cloudy, with a low around..."


In [68]:
import re

In [85]:
temp_nums = weather['temp'].apply(lambda x:re.findall(r'\d+', x)[0])

#temp_nums = weather["temp"].str.extract(r'\d+', expand=False)
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    57
1    70
2    56
3    69
4    56
5    67
6    56
7    69
8    56
Name: temp, dtype: object

In [81]:
weather["temp_num"].mean()

61.77777777777778

In [82]:
is_night = weather["temp"].str.contains("Low")
weather["is_night"] = is_night
is_night

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8     True
Name: temp, dtype: bool

In [83]:
weather[is_night]

Unnamed: 0,period,short_desc,temp,desc,temp_num,is_night
0,Tonight,Mostly Cloudyand Breezythen MostlyCloudy,Low: 57 °F,"Tonight: Mostly cloudy, with a low around 57. ...",57,True
2,ThursdayNight,Partly Cloudyand Breezythen MostlyCloudy,Low: 56 °F,"Thursday Night: Mostly cloudy, with a low arou...",56,True
4,FridayNight,Partly Cloudyand Breezythen MostlyCloudy,Low: 56 °F,"Friday Night: Mostly cloudy, with a low around...",56,True
6,SaturdayNight,Mostly Cloudy,Low: 56 °F,"Saturday Night: Mostly cloudy, with a low arou...",56,True
8,SundayNight,Partly Cloudy,Low: 56 °F,"Sunday Night: Partly cloudy, with a low around...",56,True
