In [3]:
#Example 2
#Web Scraping
#Import the requests library to download the HTML page and Beautiful Soup to parse it
import requests
from bs4 import BeautifulSoup

In [4]:
# Download the sample page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors (4xx/5xx)
# here instead of letting an error page be parsed as if it were the real one.
page = requests.get(
    "https://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=30,
)
page.raise_for_status()
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [7]:
# Only <p> tags whose class list includes "outer-text".
soup.find_all(name="p", class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [8]:
# Same class filter, but without restricting the tag name.
soup.find_all(attrs={"class": "outer-text"})

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

# Web Scraping for Weather Information

In [9]:
# Download the forecast page we want to scrape. The timeout prevents the cell
# from blocking indefinitely, and raise_for_status() fails loudly on an HTTP
# error rather than parsing an error page as forecast data.
page = requests.get(
    "https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168",
    timeout=30,
)
page.raise_for_status()
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<html class="no-js">
<head>
<!-- Meta -->
<meta content="width=device-width" name="viewport"/>
<link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/><title>National Weather Service</title><meta content="National Weather Service" name="DC.title"><meta content="NOAA National Weather Service National Weather Service" name="DC.description"/><meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/><meta content="" name="DC.date.created" scheme="ISO8601"/><meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/><meta content="weather, National Weather Service" name="DC.keywords"/><meta content="NOAA's National Weather Service" name="DC.publisher"/><meta content="National Weather Service" name="DC.contributor"/><meta content="//www.weather.gov/disclaimer.php" name="DC.rights"/><meta content="General" name="rating"/><meta content="index,follow" name="robots"/>
<!-- Icons -->
<link href="./images/favicon.ico" rel="shortcut 

In [10]:
# Locate the seven-day forecast container by its id, then collect every
# per-period tile ("tombstone-container") nested inside that container.
seven_day_forecast = soup.find(id='seven-day-forecast')
forecast_items = seven_day_forecast.find_all(class_='tombstone-container')

In [12]:
# Take the first forecast tile. Despite the variable name, this is simply
# whichever period the page lists first (the output below shows "Today").
tonight = forecast_items[0]

In [13]:
# Show the tile's HTML with indentation so its structure is easy to read.
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Sunny, with a high near 74. West southwest wind 10 to 15 mph increasing to 16 to 21 mph in the afternoon. Winds could gust as high as 25 mph. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 74. West southwest wind 10 to 15 mph increasing to 16 to 21 mph in the afternoon. Winds could gust as high as 25 mph. "/>
 </p>
 <p class="short-desc">
  Sunny
 </p>
 <p class="temp temp-high">
  High: 74 °F
 </p>
</div>


In [14]:
# Read the text of the three labelled elements inside the first tile, looking
# each one up by its CSS class, then print them one per line.
period, short_desc, temp = (
    tonight.find(class_=css_class).get_text()
    for css_class in ("period-name", "short-desc", "temp")
)
print(period, short_desc, temp, sep="\n")

Today
Sunny
High: 74 °F


In [15]:
# The full-sentence forecast is stored in the icon's "title" attribute.
img = tonight.find(name="img")
desc = img.attrs["title"]
print(desc)

Today: Sunny, with a high near 74. West southwest wind 10 to 15 mph increasing to 16 to 21 mph in the afternoon. Winds could gust as high as 25 mph. 


In [16]:
# Use a CSS selector to grab the period label of every tile at once.
# The loop variable is deliberately not called `pd`, to avoid confusion with
# the conventional pandas alias.
period_tags = seven_day_forecast.select(".tombstone-container .period-name")
periods = [tag.get_text() for tag in period_tags]
periods

['Today',
 'Tonight',
 'Friday',
 'FridayNight',
 'Saturday',
 'SaturdayNight',
 'Sunday',
 'SundayNight',
 'LaborDay']

In [17]:
# Apply the same selector approach to the remaining fields of each tile.
short_descs = [
    tag.get_text()
    for tag in seven_day_forecast.select(".tombstone-container .short-desc")
]
temps = [
    tag.get_text()
    for tag in seven_day_forecast.select(".tombstone-container .temp")
]
# The long description is read from each icon's title attribute.
descs = [
    icon['title']
    for icon in seven_day_forecast.select(".tombstone-container img")
]
print(short_descs, temps, descs, sep="\n")

['Sunny', 'Mostly Clearand Breezythen PartlyCloudy', 'Mostly Sunnythen Sunnyand Breezy', 'Mostly Clear', 'Sunny', 'Mostly Clear', 'Sunny', 'Mostly Clear', 'Sunny']
['High: 74 °F', 'Low: 57 °F', 'High: 74 °F', 'Low: 59 °F', 'High: 78 °F', 'Low: 60 °F', 'High: 82 °F', 'Low: 62 °F', 'High: 81 °F']
['Today: Sunny, with a high near 74. West southwest wind 10 to 15 mph increasing to 16 to 21 mph in the afternoon. Winds could gust as high as 25 mph. ', 'Tonight: Mostly clear, with a low around 57. Breezy, with a west southwest wind 13 to 22 mph, with gusts as high as 26 mph. ', 'Friday: Partly sunny, then gradually becoming sunny, with a high near 74. Breezy, with a west southwest wind 14 to 23 mph, with gusts as high as 29 mph. ', 'Friday Night: Mostly clear, with a low around 59. West wind 15 to 20 mph decreasing to 9 to 14 mph after midnight. Winds could gust as high as 24 mph. ', 'Saturday: Sunny, with a high near 78. West wind 5 to 10 mph increasing to 12 to 17 mph in the afternoon. Wind

In [18]:
# Import pandas and gather the scraped lists into a single DataFrame.
import pandas as pd

# One column per scraped field; pandas requires the lists to be equal length.
weather = pd.DataFrame(
    dict(period=periods, short_desc=short_descs, temp=temps, desc=descs)
)

In [19]:
# Preview the first five rows to check that the columns line up.
weather.head()

Unnamed: 0,period,short_desc,temp,desc
0,Today,Sunny,High: 74 °F,"Today: Sunny, with a high near 74. West southw..."
1,Tonight,Mostly Clearand Breezythen PartlyCloudy,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. B..."
2,Friday,Mostly Sunnythen Sunnyand Breezy,High: 74 °F,"Friday: Partly sunny, then gradually becoming ..."
3,FridayNight,Mostly Clear,Low: 59 °F,"Friday Night: Mostly clear, with a low around ..."
4,Saturday,Sunny,High: 78 °F,"Saturday: Sunny, with a high near 78. West win..."


In [24]:
# Pull the digits out of strings such as "High: 74 °F". The pattern is a raw
# string so that \d reaches the regex engine untouched instead of being read
# as an (invalid) Python string escape, which triggers a warning.
temp_nums = weather["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
# extract() yields strings (dtype object); store an integer copy on the frame
# so the column can be used for arithmetic.
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    74
1    57
2    74
3    59
4    78
5    60
6    82
7    62
8    81
Name: temp_num, dtype: object

In [25]:
# Average of all extracted temperatures (highs and lows mixed together).
weather['temp_num'].mean()

69.66666666666667

In [26]:
# Flag the overnight periods: their temperature label contains "Low".
is_night = weather.loc[:, "temp"].str.contains("Low")
weather["is_night"] = is_night
is_night

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
Name: temp, dtype: bool

In [27]:
# Display the full frame, now including the temp_num and is_night columns.
weather

Unnamed: 0,period,short_desc,temp,desc,temp_num,is_night
0,Today,Sunny,High: 74 °F,"Today: Sunny, with a high near 74. West southw...",74,False
1,Tonight,Mostly Clearand Breezythen PartlyCloudy,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. B...",57,True
2,Friday,Mostly Sunnythen Sunnyand Breezy,High: 74 °F,"Friday: Partly sunny, then gradually becoming ...",74,False
3,FridayNight,Mostly Clear,Low: 59 °F,"Friday Night: Mostly clear, with a low around ...",59,True
4,Saturday,Sunny,High: 78 °F,"Saturday: Sunny, with a high near 78. West win...",78,False
5,SaturdayNight,Mostly Clear,Low: 60 °F,"Saturday Night: Mostly clear, with a low aroun...",60,True
6,Sunday,Sunny,High: 82 °F,"Sunday: Sunny, with a high near 82.",82,False
7,SundayNight,Mostly Clear,Low: 62 °F,"Sunday Night: Mostly clear, with a low around 62.",62,True
8,LaborDay,Sunny,High: 81 °F,"Labor Day: Sunny, with a high near 81.",81,False
