In [43]:
import requests
from bs4  import BeautifulSoup
import pandas as pd

## 1. Web page

In [44]:
# Extract web page
url = 'https://weather.com/weather/tenday/l/San+Francisco+CA?canonicalCityId=dfdaba8cbe3a4d12a8796e1f7b1ccc7174b4b0a2d5ddb1c8566ae9f154fa638c'

# A timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx reply rather than parsing an error page as if it were the forecast
response.raise_for_status()
print(response)

# Scraper: parse the downloaded HTML so the cells below can search it by tag and class
soup = BeautifulSoup(response.content, 'html.parser')

<Response [200]>


**Understanding what the elements I need have in common**

* Week day
class =  `"DailyContent--daypartDate--2A3Wi"`

* Weather description
class = `"DailyContent--narrative--hplRl"`

* Temperature (low and high)
class = `"DailyContent--temp--3d4dn"`




## 2. Extracting information

In [45]:
# CSS class names of the three page elements to collect
weekday_class = 'DailyContent--daypartDate--2A3Wi'
weather_description_class = 'DailyContent--narrative--hplRl'
temperature_class = "DailyContent--temp--3d4dn"

# For each class name, gather the text of every tag on the page that carries it.
# Keys are the class names themselves, in the order listed here.
scraped_classes = (weekday_class, weather_description_class, temperature_class)
raw_data = {
    css_class: [tag.text for tag in soup.find_all(class_=css_class)]
    for css_class in scraped_classes
}
            

## 3. Cleaning data

In [46]:
# Build the table directly under meaningful column names: each name is paired
# with the scraped list found at the same position in `raw_data`
column_names = ['weekday', 'description', 'temp_degree']
data = pd.DataFrame(dict(zip(column_names, raw_data.values())))


In [47]:
# cleaning temperature
# Strip the degree sign and convert the whole column in a single vectorized call.
data['temp_degree'] = pd.to_numeric(
    data['temp_degree']                     # grab `temp_degree` column
    .astype(str)                            # keeps the cell re-runnable once the column is already numeric
    .str.replace("°", '', regex=False)      # remove `°` symbol (literal match, not a regex)
)


## 4. Adding date and temperature state columns

In [48]:
# Date column

yesterday = pd.to_datetime("today") - pd.Timedelta(1, unit = 'D')

# 15 consecutive days starting yesterday, each listed twice (day row + night row)
date = pd.Series((pd.date_range(start= yesterday, periods=15 )
                .repeat(2)  # repeat each datetime by 2 to account for day and night pattern
                .date))

# Comply with website change of weekday
# In the morning, first weekday is yesterday, around noon this changes to current weekday

# `find` returns None when the page has no <h2> reading "Tonight", so check
# before touching `.text`; otherwise the fallback case below can never run.
tonight_header = soup.find("h2", text = "Tonight")
first_value_week = tonight_header.text.lower() if tonight_header is not None else None

# With a "Tonight" header the rows start one entry into `date`;
# without it they start two entries in.
dates_to_skip = 1 if first_value_week == 'tonight' else 2

# Renumber from 0 in both cases: column assignment aligns on the index, so an
# un-reset slice would leave the first rows empty and shift every other date.
data['date'] = date[dates_to_skip:].reset_index(drop=True)

In [49]:
# temperature state (i.e. low or high)
# this information is embedded in the description column


temp_state = (data['description']                               # grab `description` column
                    # Match "high"/"highs" as a whole word, ignoring case. A plain
                    # substring test would also fire on words such as "higher"
                    # and label that row 'high' even when it is a low.
                    .str.contains(r'\bhighs?\b', case=False)    # True or False per description (row)
                    .map({True:'high', False:'low'})            # change `True` -> 'high' and `False` -> `low`
            )

# Adding `temp_state` to data
data['temp_state'] = temp_state

In [50]:
# Put the columns in their final order, then preview the first rows as a check

column_order = ['date', 'weekday', 'temp_degree', 'temp_state', 'description']
data = data.reindex(column_order, axis='columns')

data.head(10)

Unnamed: 0,date,weekday,temp_degree,temp_state,description
0,2022-03-12,Sat 12,49,low,Cloudy with a few showers. Low 49F. Winds SW a...
1,2022-03-13,Sun 13,57,high,Cloudy early with partial sunshine expected la...
2,2022-03-13,Sun 13,44,low,Partly cloudy early with increasing clouds ove...
3,2022-03-14,Mon 14,61,high,Partly cloudy skies. High 61F. Winds SW at 10 ...
4,2022-03-14,Mon 14,53,low,Showers early becoming a steady light rain lat...
5,2022-03-15,Tue 15,59,high,"Showers in the morning, then cloudy in the aft..."
6,2022-03-15,Tue 15,46,low,Partly cloudy. Low 46F. Winds WNW at 10 to 15 ...
7,2022-03-16,Wed 16,62,high,Some clouds in the morning will give way to ma...
8,2022-03-16,Wed 16,48,low,Partly cloudy skies early will become overcast...
9,2022-03-17,Thu 17,61,high,Morning clouds will give way to sunshine for t...
