In [251]:
import requests
from bs4  import BeautifulSoup
import pandas as pd

## 1. Web page

In [252]:
# Extract web page: weather.com ten-day forecast URL for San Francisco, CA
url = 'https://weather.com/weather/tenday/l/San+Francisco+CA?canonicalCityId=dfdaba8cbe3a4d12a8796e1f7b1ccc7174b4b0a2d5ddb1c8566ae9f154fa638c'

# requests has no default timeout; without one a stalled connection hangs the cell forever.
response = requests.get(url, timeout=30)
print(response)

# Stop here on a 4xx/5xx answer instead of parsing an error page as if it were the forecast.
response.raise_for_status()

# Scraper: parse the raw response body with the built-in HTML parser
soup = BeautifulSoup(response.content, 'html.parser')

<Response [200]>


**Understanding what the elements I need have in common**

* Week day
class =  `"DailyContent--daypartDate--2A3Wi"`

* Weather description
class = `"DailyContent--narrative--hplRl"`

* Temperature (low and high)
class = `"DailyContent--temp--3d4dn"`




## 2. Extracting information

In [253]:
# CSS class names identifying the elements of interest
weekday_class = 'DailyContent--daypartDate--2A3Wi'
weather_description_class = 'DailyContent--narrative--hplRl'
temperature_class = "DailyContent--temp--3d4dn"

# For each class, collect the text of every tag in the page that carries it.
# Keys keep the order weekday -> description -> temperature.
raw_data = {
    css_class: [tag.text for tag in soup.find_all(class_=css_class)]
    for css_class in (weekday_class, weather_description_class, temperature_class)
}
            

## 3. Cleaning data

In [254]:
# Readable labels, in the same order as the keys of `raw_data`
column_names = ['weekday', 'description', 'temp_degree']

# One column per scraped class; the class-name headers are then replaced by the labels above
data = pd.DataFrame(raw_data)
data.columns = column_names


In [255]:
# cleaning temperature
# A single vectorized `pd.to_numeric` call replaces the per-element `.apply`.
data['temp_degree'] = pd.to_numeric(
    data['temp_degree']
    .astype(str)                          # lets the cell be re-run once the column is already numeric
    .str.replace("°", '', regex=False)    # remove `°` symbol (literal match, not a regex)
    .str.strip(),                         # drop stray surrounding whitespace
    errors='coerce',                      # text that is not a number becomes NaN instead of raising
)


## 4. Adding date and temperature state columns

In [256]:
# Creating date

# Each date contributes two rows (day and night), so the number of days is derived
# from the scraped table rather than hard-coded; the cell then keeps working when
# the page lists a different number of days.
n_days, leftover = divmod(len(data), 2)
if leftover:
    # An unpaired row means the dates cannot be lined up by simple repetition.
    raise ValueError(
        f"expected an even number of forecast rows (day + night per date), got {len(data)}"
    )

date = (pd.date_range(start=pd.to_datetime("today"), periods=n_days)   # generate one datetime object per forecast day, starting from today
        .repeat(2)                                                      # repeat each datetime by 2 to account for day and night pattern
        .date                                                           # extract only the date part from the datetime
        )


# Adding date to data
data['date'] = date

In [257]:
# temperature state (i.e. low or high)
# this information is embedded in the description column


# Match `high` / `highs` as a whole word, ignoring case. A plain substring test would
# also fire on words such as "higher" and label a "Low ..." row as `high`.
temp_state = (data['description']                                       # grab `description` column
                    .str.contains(r'\bhighs?\b', case=False, regex=True)  # True when the word `high` appears in the description
                    .map({True:'high', False:'low'})                    # change `True` -> 'high' and `False` -> `low`
            )

# Adding `temp_state` to data
data['temp_state'] = temp_state

In [259]:
# Put the columns in presentation order, then preview the first ten rows as a sanity check

column_order = ['date', 'weekday', 'temp_degree', 'temp_state','description']
data = data.reindex(column_order, axis="columns")

data.head(10)

Unnamed: 0,date,weekday,temp_degree,temp_state,description
20,2022-03-22,Tue 22,67,high,Mostly sunny skies. High 67F. Winds WNW at 10 ...
21,2022-03-22,Tue 22,47,low,Mainly clear skies. Low 47F. Winds WNW at 10 t...
22,2022-03-23,Wed 23,68,high,Intervals of clouds and sunshine. High 68F. Wi...
23,2022-03-23,Wed 23,46,low,Mainly clear skies. Low 46F. Winds WNW at 10 t...
24,2022-03-24,Thu 24,67,high,Partly cloudy skies. High 67F. Winds WNW at 10...
25,2022-03-24,Thu 24,46,low,Partly cloudy. Low 46F. Winds WNW at 10 to 15 ...
26,2022-03-25,Fri 25,65,high,Mostly sunny skies. High near 65F. Winds WNW a...
27,2022-03-25,Fri 25,45,low,Clear skies with a few passing clouds. Low nea...
28,2022-03-26,Sat 26,65,high,Partly cloudy. High around 65F. Winds WNW at 1...
29,2022-03-26,Sat 26,45,low,Mostly clear skies. Low near 45F. Winds WNW at...
