In [251]:
import requests
from bs4  import BeautifulSoup
import pandas as pd

## 1. Web page

In [252]:
# Extract web page
url = 'https://weather.com/weather/tenday/l/San+Francisco+CA?canonicalCityId=dfdaba8cbe3a4d12a8796e1f7b1ccc7174b4b0a2d5ddb1c8566ae9f154fa638c'

# `requests` applies no timeout by default, so without one this cell could
# hang indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx reply instead of silently
# parsing an error page in the cells below.
response.raise_for_status()
print(response)

# Scraper: parse the downloaded HTML so tags can be looked up by class name
soup = BeautifulSoup(response.content, 'html.parser')

<Response [200]>


**Understanding what the elements I need have in common**

* Week day
class =  `"DailyContent--daypartDate--2A3Wi"`

* Weather description
class = `"DailyContent--narrative--hplRl"`

* Temperature (low and high)
class = `"DailyContent--temp--3d4dn"`




## 2. Extracting information

In [253]:
# CSS class names that identify each piece of the forecast on the page
weekday_class = 'DailyContent--daypartDate--2A3Wi'
weather_description_class = 'DailyContent--narrative--hplRl'
temperature_class = "DailyContent--temp--3d4dn"

# For every class of interest, collect the text of all tags carrying it.
# Keys are the class names themselves, kept in the order listed here.
raw_data = {
    css_class: [tag.text for tag in soup.find_all(class_=css_class)]
    for css_class in (weekday_class, weather_description_class, temperature_class)
}
            

## 3. Cleaning data

In [254]:
# Build the table directly under meaningful column names.
# `raw_data` was filled in the order weekday, description, temperature,
# so its values pair up positionally with the names below.
column_names = ['weekday', 'description', 'temp_degree']
data = pd.DataFrame(dict(zip(column_names, raw_data.values())))


In [255]:
# cleaning temperature
# Strip the degree sign, then convert the whole column in one vectorised call
# (no per-row lambda needed).
data['temp_degree'] = pd.to_numeric(
    data['temp_degree'].str.replace("°", '', regex=False),  # remove the literal `°` symbol
    errors='coerce',                                        # non-numeric leftovers become NaN instead of raising
)


## 4. Adding date and temperature state columns

In [256]:
# Creating date

# Size the calendar from the rows actually scraped instead of assuming a
# fixed 15-day forecast; a hard-coded length makes the assignment below fail
# whenever the page returns a different number of entries.
n_rows = len(data)
n_days = (n_rows + 1) // 2      # each day contributes up to two rows (day and night)

date = (pd.date_range(start=pd.to_datetime("today"), periods=n_days)   # one datetime per forecast day, starting from today
        .repeat(2)                                                      # repeat each datetime twice to account for the day and night pattern
        .date                                                           # keep only the date part of each datetime
        )

# With an odd row count one day has a single entry.
# NOTE(review): this assumes the incomplete day is the first one (today's
# daytime entry no longer listed), so the extra leading date is dropped --
# confirm against the page.
date = date[len(date) - n_rows:]


# Adding date to data
data['date'] = date

In [257]:
# temperature state (i.e. low or high)
# the description text of each row says which of the two it reports

# Lower-case every description so the search below ignores capitalisation
description_lower = data['description'].str.lower()

# True where the description mentions `high`, False otherwise
mentions_high = description_lower.str.contains('high')

# Turn the boolean flag into a label: True -> 'high', False -> 'low'
temp_state = mentions_high.map({True: 'high', False: 'low'})

# Adding `temp_state` to data
data['temp_state'] = temp_state

In [258]:
# Put the columns in their final order, then preview the first rows as a check
column_order = ['date', 'weekday', 'temp_degree', 'temp_state', 'description']
data = data.reindex(column_order, axis='columns')

data.head(10)

Unnamed: 0,date,weekday,temp_degree,temp_state,description
0,2022-03-12,Sat 12,60,high,"Sunshine to start, then a few afternoon clouds..."
1,2022-03-12,Sat 12,49,low,Cloudy. Slight chance of a rain shower. Low 49...
2,2022-03-13,Sun 13,58,high,Cloudy early with partial sunshine expected la...
3,2022-03-13,Sun 13,44,low,Mostly cloudy skies. Low 44F. Winds W at 5 to ...
4,2022-03-14,Mon 14,60,high,Mostly cloudy. High around 60F. Winds SSW at 5...
5,2022-03-14,Mon 14,52,low,Rain showers in the evening becoming a steady ...
6,2022-03-15,Tue 15,59,high,"Showers in the morning, then cloudy in the aft..."
7,2022-03-15,Tue 15,45,low,Mainly clear skies. Low near 45F. Winds WNW at...
8,2022-03-16,Wed 16,62,high,Mostly sunny skies. High 62F. Winds WNW at 10 ...
9,2022-03-16,Wed 16,47,low,Partly cloudy skies during the evening will gi...
