<img src="https://i.ibb.co/k1jrqqc/NA.png" style="float: left; margin: 20px; height: 55px; border-radius:50%;">

# WEB SCRAPING: `Weather DATA`

_Author: Naresh V_

----

# Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pickle

# Web Scraping

In [2]:
# Column store for the scraped values: one independent, initially empty
# list per column, kept in the order the columns should appear.

data = {
    column: []
    for column in (
        'Day',
        'Average temperature',
        'Average humidity',
        'Average dewpoint',
        'Average barometer',
        'Average windspeed',
        'Average gustspeed',
        'Average direction',
        'Rainfall for month',
        'Rainfall for year',
        'Maximum rain per minute',
        'Maximum temperature',
        'Minimum temperature',
        'Maximum humidity',
        'Minimum humidity',
        'Maximum pressure',
        'Minimum pressure',
        'Maximum windspeed',
        'Maximum gust speed',
        'Maximum heat index',
    )
}

In [3]:
# To scrape the given URL

def scrape(url, year, m):
    """Scrape one archive page and append its readings to the global ``data``.

    Every ``<table>`` on the page is visited in document order and numbered
    from 1; that position is used as the day of the month. Tables whose
    heading (the first cell of the first row) contains "Month" or "Rain" are
    skipped, but still count towards the numbering.

    For each remaining table, the first cell of a row is the label and the
    first whitespace-separated token of the second cell is the value. Only
    labels that are keys of ``data`` are kept. When at least one label
    matched, exactly one entry is appended to *every* column of ``data``, so
    all columns always stay the same length.

    Parameters
    ----------
    url : str
        Address of the page to download.
    year : str
        Year used as the first component of the stored ``Day`` string.
    m : str
        Month used as the second component of the stored ``Day`` string
        (the caller passes it zero-padded to two digits).

    Returns
    -------
    None
        Results are appended to the module-level ``data`` dictionary.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or exceeds the timeout.
    """
    # Placeholder stored when a value is absent or empty
    missing = '0'

    # The timeout (seconds) stops one stalled request from hanging the whole run
    response = requests.get(url, timeout=30)

    # To get HTML from the response
    soup = BeautifulSoup(response.content, 'html5lib')

    # The table's position on the page (starting at 1) is the day number
    for day, table in enumerate(soup.find_all("table"), start=1):

        rows = table.find_all("tr")

        # A table without rows, or without a first cell, has no heading to check
        if not rows or rows[0].td is None:
            continue

        # To avoid additional TABLEs
        header = rows[0].td.text
        if "Month" in header or "Rain" in header:
            continue

        # Collect this table's values first so the columns can be filled together
        record = {}
        for row in rows[1:]:
            cells = row.find_all('td')
            if not cells:
                continue

            label = cells[0].text

            # Keep known measurement labels only; "Day" is filled in below, and
            # a repeated label keeps its first value so that no column gets
            # two entries for the same day
            if label == "Day" or label not in data or label in record:
                continue

            tokens = cells[1].text.split() if len(cells) > 1 else []
            record[label] = tokens[0] if tokens else missing

        # Tables with no recognised label contribute nothing
        if not record:
            continue

        data["Day"].append(year + "-" + m + "-" + str(day).zfill(2))

        # One entry per column, so a label missing from this table cannot
        # shift later days out of alignment
        for column, values in data.items():
            if column != "Day":
                values.append(record.get(column, missing))

In [4]:
# Start from empty columns: scrape() only appends, so without this reset
# re-running the cell would store every day a second time
for values in data.values():
    values.clear()

# To iterate through given range of years (2009 to 2020, the end is exclusive)
for year in range(2009, 2021):
    
    # To iterate through all the months
    for month in range(1, 13):
        
        # Adding leading zeros
        m = str(month).zfill(2)
        
        # Building URL
        url = "http://www.estesparkweather.net/archive_reports.php?date=" + str(year) + m
        
        # To scrape DATA
        scrape(url, str(year), m)
        
    print(f"Data scraped for the year - {year}")

Data scraped for the year - 2009
Data scraped for the year - 2010
Data scraped for the year - 2011
Data scraped for the year - 2012
Data scraped for the year - 2013
Data scraped for the year - 2014
Data scraped for the year - 2015
Data scraped for the year - 2016
Data scraped for the year - 2017
Data scraped for the year - 2018
Data scraped for the year - 2019
Data scraped for the year - 2020


In [5]:
# Turn the scraped columns into a DataFrame and preview the first rows
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Day,Average temperature,Average humidity,Average dewpoint,Average barometer,Average windspeed,Average gustspeed,Average direction,Rainfall for month,Rainfall for year,Maximum rain per minute,Maximum temperature,Minimum temperature,Maximum humidity,Minimum humidity,Maximum pressure,Minimum pressure,Maximum windspeed,Maximum gust speed,Maximum heat index
0,2009-01-01,37.8,35,12.7,29.7,26.4,36.8,274,0.0,0.0,0.0,40.1,34.5,44,27,29.762,29.596,41.4,59,40.1
1,2009-01-02,43.2,32,14.7,29.5,12.8,18.0,240,0.0,0.0,0.0,52.8,37.5,43,16,29.669,29.268,35.7,51,52.8
2,2009-01-03,25.7,60,12.7,29.7,8.3,12.2,290,0.0,0.0,0.0,41.2,6.7,89,35,30.232,29.26,25.3,38,41.2
3,2009-01-04,9.3,67,0.1,30.4,2.9,4.5,47,0.0,0.0,0.0,19.4,-0.0,79,35,30.566,30.227,12.7,20,32.0
4,2009-01-05,23.5,30,-5.3,29.9,16.7,23.1,265,0.0,0.0,0.0,30.3,15.1,56,13,30.233,29.568,38.0,53,32.0


# Data Cleaning

In [6]:
# Parse the "Day" strings into timestamps, then use them as the row index
df = (
    df
    .assign(Day=pd.to_datetime(df["Day"], format="%Y-%m-%d"))
    .set_index("Day")
)

In [7]:
# To remove unit symbols from DATA and convert every column to numbers
for col in df.columns:

    # Numeric columns have no .str accessor; skipping them lets this cell
    # be re-run after a first successful conversion
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    cleaned = df[col]

    # "°F" is removed before the bare "°" so no stray "F" is left behind
    for symbol in ("%", "°F", "°"):
        # regex=False forces literal matching whatever the pandas default is
        cleaned = cleaned.str.replace(symbol, '', regex=False)

    df[col] = pd.to_numeric(cleaned)

In [8]:
# Mapping from scraped column name to display name: the unit is appended
# in parentheses, and columns listed with None keep their name unchanged
new_name = {
    column: column if unit is None else f"{column} ({unit})"
    for column, unit in (
        ("Average temperature", "°F"),
        ("Average humidity", "%"),
        ("Average dewpoint", "°F"),
        ("Average barometer", "in"),
        ("Average windspeed", "mph"),
        ("Average gustspeed", "mph"),
        ("Average direction", "°deg"),
        ("Rainfall for month", "in"),
        ("Rainfall for year", "in"),
        ("Maximum rain per minute", None),
        ("Maximum temperature", "°F"),
        ("Minimum temperature", "°F"),
        ("Maximum humidity", "%"),
        ("Minimum humidity", "%"),
        ("Maximum pressure", None),
        ("Minimum pressure", None),
        ("Maximum windspeed", "mph"),
        ("Maximum gust speed", "mph"),
        ("Maximum heat index", "°F"),
    )
}

In [9]:
# Apply the display names to the columns
df = df.rename(columns=new_name)

In [10]:
# Keep only the rows whose date lies inside the window (both ends inclusive)
on_or_after_start = df.index >= "2009-01-01"
on_or_before_end = df.index <= "2021-10-30"
df = df.loc[on_or_after_start & on_or_before_end]

# Exporting Data

In [11]:
# Serialise the cleaned DataFrame to disk with pickle
with open("weather_data.pk", mode="wb") as pickle_out:
    pickle.dump(df, pickle_out)