# Dependencies

In [1]:
from io import StringIO
from typing import Iterator

import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver

# Other requirements 
You will also need to download a __Microsoft Edge WebDriver__ (e.g. from __https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/__). This scraper works with Microsoft Edge by default; however, it can easily be adapted to any other common browser by using a different webdriver, e.g. the Chrome webdriver.

In [None]:
# Start Web Driver
# Path to the locally downloaded Edge WebDriver executable; change it to match your machine.
edge_driver_path = "C:/Users/wyrwa/Speckled/msedgedriver.exe"
# NOTE(review): the driver path is passed positionally here; newer Selenium releases presumably
# expect a Service object instead -- confirm against the installed Selenium version.
driver = webdriver.Edge(edge_driver_path)

# Select target URL
This scraper is known to work with the URL given below (Indira Gandhi Airport); however, it should also work with most other locations. To fetch weather data for a different location, replace the __URL__ variable.

In [None]:
# Open the URL you want to execute JavaScript commands on
# (the functions below run their JavaScript against whatever page this driver has loaded).
URL = 'https://rp5.ru/Weather_archive_in_New_Delhi,_Indira_Gandhi_(airport),_METAR'
driver.get(URL)

# Main scraper body
The main, and only essential, part of the scraper is the generator below. For each date between the two given dates (both inclusive), it fetches a dataframe of weather data. Note that the arguments must be given in the __MM/DD/YYYY__ format.

In [5]:
def scrape_gen(start_date: str, end_date: str) -> Iterator[pd.DataFrame]:
    """
    Yield one dataframe of weather data per day in the given range.

    Uses the module-level ``driver``, which must already have the archive page loaded.

    start_date: MM/DD/YYYY (inclusive)
    end_date: MM/DD/YYYY (inclusive)

    Raises ValueError if the page source contains no element with id ``archiveTable``.
    """
    for date in pd.date_range(start_date, end_date):
        # Query data for the given date using a JS command (the date is sent as DD.MM.YYYY).
        site_date = date.strftime('%d.%m.%Y')
        driver.execute_script(
            f"jQuery.datepick._selectDate('#calender_archive', '{site_date}'); fMetarConfirm()"
        )
        # NOTE(review): nothing here waits for the page to refresh after the JS call; presumably
        # the table is already updated when page_source is read -- confirm.
        table = BeautifulSoup(driver.page_source, 'html.parser').select_one('#archiveTable')
        if table is None:
            # Fail with a clear message instead of handing the string "None" to the HTML parser.
            raise ValueError(f"No '#archiveTable' element found in the page for {site_date}")
        # Yield dataframe for given day. The markup is wrapped in StringIO because passing
        # literal HTML strings to read_html is deprecated in recent pandas versions.
        yield pd.read_html(StringIO(str(table)))[0]

# Data processing 
The processing below is specific to the Indira Gandhi Airport data; however, it will likely be helpful for other locations as well.

In [3]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function will be executed on each dataframe yielded by scrape_gen.

    Returns a new dataframe without the first row; the input dataframe is not modified.
    An empty dataframe is returned as an empty dataframe.
    """
    # The first row in the dataframe contains column names, we remove it.
    # The row is removed by position, so rows that happen to share the first row's index
    # label are kept, and an empty frame does not raise.
    return df.iloc[1:].copy()

In [6]:
# Column names and date format for Indira Gandhi Airport data
COLUMNS = [
        'Timestamp', 'Temperature', 'Pressure (Station)', 'Pressure', 'Humidity', 'Wind Direction', 
        'Wind Speed', 'Gust', 'Phenomenon', 'Phenomenon (Other)', 'Clouds', 'Visibility', 'Dewpoint',
]
# Format of the merged date + time text: year, month name, non-breaking space (\xa0), day,
# comma, weekday name, then HH:MM with no separator before the hour.
DATE_FORMAT = '%Y%B\xa0%d,%A%H:%M'


def postprocess(df: pd.DataFrame, start_date=None, end_date=None, inplace: bool = False) -> pd.DataFrame:
    """
    This function will be applied to the dataframe combined from all dataframes yielded from scrape_gen.

    df: dataframe whose columns 0 and 1 hold the date text and the time text of each reading;
        after merging them, the remaining columns must line up with COLUMNS.
    start_date: optional lower bound (inclusive) on the timestamps to keep.
    end_date: optional upper bound (inclusive) on the timestamps to keep.
    inplace: if true, modify ``df`` itself (and return it); otherwise work on a copy.

    The bounds are compared against the full timestamps, so a date-only value such as
    '08/11/2018' means midnight at the start of that day. To keep a whole final day, pass
    an end_date late in that day (e.g. '08/11/2018 23:59').

    Returns the processed dataframe, indexed by 'Timestamp' and sorted by it.
    """
    if not inplace:
        df = df.copy()

    # Initially, there are two columns for the date and time of the measurement. We merge them together and
    # convert to datetime. Text that does not match DATE_FORMAT becomes NaT.
    time_of_day = df.pop(1)
    df[0] = pd.to_datetime(df[0] + time_of_day, format=DATE_FORMAT, errors='coerce')

    # Some rows have year missing from their timestamp cells; however, these rows are always duplicated, due
    # to how the data is represented on the website, and can be safely deleted.
    df.dropna(subset=[0], inplace=True)

    df.columns = COLUMNS
    df.set_index('Timestamp', inplace=True)
    df.sort_index(inplace=True)

    # You sometimes get a rogue reading from the day following whatever day you run it on,
    # so, if limits are set, this gets rid of this faulty reading.
    # Rows are dropped from the frame itself (rather than rebinding to a filtered copy) so that
    # the limits are also applied to the caller's dataframe when inplace is true.
    if start_date is not None:
        df.drop(index=df.index[df.index < start_date], inplace=True)
    if end_date is not None:
        df.drop(index=df.index[df.index > end_date], inplace=True)

    return df

In [7]:
def scrape(start_date: str, end_date: str, *, postprocess=lambda x: x, preprocess=lambda x: x) -> pd.DataFrame:
    """
    Scrape every day in the range and return the result as a single dataframe.

    Each dataframe produced by scrape_gen is passed through ``preprocess``, the results are
    concatenated, and ``postprocess`` is applied to the concatenated dataframe. Both hooks
    default to the identity function.
    """
    daily_frames = (
        preprocess(day_frame)
        for day_frame in scrape_gen(start_date=start_date, end_date=end_date)
    )
    combined = pd.concat(daily_frames)
    return postprocess(combined)

# Example

In [None]:
# Fetch the raw (unprocessed) data for 8-11 August 2018, both days inclusive.
scrape(start_date='08/08/2018', end_date='08/11/2018')