In [1]:
# Import the packages and modules
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [None]:
base_url = 'https://www.wunderground.com/history/monthly/us/il/chicago/KMDW/date/2018-1'

# Fetch the HTML content. The timeout keeps the cell from hanging forever
# if the server never responds.
page = requests.get(base_url, timeout=30)

# Parse the HTML content
soup = BeautifulSoup(page.content, 'html.parser')

# Check the HTML for tables. Listing only the <table> elements answers the
# question directly without dumping the whole document into the cell output.
soup.find_all('table')

## We ran into a problem: there are no tables in the downloaded HTML. The tables appear to be populated only when the page's JavaScript runs, and the 'requests' package does not execute JavaScript.

## I decided to use an alternative package that suits the purpose: 'Selenium', which drives Google Chrome to load the complete webpage and run its JavaScript.

In [1]:
# Import the packages and modules
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Configure Chrome for Selenium
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',  # headless Chrome: no new window opens every time a page is accessed
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    options.add_argument(chrome_flag)

# Start the Chrome webdriver session with the options above
driver = webdriver.Chrome(options=options)

# URL templates, one per year; '{page}' is filled in with the page number
# when scraping multiple pages
base_url_2018 = 'https://www.wunderground.com/history/monthly/us/il/chicago/KMDW/date/2018-{page}'
base_url_2019 = 'https://www.wunderground.com/history/monthly/us/il/chicago/KMDW/date/2019-{page}'

# Accumulator for the DataFrames built from each scraped page
all_dfs = list()


In [None]:
# CSS selector for the table(s) carrying the daily observations
DAYS_TABLE_SELECTOR = 'table[aria-labelledby="Days data"]'


def scrape_days_tables(driver, url, timeout=10):
    """Load one monthly-history page and return its "Days data" tables.

    Parameters
    ----------
    driver : selenium webdriver
        An already-started webdriver used to load the page.
    url : str
        Address of the page to scrape.
    timeout : int or float, default 10
        Seconds to wait for the "Days data" table to appear in the DOM.

    Returns
    -------
    pandas.DataFrame or None
        One DataFrame per matching table (one row per <tr>, one column per
        <td>/<th>, cell text stripped), concatenated side by side along
        axis 1. None if the parsed HTML contains no matching table.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no matching table is present within `timeout` seconds.
    """
    # Load the page and wait until the table is present on it
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, DAYS_TABLE_SELECTOR))
    )

    # Parse the rendered page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all tables with aria-labelledby="Days data"
    tables = soup.find_all('table', attrs={'aria-labelledby': 'Days data'})
    if not tables:
        return None

    # Build one DataFrame per table straight from its rows
    frames = [
        pd.DataFrame([
            [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            for row in table.find_all('tr')
        ])
        for table in tables
    ]

    # Concatenate along axis 1 to combine the tables into a single DataFrame
    return pd.concat(frames, axis=1)


# Start from an empty list so that re-running this cell does not append the
# same months a second time
all_dfs.clear()

# Loop through pages 1 to 12, first for the 2018 data and then for the 2019 data
for url_template in (base_url_2018, base_url_2019):
    for page_num in range(1, 13):
        url = url_template.format(page=page_num)
        page_df = scrape_days_tables(driver, url)
        if page_df is None:
            # Make a skipped page visible instead of dropping it silently
            print(f'No "Days data" table found at {url}; skipping.')
            continue
        all_dfs.append(page_df)

In [None]:
# Close the driver: ends the Selenium session and the headless Chrome
# instance that was started when the driver was initialized
driver.quit()

## Combine the pandas DataFrames and save the result to a CSV file.

In [None]:
# Stack the per-page DataFrames on top of each other (row-wise), replacing
# each page's own row labels with one continuous index
final_df = pd.concat(all_dfs, ignore_index=True)

# Write the combined table to a CSV file, leaving out the index column
final_df.to_csv(path_or_buf='WeatherData_2018_2019_UNPROCESSED.csv', index=False)