# Weather Data Extraction

This notebook extracts weather data for Canberra, Australia, using the Bureau of Meteorology's website.

## Import Necessary Libraries

```python

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def extract_weather_tables(start_month=1, end_month=11):
    # Base URL
    base_url = "https://reg.bom.gov.au/climate/dwo/2024{month:02}/html/IDCJDW2801.2024{month:02}.shtml"
    month_name = ["January", "February", "March", "April", "May", "June",
                  "July", "August", "September", "October", "November", "December"]
    
    meandf = []  # List to hold mean DataFrames for each month
    all_data = []  # List to hold all weather data

    # Loop through the months
    for month in range(start_month, end_month):
        # Create the URL for the current month
        url = base_url.format(month=month)
        print(f"Fetching data from: {month_name[month-1]}")
        
        # Send a request
        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the table
            table = soup.find('table')
            if table:
                # Convert the table into a DataFrame
                df = pd.read_html(str(table))[0]

                # Extract the mean values and store it in meandf
                if len(df) > 4:
                    mean_row = df.iloc[[-4]].copy()  # Use copy to avoid warnings
                    meandf.append(mean_row)  # Append the DataFrame

                # Remove the last 5 rows
                if len(df) > 5:
                    df = df.iloc[:-5]

                all_data.append(df)
            else:
                print(f"No table found in {url}.")
        else:
            print(f"Failed to retrieve data from {url} - Status code: {response.status_code}")

    # Concatenate all DataFrames into one
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)

        # Change the date field to datetime objects
        if 'Date' in combined_df.columns:
            # Generate date range from January 1 to November 14
            date_range = pd.date_range(start='2024-01-01', end='2024-10-31')
            combined_df['Date'] = date_range[:len(combined_df)]
        
        # Create a DataFrame for the mean values
        if meandf:
            mean_combined_df = pd.concat(meandf, ignore_index=True)
            return combined_df, mean_combined_df
        else:
            return combined_df, None
    else:
        return None, None

## Execute the Function

In [5]:
weather_data, mean_data = extract_weather_tables()
print(weather_data.head())
print(mean_data)

Fetching data from: January


  df = pd.read_html(str(table))[0]


Fetching data from: February


  df = pd.read_html(str(table))[0]


Fetching data from: March


  df = pd.read_html(str(table))[0]


Fetching data from: April


  df = pd.read_html(str(table))[0]


Fetching data from: May


  df = pd.read_html(str(table))[0]


Fetching data from: June


  df = pd.read_html(str(table))[0]


Fetching data from: July


  df = pd.read_html(str(table))[0]


Fetching data from: August


  df = pd.read_html(str(table))[0]


Fetching data from: September


  df = pd.read_html(str(table))[0]


Fetching data from: October
        Date Day Temps       Rain Evap   Sun Max wind gust              ...  \
        Date Day   Min   Max Rain Evap   Sun           Dir  Spd   Time  ...   
        Date Day    °C    °C   mm   mm hours          km/h km/h  local  ...   
0 2024-01-01  Mo  14.5  27.6    0  NaN   NaN            NE   30  18:14  ...   
1 2024-01-02  Tu  16.9  27.7  0.6  NaN   NaN           NNE   31  17:45  ...   
2 2024-01-03  We  17.2  30.2  0.4  NaN   NaN            SE   37  19:27  ...   
3 2024-01-04  Th  16.9  24.1  4.8  NaN   NaN           ESE   31  20:52  ...   
4 2024-01-05  Fr  14.4  23.9  4.2  NaN   NaN           ENE   43  16:15  ...   

  9 am                    3 pm                            
   Cld  Dir  Spd    MSLP  Temp  RH Cld  Dir  Spd    MSLP  
   8th km/h km/h     hPa    °C   % 8th km/h km/h     hPa  
0    8    E   13  1022.5  25.0  53   5    S    9  1018.0  
1    8    S    9  1018.3  26.2  56   1    N   11  1015.5  
2    7   SE    4  1016.2  28.7  54   8  WNW 

  df = pd.read_html(str(table))[0]


In [7]:
# Save the weather_data DataFrame to a CSV file
weather_data.to_csv('weather_data.csv', index=False)
print("Weather data saved to 'weather_data.csv'.")

Weather data saved to 'weather_data.csv'.


In [9]:
weather_data.head(5)

Unnamed: 0_level_0,Date,Day,Temps,Temps,Rain,Evap,Sun,Max wind gust,Max wind gust,Max wind gust,...,9 am,9 am,9 am,9 am,3 pm,3 pm,3 pm,3 pm,3 pm,3 pm
Unnamed: 0_level_1,Date,Day,Min,Max,Rain,Evap,Sun,Dir,Spd,Time,...,Cld,Dir,Spd,MSLP,Temp,RH,Cld,Dir,Spd,MSLP
Unnamed: 0_level_2,Date,Day,°C,°C,mm,mm,hours,km/h,km/h,local,...,8th,km/h,km/h,hPa,°C,%,8th,km/h,km/h,hPa
0,2024-01-01,Mo,14.5,27.6,0.0,,,NE,30,18:14,...,8,E,13,1022.5,25.0,53,5,S,9,1018.0
1,2024-01-02,Tu,16.9,27.7,0.6,,,NNE,31,17:45,...,8,S,9,1018.3,26.2,56,1,N,11,1015.5
2,2024-01-03,We,17.2,30.2,0.4,,,SE,37,19:27,...,7,SE,4,1016.2,28.7,54,8,WNW,13,1013.0
3,2024-01-04,Th,16.9,24.1,4.8,,,ESE,31,20:52,...,8,SE,7,1016.0,23.5,73,6,ESE,15,1016.2
4,2024-01-05,Fr,14.4,23.9,4.2,,,ENE,43,16:15,...,8,SSE,17,1021.5,22.4,54,8,E,17,1019.8
