In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Directory to store the downloaded files
directory_path = 'D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/'

# Create the directory (and any missing parents). exist_ok=True turns the call
# into a no-op when the directory is already there, so re-running this cell is
# safe and there is no gap between "check" and "create".
os.makedirs(directory_path, exist_ok=True)

# Base URL for the temperature records; each province directory name below is
# appended to it to form that province's listing URL.
base_url = 'https://dd.weather.gc.ca/climate/ltce/daily/temperature/'

# List of province directories (each entry keeps its trailing slash because it
# is concatenated directly onto base_url).
provinces = ['AB/', 'BC/', 'MB/', 'NB/', 'NL/', 'NS/', 'NT/', 'NU/', 'ON/', 'PE/', 'QC/', 'SK/', 'YT/']

def download_file(url, directory, timeout=30):
    """Download ``url`` and save it inside ``directory``.

    The file is stored under the last '/'-separated segment of ``url`` and any
    existing file with that name is overwritten.

    Args:
        url: Address of the file to fetch.
        directory: Folder the file is written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        None. Prints "Downloaded: <file name>" on success, or
        "Failed to download: <url>" when the request raises a
        ``requests.RequestException`` or answers with a status other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Keep the download best-effort: one unreachable file must not abort
        # the caller's loop over the remaining files.
        print(f"Failed to download: {url} ({error})")
        return

    if response.status_code != 200:
        print(f"Failed to download: {url}")
        return

    file_name = url.split('/')[-1]
    file_path = os.path.join(directory, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {file_name}")

def download_province_files(province, timeout=30):
    """Download every '.csv' file linked from one province's listing page.

    The listing URL is ``base_url + province``. Each ``<a>`` element whose
    ``href`` ends with '.csv' is handed to ``download_file`` together with the
    module-level ``directory_path``.

    Args:
        province: Province directory name appended to ``base_url``
            (e.g. ``'AB/'``).
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        None. Prints "Failed to list: <url>" and returns early when the listing
        request raises a ``requests.RequestException`` or answers with a status
        other than 200.
    """
    province_url = base_url + province
    try:
        response = requests.get(province_url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to list: {province_url} ({error})")
        return

    if response.status_code != 200:
        # Do not go on to parse an error page as if it were a directory listing.
        print(f"Failed to list: {province_url}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a'):
        file_name = link.get('href')
        # .get() returns None for anchors that have no href attribute.
        if file_name and file_name.endswith('.csv'):  # Assuming the files are in CSV format
            file_url = province_url + file_name
            download_file(file_url, directory_path)


# Download files from each province.
# Each entry of `provinces` is passed to download_province_files in list order.
# NOTE(review): nothing here skips files that are already on disk, so
# re-running this cell repeats every request.
for province in provinces:
    download_province_files(province)




Downloaded: climate_LTCE_Temperature-Records_AB_VSAB10V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB11V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB12V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB14V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB15V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB16V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB17V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB18V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB19V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB1VV.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB20V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB22V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB23V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB24V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB25V.csv
Downloaded: climate_LTCE_Temperature-Records_AB_VSAB26V.csv
Downloaded: climate_LTCE_Temperature-Rec

In [71]:
# Desired columns: the labels selected from every input file, in this order.
columns = [
    'LONGITUDE', 'LATITUDE', 'PROVINCE_CODE', 'VIRTUAL_STATION_NAME_E', 'VIRTUAL_CLIMATE_ID',
    'LOCAL_MONTH', 'LOCAL_DAY', 'RECORD_HIGH_MAX_TEMP', 'RECORD_HIGH_MAX_TEMP_YR',
    'PREV_RECORD_HIGH_MAX_TEMP', 'PREV_RECORD_HIGH_MAX_TEMP_YR', 'RECORD_LOW_MAX_TEMP',
    'RECORD_LOW_MAX_TEMP_YR', 'PREV_RECORD_LOW_MAX_TEMP', 'PREV_RECORD_LOW_MAX_TEMP_YR',
    'RECORD_HIGH_MIN_TEMP', 'RECORD_HIGH_MIN_TEMP_YR', 'PREV_RECORD_HIGH_MIN_TEMP',
    'PREV_RECORD_HIGH_MIN_TEMP_YR', 'RECORD_LOW_MIN_TEMP', 'RECORD_LOW_MIN_TEMP_YR',
    'PREV_RECORD_LOW_MIN_TEMP', 'PREV_RECORD_LOW_MIN_TEMP_YR', 'FIRST_HIGH_MAX_TEMP',
    'FIRST_HIGH_MAX_TEMP_YEAR', 'SECOND_HIGH_MAX_TEMP', 'SECOND_HIGH_MAX_TEMP_YEAR',
    'THIRD_HIGH_MAX_TEMP', 'THIRD_HIGH_MAX_TEMP_YEAR', 'FOURTH_HIGH_MAX_TEMP',
    'FOURTH_HIGH_MAX_TEMP_YEAR', 'FIFTH_HIGH_MAX_TEMP', 'FIFTH_HIGH_MAX_TEMP_YEAR',
    'FIRST_LOW_MAX_TEMP', 'FIRST_LOW_MAX_TEMP_YEAR', 'SECOND_LOW_MAX_TEMP',
    'SECOND_LOW_MAX_TEMP_YEAR', 'THIRD_LOW_MAX_TEMP', 'THIRD_LOW_MAX_TEMP_YEAR',
    'FOURTH_LOW_MAX_TEMP', 'FOURTH_LOW_MAX_TEMP_YEAR', 'FIFTH_LOW_MAX_TEMP',
    'FIFTH_LOW_MAX_TEMP_YEAR', 'FIRST_HIGH_MIN_TEMP', 'FIRST_HIGH_MIN_TEMP_YEAR',
    'SECOND_HIGH_MIN_TEMP', 'SECOND_HIGH_MIN_TEMP_YEAR', 'THIRD_HIGH_MIN_TEMP',
    'THIRD_HIGH_MIN_TEMP_YEAR', 'FOURTH_HIGH_MIN_TEMP', 'FOURTH_HIGH_MIN_TEMP_YEAR',
    'FIFTH_HIGH_MIN_TEMP', 'FIFTH_HIGH_MIN_TEMP_YEAR', 'FIRST_LOW_MIN_TEMP',
    'FIRST_LOW_MIN_TEMP_YEAR', 'SECOND_LOW_MIN_TEMP', 'SECOND_LOW_MIN_TEMP_YEAR',
    'THIRD_LOW_MIN_TEMP', 'THIRD_LOW_MIN_TEMP_YEAR', 'FOURTH_LOW_MIN_TEMP',
    'FOURTH_LOW_MIN_TEMP_YEAR', 'FIFTH_LOW_MIN_TEMP', 'FIFTH_LOW_MIN_TEMP_YEAR'
]

def combine_datasets(directory, exclude_files=('combined_temperature_data.csv',)):
    """Read the '.csv' files in ``directory`` and stack them into one DataFrame.

    Every file is read as UTF-8 first and re-read as ISO-8859-1 if that raises
    ``UnicodeDecodeError``. Only the labels listed in the module-level
    ``columns`` are kept. Each file's own index is preserved in the result
    (``ignore_index=False``).

    Args:
        directory: Folder to scan. Files are processed in sorted name order so
            the row order of the result is reproducible.
        exclude_files: Collection of file names to skip. The default skips
            'combined_temperature_data.csv', the file this notebook later
            writes into the same folder, so a re-run does not ingest its own
            output. Pass ``()`` to read every '.csv' file.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no file matched.

    Raises:
        KeyError: If a file lacks one of the labels in ``columns``.
    """
    frames = []
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.csv') or file_name in exclude_files:
            continue
        file_path = os.path.join(directory, file_name)
        print(f"Processing file: {file_path}")
        # NOTE(review): with index_col=None pandas may take the first field as
        # the index when data rows carry more fields than the header. The
        # captured output of this notebook suggests that is happening, and the
        # later cells re-label the columns to compensate. It is left unchanged
        # so those cells keep working; index_col=False is the documented way
        # to switch the inference off.
        try:
            df = pd.read_csv(file_path, encoding='utf-8', index_col=None)
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='ISO-8859-1', index_col=None)
        # Select only the desired columns
        frames.append(df[columns])

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop, which
    # re-copies all previously read rows for every file.
    return pd.concat(frames, ignore_index=False)


    
# Combine all downloaded datasets into one DataFrame, built from the '.csv'
# files that combine_datasets finds in directory_path.
combined_df = combine_datasets(directory_path)


Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_AB_VSAB10V.csv
Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_AB_VSAB11V.csv
Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_AB_VSAB12V.csv
Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_AB_VSAB14V.csv
Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_AB_VSAB15V.csv
Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_AB_VSAB16V.csv
Processing file: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/climate_LTCE_Temperature-Records_

In [63]:
# Preview the first rows of the combined frame.
# NOTE(review): in the output captured below the values do not line up with
# their labels (e.g. 'PROVINCE_CODE' shows 'AB-10') and the row index holds
# what look like longitudes; the following cells re-label the columns.
combined_df.head()

Unnamed: 0,LONGITUDE,LATITUDE,PROVINCE_CODE,VIRTUAL_STATION_NAME_E,VIRTUAL_CLIMATE_ID,LOCAL_MONTH,LOCAL_DAY,RECORD_HIGH_MAX_TEMP,RECORD_HIGH_MAX_TEMP_YR,PREV_RECORD_HIGH_MAX_TEMP,...,FIRST_LOW_MIN_TEMP,FIRST_LOW_MIN_TEMP_YEAR,SECOND_LOW_MIN_TEMP,SECOND_LOW_MIN_TEMP_YEAR,THIRD_LOW_MIN_TEMP,THIRD_LOW_MIN_TEMP_YEAR,FOURTH_LOW_MIN_TEMP,FOURTH_LOW_MIN_TEMP_YEAR,FIFTH_LOW_MIN_TEMP,FIFTH_LOW_MIN_TEMP_YEAR
-113.28,54.72,AB,AB-10,ATHABASCA,1,1,5.0,1984,4.4,1927.0,...,1928,-45.6,1911,-44.4,1950,-42.8,1934,-38.3,1947,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,2,9.5,1985,8.0,1984.0,...,1950,-42.8,1911,-40.6,1928,-38.3,1924,-38.3,1947,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,3,10.0,1985,7.2,1906.0,...,1950,-42.6,2009,-41.1,1949,-40.6,1901,-38.3,1924,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,4,9.4,1958,7.2,1947.0,...,1941,-44.4,1950,-43.3,1942,-41.7,1951,-41.1,1901,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,5,8.9,1928,6.1,1924.0,...,1909,-41.7,1901,-41.1,1951,-41.1,1959,-40.6,1950,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,6,7.2,1947,6.1,1928.0,...,1909,-45.6,1930,-43.3,1968,-40.0,1950,-38.9,1912,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,7,12.5,2003,9.4,1934.0,...,1909,-44.4,1953,-42.8,1950,-42.2,1968,-41.7,1930,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,8,10.6,1934,8.9,1928.0,...,1930,-41.7,1909,-41.7,1976,-40.0,1901,-40.0,1950,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,9,9.1,2012,8.9,1928.0,...,1930,-42.8,1901,-42.2,1912,-41.7,1957,-41.1,1969,1900-04-01
-113.28,54.72,AB,AB-10,ATHABASCA,1,10,10.0,1996,9.0,1986.0,...,1950,-43.9,1912,-43.3,1916,-43.3,1918,-42.2,1913,1900-04-01


In [72]:
# Turn the current index into an ordinary column, give the frame a fresh
# default index, and label the new column 'LONGITUDE'.
combined_df = combined_df.reset_index().rename(columns={'index': 'LONGITUDE'})

combined_df.head()

Unnamed: 0,LONGITUDE,LONGITUDE.1,LATITUDE,PROVINCE_CODE,VIRTUAL_STATION_NAME_E,VIRTUAL_CLIMATE_ID,LOCAL_MONTH,LOCAL_DAY,RECORD_HIGH_MAX_TEMP,RECORD_HIGH_MAX_TEMP_YR,...,FIRST_LOW_MIN_TEMP,FIRST_LOW_MIN_TEMP_YEAR,SECOND_LOW_MIN_TEMP,SECOND_LOW_MIN_TEMP_YEAR,THIRD_LOW_MIN_TEMP,THIRD_LOW_MIN_TEMP_YEAR,FOURTH_LOW_MIN_TEMP,FOURTH_LOW_MIN_TEMP_YEAR,FIFTH_LOW_MIN_TEMP,FIFTH_LOW_MIN_TEMP_YEAR
0,-113.28,54.72,AB,AB-10,ATHABASCA,1,1,5.0,1984,4.4,...,1928,-45.6,1911,-44.4,1950,-42.8,1934,-38.3,1947,1900-04-01
1,-113.28,54.72,AB,AB-10,ATHABASCA,1,2,9.5,1985,8.0,...,1950,-42.8,1911,-40.6,1928,-38.3,1924,-38.3,1947,1900-04-01
2,-113.28,54.72,AB,AB-10,ATHABASCA,1,3,10.0,1985,7.2,...,1950,-42.6,2009,-41.1,1949,-40.6,1901,-38.3,1924,1900-04-01
3,-113.28,54.72,AB,AB-10,ATHABASCA,1,4,9.4,1958,7.2,...,1941,-44.4,1950,-43.3,1942,-41.7,1951,-41.1,1901,1900-04-01
4,-113.28,54.72,AB,AB-10,ATHABASCA,1,5,8.9,1928,6.1,...,1909,-41.7,1901,-41.1,1951,-41.1,1959,-40.6,1950,1900-04-01


In [73]:
# Give every column the label of its right-hand neighbour. The last column has
# no neighbour to take a label from (None), so it is removed from the frame.
new_columns = [*combined_df.columns[1:], None]
combined_df = combined_df.iloc[:, :-1]
combined_df.columns = new_columns[:-1]

combined_df.head()

Unnamed: 0,LONGITUDE,LATITUDE,PROVINCE_CODE,VIRTUAL_STATION_NAME_E,VIRTUAL_CLIMATE_ID,LOCAL_MONTH,LOCAL_DAY,RECORD_HIGH_MAX_TEMP,RECORD_HIGH_MAX_TEMP_YR,PREV_RECORD_HIGH_MAX_TEMP,...,FIRST_LOW_MIN_TEMP,FIRST_LOW_MIN_TEMP_YEAR,SECOND_LOW_MIN_TEMP,SECOND_LOW_MIN_TEMP_YEAR,THIRD_LOW_MIN_TEMP,THIRD_LOW_MIN_TEMP_YEAR,FOURTH_LOW_MIN_TEMP,FOURTH_LOW_MIN_TEMP_YEAR,FIFTH_LOW_MIN_TEMP,FIFTH_LOW_MIN_TEMP_YEAR
0,-113.28,54.72,AB,AB-10,ATHABASCA,1,1,5.0,1984,4.4,...,-46.1,1928,-45.6,1911,-44.4,1950,-42.8,1934,-38.3,1947
1,-113.28,54.72,AB,AB-10,ATHABASCA,1,2,9.5,1985,8.0,...,-45.0,1950,-42.8,1911,-40.6,1928,-38.3,1924,-38.3,1947
2,-113.28,54.72,AB,AB-10,ATHABASCA,1,3,10.0,1985,7.2,...,-47.2,1950,-42.6,2009,-41.1,1949,-40.6,1901,-38.3,1924
3,-113.28,54.72,AB,AB-10,ATHABASCA,1,4,9.4,1958,7.2,...,-45.0,1941,-44.4,1950,-43.3,1942,-41.7,1951,-41.1,1901
4,-113.28,54.72,AB,AB-10,ATHABASCA,1,5,8.9,1928,6.1,...,-43.3,1909,-41.7,1901,-41.1,1951,-41.1,1959,-40.6,1950


In [74]:
# Save the combined dataset to a CSV file.
# The file is written without the DataFrame index (index=False).
# NOTE(review): the output lands in directory_path, the same folder that holds
# the downloaded input files.
combined_csv_path = os.path.join(directory_path, 'combined_temperature_data.csv')
combined_df.to_csv(combined_csv_path, index=False)
print(f"Combined dataset saved to: {combined_csv_path}")

Combined dataset saved to: D:/Documents/COURSES/GBC - BIG DATA TOOLS & TECHNO. II/Canada daily temperature/combined_temperature_data.csv
