In [27]:
# Import necessary libraries
import requests  # For making HTTP requests
import pandas as pd  # Essential for data manipulation and analysis

# Define API key and parameters for data retrieval
API_KEY = 'ed5aeebe-3bd2-4a32-b00d-de83a9cf113d'  # DMI API key

# Specify the URLs for each weather parameter and for different time periods
PARAMETERS = {
    'mean_temp': {
        '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=mean_daily_max_temp&timeResolution=day&limit=100000&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=mean_temp&timeResolution=day&limit=100000&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    },
    'mean_wind_speed': {
        '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=mean_wind_speed&timeResolution=day&limit=100000&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=mean_wind_speed&timeResolution=day&limit=100000&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    },
    'acc_precip': {
       '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=acc_precip&timeResolution=day&limit=100000&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=acc_precip&timeResolution=day&limit=100000&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    }
}

# Common parameters for all requests
params = {
    'api-key': API_KEY  # Using the API key for authentication
}

def fetch_and_process_data(url, params):
    """
    Fetches climate data from the DMI API and returns it as a DataFrame.
    
    Parameters:
        url (str): The URL endpoint to fetch data from.
        params (dict): The parameters for the API request.
        
    Returns:
        pd.DataFrame: Processed data in a pandas DataFrame format.
    """
    # Send GET request
    response = requests.get(url, params=params)
    # Ensure the request was successful
    response.raise_for_status()
    # Convert the received JSON data to a pandas DataFrame
    json_data = response.json()
    df = pd.json_normalize(json_data['features'])
    # Filter only the needed columns
    df = df[["properties.from", "properties.value"]]
    # Extract only the date part for easier interpretation
    df['properties.from'] = df['properties.from'].str[:10]
    
    return df

# Store individual DataFrames for each weather parameter here
dfs = {}

# For each parameter, fetch the data for both time periods and concatenate them
for parameter, urls in PARAMETERS.items():
    df_2018_2023 = fetch_and_process_data(urls['2018-2023'], params)
    df_before_2018 = fetch_and_process_data(urls['before-2018'], params)
    dfs[parameter] = pd.concat([df_2018_2023, df_before_2018], ignore_index=True)


In [28]:
# Initializing the main DataFrame to combine all parameters
df_combined_all_parameters = None

# Loop through each weather parameter to fetch and process data
for parameter, urls in PARAMETERS.items():
    # Fetch data for the two specified time periods
    df_2018_2023 = fetch_and_process_data(urls['2018-2023'], params)
    df_before_2018 = fetch_and_process_data(urls['before-2018'], params)
    
    # Concatenate both DataFrames to get a combined dataset
    df_parameter = pd.concat([df_2018_2023, df_before_2018], ignore_index=True)
    
    # Renaming the 'properties.value' column to the current weather parameter for clarity
    df_parameter.rename(columns={'properties.value': parameter}, inplace=True)

    # If the main DataFrame isn't initialized, use the current parameter's data
    if df_combined_all_parameters is None:
        df_combined_all_parameters = df_parameter
    else:
        # If already initialized, merge the current DataFrame based on the date column
        df_combined_all_parameters = pd.merge(df_combined_all_parameters, df_parameter, on='properties.from', how='outer')

# Sorting the combined DataFrame based on the date
df_combined_all_parameters = df_combined_all_parameters.sort_values(by='properties.from')

# Filtering the dataframe to remove entries after "2023-08-14", as this is the cut-off date for news articles
df_combined_all_parameters = df_combined_all_parameters[pd.to_datetime(df_combined_all_parameters['properties.from']) <= '2023-08-14']

# Resetting index after filtering
df_combined_all_parameters = df_combined_all_parameters.reset_index(drop=True)

# Display the combined DataFrame with all parameters
df_combined_all_parameters


Unnamed: 0,properties.from,mean_temp,mean_wind_speed,acc_precip
0,2011-01-01,2.2,7.8,0.5
1,2011-01-02,-1.8,2.7,0.0
2,2011-01-03,-3.7,2.3,0.2
3,2011-01-04,-0.3,5.3,2.1
4,2011-01-05,-1.2,6.6,1.4
...,...,...,...,...
4842,2023-08-10,17.5,6.1,0.1
4843,2023-08-11,21.7,3.5,0.0
4844,2023-08-12,21.1,2.7,7.3
4845,2023-08-13,21.5,3.8,0.3


In [29]:
# Define the complete date_range from the earliest to the latest date in the dataframe
start_date = df_combined_all_parameters['properties.from'].min()
end_date = df_combined_all_parameters['properties.from'].max()
date_range = pd.date_range(start_date, end_date).strftime('%Y-%m-%d').tolist()

# Convert 'properties.from' column to string for consistency
df_dates = df_combined_all_parameters['properties.from'].astype(str)

# Identify the missing dates
missing = set(date_range) - set(df_dates)
print(f"Number of Missing Dates: {len(missing)}\n")

# Check for duplicates in the combined dataframe
duplicated_rows = df_combined_all_parameters[df_combined_all_parameters.duplicated(subset='properties.from', keep='first')]
num_duplicates = duplicated_rows.shape[0]
print(f"Number of Duplicated Dates: {num_duplicates}\n")

# Drop duplicates from the combined dataframe
df_combined_all_parameters = df_combined_all_parameters.drop_duplicates(subset='properties.from', keep='first')

# Print the number of duplicates removed
print(f"{num_duplicates} duplicates have been removed from the dataframe.")

# Print the number of rows left in the dataframe
print(f"Number of rows left in the dataframe: {df_combined_all_parameters.shape[0]}")


Number of Missing Dates: 0

Number of Duplicated Dates: 238

238 duplicates have been removed from the dataframe.
Number of rows left in the dataframe: 4609


In [30]:
df_combined_all_parameters

Unnamed: 0,properties.from,mean_temp,mean_wind_speed,acc_precip
0,2011-01-01,2.2,7.8,0.5
1,2011-01-02,-1.8,2.7,0.0
2,2011-01-03,-3.7,2.3,0.2
3,2011-01-04,-0.3,5.3,2.1
4,2011-01-05,-1.2,6.6,1.4
...,...,...,...,...
4842,2023-08-10,17.5,6.1,0.1
4843,2023-08-11,21.7,3.5,0.0
4844,2023-08-12,21.1,2.7,7.3
4845,2023-08-13,21.5,3.8,0.3


In [34]:
# Renaming the column 'properties.from' to 'date' and ensuring it modifies the original DataFrame
df_combined_all_parameters = df_combined_all_parameters.rename(columns={'properties.from': 'date'})

# Convert 'date' column to datetime type for easier filtering
df_combined_all_parameters['date'] = pd.to_datetime(df_combined_all_parameters['date'])

# Filtering to keep only rows on or before "2023-08-14"
df_combined_all_parameters = df_combined_all_parameters[df_combined_all_parameters['date'] <= '2023-08-14']

# Extracting the month from the 'date' column using `.loc` to avoid the SettingWithCopyWarning
df_combined_all_parameters.loc[:, 'month'] = df_combined_all_parameters['date'].dt.month.astype(str).str.zfill(2)  # Using dt accessor to get month and then format it

# Calculating the monthly averages across all years for the specified weather parameters
monthly_means = df_combined_all_parameters.groupby('month').agg({
    'mean_temp': 'mean',
    'mean_wind_speed': 'mean',
    'acc_precip': 'mean'
}).reset_index()

# Merging the computed monthly averages with the original DataFrame on the 'month' column
df_combined_all_parameters = df_combined_all_parameters.merge(monthly_means, on='month', how='left', suffixes=('', '_monthly_avg'))

# Calculating deviations for each weather parameter by subtracting the monthly average 
# from the actual value and rounding the result to one decimal place
df_combined_all_parameters['mean_temp_deviation'] = (df_combined_all_parameters['mean_temp'] - df_combined_all_parameters['mean_temp_monthly_avg']).round(1)
df_combined_all_parameters['mean_wind_speed_deviation'] = (df_combined_all_parameters['mean_wind_speed'] - df_combined_all_parameters['mean_wind_speed_monthly_avg']).round(1)
df_combined_all_parameters['acc_precip_deviation'] = (df_combined_all_parameters['acc_precip'] - df_combined_all_parameters['acc_precip_monthly_avg']).round(1)

# Dropping the temporary monthly average columns and the 'month' column for a cleaner DataFrame
df_combined_all_parameters.drop(columns=['mean_temp_monthly_avg', 'mean_wind_speed_monthly_avg', 'acc_precip_monthly_avg', 'month'], inplace=True)

# Display the combined DataFrame with all parameters
df_combined_all_parameters


Unnamed: 0,date,mean_temp,mean_wind_speed,acc_precip,mean_temp_deviation,mean_wind_speed_deviation,acc_precip_deviation
0,2011-01-01,2.2,7.8,0.5,-0.6,2.5,-1.7
1,2011-01-02,-1.8,2.7,0.0,-4.6,-2.6,-2.2
2,2011-01-03,-3.7,2.3,0.2,-6.5,-3.0,-2.0
3,2011-01-04,-0.3,5.3,2.1,-3.1,-0.0,-0.1
4,2011-01-05,-1.2,6.6,1.4,-4.0,1.3,-0.8
...,...,...,...,...,...,...,...
4604,2023-08-10,17.5,6.1,0.1,-1.3,2.0,-2.6
4605,2023-08-11,21.7,3.5,0.0,2.9,-0.6,-2.7
4606,2023-08-12,21.1,2.7,7.3,2.3,-1.4,4.6
4607,2023-08-13,21.5,3.8,0.3,2.7,-0.3,-2.4


In [35]:
# To ensure a backup of the cleaned data, we'll save the DataFrame into a CSV format.
# This creates a persistent copy of the weather data that includes daily average deviations.

df_combined_all_parameters.to_csv('weather_data_daily.csv', index=False)

# Confirmation
print("🎉 The weather data with daily average deviations has been successfully saved to 'weather_data_daily.csv'! 🎉")


🎉 The weather data with daily average deviations has been successfully saved to 'weather_data_daily.csv'! 🎉
