In [3]:
# Import necessary libraries
import requests   # Library to make HTTP requests
import pandas as pd  # Library for data manipulation and analysis

# API key for accessing DMI
API_KEY = 'ed5aeebe-3bd2-4a32-b00d-de83a9cf113d'  # API key for accessing DMI

# Dictionary defining the URLs for each weather parameter and time period
PARAMETERS = {
    'mean_temp': {
        '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=mean_temp&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=mean_temp&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    },
    'mean_daily_max_temp': {
        '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=mean_daily_max_temp&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=mean_daily_max_temp&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    },
    'mean_daily_min_temp': {
        '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=mean_daily_min_temp&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=mean_daily_min_temp&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    },
    'mean_wind_speed': {
        '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=mean_wind_speed&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=mean_wind_speed&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    },
    'acc_precip': {
       '2018-2023': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=2018-02-12T00%3A00%3A00Z%2F..&parameterId=acc_precip&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84',
        'before-2018': 'https://dmigw.govcloud.dk/v2/climateData/collections/countryValue/items?datetime=..%2F2018-03-18T12%3A31%3A12Z&parameterId=acc_precip&timeResolution=month&bbox-crs=https%3A%2F%2Fwww.opengis.net%2Fdef%2Fcrs%2FOGC%2F1.3%2FCRS84'
    }
}


# Parameters common to all requests (for authentication)
params = {
    'api-key': API_KEY  
}

def fetch_and_process_data(url, params):
    """
    Fetch and process climate data from DMI.
    
    Args:
        url (str): Endpoint URL.
        params (dict): Request parameters.
        
    Returns:
        pd.DataFrame: Processed climate data.
    """
    
    # Send GET request to fetch data
    response = requests.get(url, params=params)
    response.raise_for_status()  # Ensure the response was successful
    
    # Parse the returned JSON data
    json_data = response.json()

    # Convert the JSON data to a DataFrame and select relevant columns
    df = pd.json_normalize(json_data['features'])
    df = df[["properties.from", "properties.value"]]

    # Extract and format the date from the "properties.from" column
    df['properties.from'] = df['properties.from'].str[:10]

    return df

# Fetch and store data for each parameter across specified time periods
dfs = {}  # Dictionary to store dataframes for each parameter

for parameter, urls in PARAMETERS.items():
    # Fetch data for the two defined periods (2018-2023 & before 2018)
    df_2018_2023 = fetch_and_process_data(urls['2018-2023'], params)
    df_before_2018 = fetch_and_process_data(urls['before-2018'], params)
    
    # Combine the DataFrames for this parameter into a single DataFrame
    dfs[parameter] = pd.concat([df_2018_2023, df_before_2018], ignore_index=True)

    


In [5]:
# Import necessary libraries 
# import requests
# import pandas as pd

# We'll use a main DataFrame to store combined data across all weather parameters
df_combined_all_parameters = None

# Loop through each weather parameter and its associated URLs
for parameter, urls in PARAMETERS.items():
    # Fetch data for both defined periods (2018-2023 & before 2018)
    df_2018_2023 = fetch_and_process_data(urls['2018-2023'], params)
    df_before_2018 = fetch_and_process_data(urls['before-2018'], params)
    
    # Merge the two DataFrames for the parameter
    df_parameter = pd.concat([df_2018_2023, df_before_2018], ignore_index=True)
    
    # Rename the value column to match the current parameter for clarity
    df_parameter.rename(columns={'properties.value': parameter}, inplace=True)

    # If this is our first parameter, initialize the main DataFrame
    # Otherwise, merge with existing data based on the date
    if df_combined_all_parameters is None:
        df_combined_all_parameters = df_parameter
    else:
        df_combined_all_parameters = pd.merge(df_combined_all_parameters, df_parameter, on='properties.from', how='outer')

# Sort the DataFrame based on the date
df_combined_all_parameters = df_combined_all_parameters.sort_values(by='properties.from')

# Reset the index after sorting
df_combined_all_parameters.reset_index(drop=True, inplace=True)

# Display the combined DataFrame with all parameters
df_combined_all_parameters


Unnamed: 0,properties.from,mean_temp,mean_daily_max_temp,mean_daily_min_temp,mean_wind_speed,acc_precip
0,2011-01-01,0.3,2.6,,4.2,47.8
1,2011-02-01,-0.1,1.6,,6.5,39.9
2,2011-03-01,3.0,,,5.1,29.9
3,2011-04-01,9.9,,,4.7,17.4
4,2011-05-01,11.3,,,4.9,54.4
...,...,...,...,...,...,...
178,2023-04-01,7.0,11.2,2.9,4.5,43.8
179,2023-05-01,11.2,16.0,6.4,4.6,14.1
180,2023-06-01,16.4,21.9,10.7,3.6,27.0
181,2023-07-01,15.9,19.9,11.9,4.7,140.8


In [6]:
# Filter out rows with dates later than "2023-07-01"
df_combined_all_parameters = df_combined_all_parameters[df_combined_all_parameters['properties.from'] <= '2023-07-01']

# Reset the index to ensure continuity
df_combined_all_parameters = df_combined_all_parameters.reset_index(drop=True)

# Display the consolidated and filtered DataFrame
df_combined_all_parameters


Unnamed: 0,properties.from,mean_temp,mean_daily_max_temp,mean_daily_min_temp,mean_wind_speed,acc_precip
0,2011-01-01,0.3,2.6,,4.2,47.8
1,2011-02-01,-0.1,1.6,,6.5,39.9
2,2011-03-01,3.0,,,5.1,29.9
3,2011-04-01,9.9,,,4.7,17.4
4,2011-05-01,11.3,,,4.9,54.4
...,...,...,...,...,...,...
177,2023-03-01,3.5,6.8,0.2,4.7,78.9
178,2023-04-01,7.0,11.2,2.9,4.5,43.8
179,2023-05-01,11.2,16.0,6.4,4.6,14.1
180,2023-06-01,16.4,21.9,10.7,3.6,27.0


In [7]:
# Rename columns for better clarity and readability
df_combined_all_parameters.rename(columns={
    "properties.from": "month",
    "mean_temp": "mean temp",
    "mean_daily_max_temp": "mean daily max temp",
    "mean_daily_min_temp": "mean daily min temp",
    "mean_wind_speed": "mean wind speed",
    "acc_precip": "accumulated precipitation"
}, inplace=True)

# Simplify the month representation by removing the day component
df_combined_all_parameters['month'] = df_combined_all_parameters['month'].str[:-3]

# Store the cleaned data in a CSV file
df_combined_all_parameters.to_csv('Weather_data.csv', index=False)
print("Data saved to 'Weather_data.csv'")


Data saved to 'Weather_data.csv'


In [8]:
# Detect duplicates across all columns
duplicates = df_combined_all_parameters[df_combined_all_parameters.duplicated(keep=False)]

# If duplicates exist, print them, remove them, and reset the index
if not duplicates.empty:
    print(f"{len(duplicates)} duplicates found:")
    print(duplicates)
    
    # Remove duplicates, retaining the first occurrence
    df_combined_all_parameters.drop_duplicates(keep='first', inplace=True)
    df_combined_all_parameters.reset_index(drop=True, inplace=True)
    print("\nDuplicates removed and index reset!")
else:
    print("No duplicates found.")


32 duplicates found:
       month  mean temp  mean daily max temp  mean daily min temp  \
86   2018-03        0.3                  2.6                 -1.9   
87   2018-03        0.3                  2.6                 -1.9   
88   2018-03        0.3                  2.6                 -1.9   
89   2018-03        0.3                  2.6                 -1.9   
90   2018-03        0.3                  2.6                 -1.9   
91   2018-03        0.3                  2.6                 -1.9   
92   2018-03        0.3                  2.6                 -1.9   
93   2018-03        0.3                  2.6                 -1.9   
94   2018-03        0.3                  2.6                 -1.9   
95   2018-03        0.3                  2.6                 -1.9   
96   2018-03        0.3                  2.6                 -1.9   
97   2018-03        0.3                  2.6                 -1.9   
98   2018-03        0.3                  2.6                 -1.9   
99   2018-03 

In [9]:
# Create a mask of rows without missing values across all columns
mask_no_nan = df_combined_all_parameters.notna().all(axis=1)

# Determine streaks of continuous months without missing values
streak_data = (mask_no_nan != mask_no_nan.shift()).cumsum()
streak_counts = streak_data[mask_no_nan].value_counts()

# Identify the number representing the longest streak of continuous months without NaNs
longest_streak_number = streak_counts.idxmax()

# Find the row indices for this longest streak
indices_of_longest_streak = streak_data[streak_data == longest_streak_number].index

# Identify the start and end months of this streak
start_month = df_combined_all_parameters.loc[indices_of_longest_streak[0], 'month']
end_month = df_combined_all_parameters.loc[indices_of_longest_streak[-1], 'month']

# Display the duration and timeframe of the longest streak
print(f"The longest streak of months without NaN entries is: {len(indices_of_longest_streak)} months.")
print(f"It started in {start_month} and ended in {end_month}.")


The longest streak of months without NaN entries is: 67 months.
It started in 2018-01 and ended in 2023-07.


In [None]:
# List of weather parameters for which we want to compute anomalies
columns_to_compute = [
    "mean temp", 
    "mean daily max temp", 
    "mean daily min temp", 
    "mean wind speed", 
    "accumulated precipitation"
]

# Compute the average for each parameter for each month across all years
monthly_avg = df_combined_all_parameters.groupby(df_combined_all_parameters['month'].str[-2:]).mean()[columns_to_compute]

def compute_anomaly(row, col_name):
    """
    Computes the anomaly for a given row and column.
    
    Parameters:
        - row (pd.Series): A row of the dataframe.
        - col_name (str): The column name for which we want to compute the anomaly.
        
    Returns:
        - float: The computed anomaly for the given row's month and column, rounded to one decimal place.
    """
    month = row['month'][-2:]  # Extract month from the yyyy-mm format
    return round(row[col_name] - monthly_avg.at[month, col_name], 1)

# Calculate the anomaly for each column in the dataframe
for col in columns_to_compute:
    anomaly_col_name = col + ' anomaly'
    df_combined_all_parameters[anomaly_col_name] = df_combined_all_parameters.apply(lambda row: compute_anomaly(row, col), axis=1)

# Print the updated dataframe
df_combined_all_parameters



In [10]:
# Save the processed DataFrame to a CSV file to create a persistent backup of the cleaned data.
df_combined_all_parameters.to_csv('weather_data_monthly.csv', index=False)

# Confirm the data was saved successfully
print("🎉 The weather data has been successfully saved to 'weather_data_monthly.csv'! 🎉")


🎉 The weather data has been successfully saved to 'weather_data_monthly.csv'! 🎉
