In [None]:
# Script for correlation matrices.
# First for all climate variables, then for a reduced set of climate variables, then for the
# selected climate variables, and finally for all remaining variables whose pairwise
# correlations lie within ±0.7 (|r| <= 0.7).

In [None]:
import geopandas as gpd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the GeoJSON file read below.
file_path = r'D:\FOLDER FROM THESIS\THESIS\Processed data\Training ML\filled_manipulated_28_11.geojson'

# Read the GeoJSON into a GeoDataFrame.
gdf = gpd.read_file(file_path)

# Heading / slice pairs to preview: the first and the last 20 rows.
previews = (
    ("First 20 rows:", gdf.head(20)),
    ("\nLast 20 rows:", gdf.tail(20)),
)

# Lift pandas' row/column display limits only while the previews are printed.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    for heading, frame in previews:
        print(heading)
        print(frame)

In [None]:
# Columns left out of the correlation analysis (listed by the author as non-numeric).
non_numeric_columns = [
    'Damage',
    'geometry',
    'byg021BygningensAnvendelse',
    'byg032YdervæggensMateriale',
    'byg033Tagdækningsmateriale',
    'byg056Varmeinstallation',
    'byg404Koordinat',
    'byg406Koordinatsystem',
    'eta006BygningensEtagebetegnelse',
    'x',
    'y',
    'landscape',
    'TSYM',
    'byg021BygningensAnvendelse_grouped',
]

# Build a new frame without those columns; gdf itself is left untouched.
numeric_gdf = gdf.drop(non_numeric_columns, axis='columns')

# NOTE(review): numeric_gdf is assumed to hold only numeric columns from here on;
# that holds only if the list above covers every non-numeric column in gdf - confirm.

In [None]:
# Pairwise correlation matrix of every column in numeric_gdf
corr_matrix = numeric_gdf.corr()

# Flatten the square matrix into a long table: one row per (Variable1, Variable2) pair
corr_table = corr_matrix.stack().reset_index()
corr_table.columns = ['Variable1', 'Variable2', 'Correlation']

# Remove self-correlation (a variable paired with itself).
# .copy() makes the filtered result an independent frame, so adding a column below
# does not write into a slice of the unfiltered table (avoids SettingWithCopyWarning).
corr_table = corr_table[corr_table['Variable1'] != corr_table['Variable2']].copy()

# Rank the pairs by the magnitude of the correlation, strongest first.
# Each pair appears twice, as (A, B) and (B, A), because the full matrix is stacked.
corr_table['Abs_Correlation'] = corr_table['Correlation'].abs()
sorted_corr_table = corr_table.sort_values(by='Abs_Correlation', ascending=False)

# Temporarily adjust display settings to show all data without truncation
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    # Display the top correlations
    print(sorted_corr_table.head(2000))  # Adjust the number as needed to see more or fewer pairs


In [None]:
# Correlation matrix for all climate variables

# Climate columns of gdf to include, under their original column names
climate_variables = [
    'toerredage', 'doegnetste', 'vaekstsaes', 'maksimaldo', 'skybrud', 'aaretstemp',
    'lavestetem', 'gennemsn_1', 'varmeboelg', 'toerreperi', 'potentielf', 'solindstra',
    'dagligmaxt', 'dagligmint', 'frostdoegn', 'hedeboelge', 'hoejestete', 'ekstremvin',
    'middelvind', 'maksimal5d', 'maksimal14', 'doegn10mm', 'doegn20mm', 'time2aarsh',
    'time5aarsh', 'time10aars', 'time20aars', 'time50aars', 'time100aar', 'doegn2aars',
    'doegn5aars', 'doegn10aar', 'doegn20aar', 'doegn50aar', 'doegn100aa'
]

# Mapping from original column name to the English label shown on the plot axes
translation_dict = {
    'toerredage': 'Number of dry days',
    'doegnetste': 'Diurnal temperature range',
    'vaekstsaes': 'Growing season length',
    'maksimaldo': 'Max daily precipitation',
    'skybrud': 'Cloudburst per year',
    'aaretstemp': 'Annual temperature range',
    'lavestetem': 'Min temperature',
    'gennemsn_1': 'Mean precipitation',
    'varmeboelg': 'Warm-wave days',
    'toerreperi': 'Maximum dry spell length',
    'potentielf': 'Potential evaporation',
    'solindstra': 'Solar radiation',
    'dagligmaxt': '24-hour mean max temperature',
    'dagligmint': '24-hour mean min temperature',
    'frostdoegn': 'Frost days',
    'hedeboelge': 'Heatwave days',
    'hoejestete': 'Max temperature',
    'ekstremvin': 'Extreme wind',
    'middelvind': 'Mean wind speed',
    'maksimal5d': '5-day max precipitation',
    'maksimal14': '14-day max precipitation',
    'doegn10mm': 'Days over 10 mm. precipitation',
    'doegn20mm': 'Days over 20 mm. precipitation',
    'time2aarsh': '2-year event hourly precipitation',
    'time5aarsh': '5-year event hourly precipitation',
    'time10aars': '10-year event hourly precipitation',
    'time20aars': '20-year event hourly precipitation',
    'time50aars': '50-year event hourly precipitation',
    'time100aar': '100-year event hourly precipitation',
    'doegn2aars': '2-year event in 24-hour precipitation',
    'doegn5aars': '5-year event 24-hour precipitation',
    'doegn10aar': '10-year event 24-hour precipitation',
    'doegn20aar': '20-year event 24-hour precipitation',
    'doegn50aar': '50-year event 24-hour precipitation',
    'doegn100aa': '100-year event 24-hour precipitation'
}

# Select the climate columns and rename them in one non-mutating step.
# Renaming the gdf[...] selection with inplace=True can emit pandas'
# SettingWithCopyWarning; rename() without inplace returns a fresh frame.
climate_df = gdf[climate_variables].rename(columns=translation_dict)

# Pairwise correlation between the (renamed) climate columns
climate_corr_matrix = climate_df.corr()

# Heatmap with the colour scale pinned to the full correlation range [-1, 1]
plt.figure(figsize=(12, 10))
sns.heatmap(climate_corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix (all climate variables)")
plt.xticks(rotation=45, ha='right')  # Slant x-axis labels
plt.yticks(rotation=0)  # Keep y-axis labels horizontal
plt.show()


In [None]:
# Correlation matrix for fewer selected climate variables

# Reduced list of climate columns (original column names in gdf)
climate_variables = [
    'maksimal5d', 'doegn2aars', 'toerreperi', 'dagligmaxt', 'frostdoegn', 'middelvind',
]

# Select the columns and apply the English labels from translation_dict
# (defined in the preceding "all climate variables" cell) without mutating a
# selection in place, which can emit pandas' SettingWithCopyWarning.
climate_df_fewer = gdf[climate_variables].rename(columns=translation_dict)

# Pairwise correlation between the selected climate columns
climate_corr_matrix = climate_df_fewer.corr()

# Set font size globally; this persists for every figure drawn afterwards
plt.rcParams.update({'font.size': 20})  # Adjust font size here

# Heatmap with the colour scale pinned to [-1, 1].
# sns.heatmap draws its own colorbar, so the label is passed through cbar_kws.
# A separate plt.colorbar() call has no pyplot-registered mappable to attach to
# here and would either fail or add a second, redundant bar.
plt.figure(figsize=(12, 10))
sns.heatmap(
    climate_corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.title("Correlation Matrix (fewer climate variables)")
plt.xticks(rotation=45, ha='right')  # Slant x-axis labels
plt.yticks(rotation=0)
plt.show()

In [None]:
# Selected climate variables

# Final selection of climate columns (original column names in gdf)
climate_variables = [
    'doegn2aars', 'dagligmaxt', 'frostdoegn', 'middelvind',
]

# Select the columns and apply the English labels from translation_dict
# (defined in the "all climate variables" cell) without mutating a selection
# in place, which can emit pandas' SettingWithCopyWarning.
climate_df_fewer = gdf[climate_variables].rename(columns=translation_dict)

# Pairwise correlation between the selected climate columns
climate_corr_matrix = climate_df_fewer.corr()

# Set font size globally; this persists for every figure drawn afterwards
plt.rcParams.update({'font.size': 23})  # Adjust font size here

# Heatmap with the colour scale pinned to [-1, 1].
# sns.heatmap draws its own colorbar, so the label is passed through cbar_kws.
# A separate plt.colorbar() call has no pyplot-registered mappable to attach to
# here and would either fail or add a second, redundant bar.
plt.figure(figsize=(12, 10))
sns.heatmap(
    climate_corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.title("Correlation Matrix (selected climate variables)")
plt.xticks(rotation=45, ha='right')  # Slant x-axis labels
plt.yticks(rotation=0)
plt.show()


In [None]:
# Numeric columns removed before the final correlation matrix is computed.
numeric_variables_to_drop = [
    'maksimal5d', 'maksimal14', 'doegn10mm', 'doegn20mm',
    'time2aarsh', 'time5aarsh', 'time10aars', 'time20aars', 'time50aars', 'time100aar',
    'doegn5aars', 'doegn10aar', 'doegn20aar', 'doegn50aar', 'doegn100aa',
    'toerredage', 'toerreperi', 'potentielf', 'solindstra', 'dagligmint',
    'lavestetem', 'gennemsn_1', 'gennemsnit', 'varmeboelg', 'doegnetste',
    'hedeboelge', 'hoejestete', 'vaekstsaes', 'ekstremvin', 'maksimaldo',
    'skybrud', 'aaretstemp', 'e_value', 'g_value', 'count', 'building',
    'clay_accu_', 'streamlake', 'sand_accu',
]

# Columns listed by the author as non-numeric, removed as well.
# NOTE(review): unlike the earlier exclusion list in this notebook, 'Damage' is
# not included here, so it stays in gdf_reduced - confirm this is intended.
non_numeric_columns = [
    'geometry',
    'byg021BygningensAnvendelse',
    'byg032YdervæggensMateriale',
    'byg033Tagdækningsmateriale',
    'byg056Varmeinstallation',
    'byg404Koordinat',
    'byg406Koordinatsystem',
    'eta006BygningensEtagebetegnelse',
    'x',
    'y',
    'landscape',
    'TSYM',
    'byg021BygningensAnvendelse_grouped',
]

# Remove both groups in a single drop; gdf itself is left untouched.
columns_to_remove = numeric_variables_to_drop + non_numeric_columns
gdf_reduced = gdf.drop(columns_to_remove, axis='columns')

In [None]:
# Final correlation matrix for all variables with r=<±0.7

# Mapping from column name in gdf_reduced to the English label shown on the plot.
# This replaces the climate-only translation_dict defined in an earlier cell.
translation_dict = {
    'byg026Opførelsesår': 'Construction year',
    'byg027OmTilbygningsår': 'Re-construction year',
    'byg054AntalEtager': 'Floors (count)',
    'eta020SamletArealAfEtage': 'Basement (area)',
    'areasqm_2': 'Building polygon area',
    'height_mea_2': 'Building height',
    'redoxgrid_code': 'Depth to redox boundary',
    'b_div_c': 'Building value',
    'landmovelandmove_idw25': 'Land movement',
    'dtm20': 'Terrain',
    'slope20': 'Slope',
    'groundwate': 'Depth to groundwater surface',
    'sand_depth': 'Depth to sand',
    # NOTE(review): column is named clay_depth but labelled "chalk" - confirm which is right
    'clay_depth': 'Depth to chalk',
    'coast': 'Distance to coast',
    'waterbodies': 'Distance to waterbodies',
    'bluespot': 'Bluespot',
    'doegn2aars': 'Precipitation (2-year event in 24-hour)',
    'dagligmaxt': 'Temperature (daily max)',
    'frostdoegn': 'Frost days',
    'middelvind': 'Mean wind speed'
}

# Apply the labels by reassignment instead of inplace=True.
# Columns without an entry in translation_dict keep their current name.
gdf_reduced = gdf_reduced.rename(columns=translation_dict)

# Pairwise correlation between the remaining columns
corr_matrix_reduced = gdf_reduced.corr()

# Heatmap with the colour scale pinned to [-1, 1].
# No font size is set here; the figure uses whatever plt.rcParams currently holds.
# sns.heatmap draws its own colorbar, so the label is passed through cbar_kws.
# A separate plt.colorbar() call has no pyplot-registered mappable to attach to
# here and would either fail or add a second, redundant bar.
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix_reduced,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.title("Correlation Matrix (r=<±0.7)")
plt.xticks(rotation=45, ha='right')  # Slant x-axis labels
plt.yticks(rotation=0)
plt.show()