In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this drive layout exists; consider a configurable data directory.
file_path = r'D:\FOLDER FROM THESIS\THESIS\Processed data\Training ML\filled_manipulated_28_11.geojson'

# Read the GeoJSON file into a GeoDataFrame.
gdf = gpd.read_file(file_path)

# Heading / slice pairs to show: the first and the last 20 rows.
raw_previews = (
    ("First 20 rows:", gdf.head(20)),
    ("\nLast 20 rows:", gdf.tail(20)),
)

# Lift pandas' column and row display limits only while printing, so the
# global display options are left as they were.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    for heading, rows in raw_previews:
        print(heading)
        print(rows)

In [None]:
# Fields treated as irrelevant for the analysis and removed up front.
columns_to_drop = [
    'byg021BygningensAnvendelse',
    'geometry',
    'byg404Koordinat',
    'byg406Koordinatsystem',
    'x',
    'y',
]
# NOTE: this rebinds `gdf`, so running the cell a second time fails -- the
# labels are already gone and `drop` raises KeyError for missing labels.
gdf = gdf.drop(columns_to_drop, axis=1)

# Categorical columns that get replaced by one indicator column per category.
categorical_columns = [
    'byg032YdervæggensMateriale',
    'byg033Tagdækningsmateriale',
    'byg056Varmeinstallation',
    'eta006BygningensEtagebetegnelse',
    'landscape',
    'TSYM',
    'byg021BygningensAnvendelse_grouped',
]
gdf_encoded = pd.get_dummies(data=gdf, columns=categorical_columns)

In [None]:
# Inspect the one-hot encoded frame: first and last 20 rows, printed with
# pandas' column and row display limits lifted for this cell only.
encoded_previews = (
    ("First 20 rows:", gdf_encoded.head(20)),
    ("\nLast 20 rows:", gdf_encoded.tail(20)),
)

with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    for heading, rows in encoded_previews:
        print(heading)
        print(rows)

In [None]:
# Column labels removed before the PCA (the variable name calls them correlated).
coorelated_variables_to_drop = [
    'maksimal5d', 'maksimal14', 'doegn10mm', 'doegn20mm',
    'time2aarsh', 'time5aarsh', 'time10aars', 'time20aars',
    'time50aars', 'time100aar', 'doegn5aars', 'doegn10aar',
    'doegn20aar', 'doegn50aar', 'doegn100aa', 'toerredage',
    'toerreperi', 'potentielf', 'solindstra', 'dagligmint',
    'lavestetem', 'gennemsn_1', 'gennemsnit', 'varmeboelg',
    'doegnetste', 'hedeboelge', 'hoejestete', 'vaekstsaes',
    'ekstremvin', 'maksimaldo', 'skybrud', 'aaretstemp',
    'e_value', 'g_value', 'count', 'building',
    'clay_accu_', 'streamlake', 'sand_accu',
]

# Drop exactly the listed columns; every other column of gdf_encoded
# (including the one-hot indicator columns) is kept as-is.
gdf_reduced = gdf_encoded.drop(coorelated_variables_to_drop, axis=1)

In [None]:
# Inspect the reduced frame: first and last 20 rows, printed with pandas'
# column and row display limits lifted for this cell only.
reduced_previews = (
    ("First 20 rows:", gdf_reduced.head(20)),
    ("\nLast 20 rows:", gdf_reduced.tail(20)),
)

with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    for heading, rows in reduced_previews:
        print(heading)
        print(rows)

In [None]:
# Count missing values per column of gdf_reduced -- the frame that the
# standardization cell below passes to the scaler.
nan_in_data = gdf_reduced.isnull().sum()

# Keep only the columns that actually contain NaN values and report them.
nan_columns = nan_in_data[nan_in_data > 0]
print("Columns with NaN values and their count:")
print(nan_columns)

# Overall verdict, derived from the same per-column counts so it always agrees
# with the table printed above. (It previously inspected gdf_encoded, i.e. a
# different frame from the one counted and scaled.)
if nan_columns.empty:
    print("There are no NaN values in the DataFrame.")
else:
    print("There are NaN values in the DataFrame.")

In [None]:
# Standardize every column of gdf_reduced: learn the per-column statistics
# first, then apply them to the same frame.
scaler = StandardScaler()
scaler.fit(gdf_reduced)
scaled_data = scaler.transform(gdf_reduced)

In [None]:
# Fit a 2-component PCA on the standardized data and project the data onto it.
# The result is bound to `pca_2d` / `pca_2d_result` because the loadings and
# biplot cells further down read exactly those names, while `pca` and
# `pca_result` are rebound by the 10-component cell that follows.
pca_2d = PCA(n_components=2)
pca_2d_result = pca_2d.fit_transform(scaled_data)

# Keep the generic names this cell has always assigned, pointing at the same objects.
pca = pca_2d
pca_result = pca_2d_result

# Explained variance ratio for each principal component
print("Explained Variance Ratio for 2 components:", pca.explained_variance_ratio_)


In [None]:
# Refit PCA keeping ten components; this rebinds `pca` and `pca_result`.
pca = PCA(n_components=10)
pca_result = pca.fit_transform(scaled_data)

# Share of the total variance captured by each of the ten components.
variance_ratio_10 = pca.explained_variance_ratio_
print("Explained Variance Ratio for 10 components:", variance_ratio_10)

In [None]:
# Scree plot for all PCs
# `plt` is provided by the import cell at the top of the notebook, so this
# cell runs on a fresh kernel (Restart & Run All).

# Largest number of components PCA can extract here: the smaller of the
# number of samples and the number of features.
max_components = min(scaled_data.shape)

# Fit PCA with that many components
pca_max = PCA(n_components=max_components)
pca_max_result = pca_max.fit_transform(scaled_data)

# Running total of the explained-variance ratios, component by component
cumulative_variance_max = pca_max.explained_variance_ratio_.cumsum()

# Plot cumulative explained variance against the number of components kept,
# using an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, max_components + 1), cumulative_variance_max, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Scree Plot for Maximum Components')
ax.grid(True)
plt.show()

# Print the cumulative explained variance
print("Cumulative Explained Variance for Maximum Components:", cumulative_variance_max)

In [None]:
# Loadings: transpose the component matrix (one row per variable afterwards)
# and scale each component's column by the square root of its explained variance.
component_scale = np.sqrt(pca_2d.explained_variance_)
coefficients = pca_2d.components_.T * component_scale

# Divide each score column by its own standard deviation.
score_std = pca_2d_result.std(axis=0)
pca_2d_result_scaled = pca_2d_result / score_std

In [None]:
# Biplot for the first two PCs
def pca_biplot_with_variance(score, coeff, pc_var, labels=None, show_scores=False):
    """Draw the loading vectors of the first two principal components.

    Every row of ``coeff`` is drawn as a red arrow from the origin to
    ``(coeff[i, 0], coeff[i, 1])`` on a new 12x8 figure whose axes are both
    fixed to [-1, 1]. No title is set and the figure is not shown; that is
    left to the caller.

    Parameters
    ----------
    score : 2-D array
        Projected observations; columns 0 and 1 are read as the x and y
        coordinates. They are only drawn when ``show_scores`` is true.
    coeff : 2-D array
        One row per variable; columns 0 and 1 are the arrow end points.
    pc_var : sequence of float
        ``pc_var[0]`` and ``pc_var[1]`` are rendered as percentages in the
        x and y axis labels.
    labels : sequence of str, optional
        ``labels[i]`` is written at the tip of arrow ``i``.
    show_scores : bool, default False
        Also scatter the scores behind the arrows. Points outside the fixed
        [-1, 1] window are not visible. The default keeps the loadings-only
        figure this function has always produced.

    Returns
    -------
    The matplotlib Axes the plot was drawn on.
    """
    xs = score[:, 0]
    ys = score[:, 1]

    fig, ax = plt.subplots(figsize=(12, 8))

    # Previously `xs` / `ys` were extracted but never used; drawing them is opt-in.
    if show_scores:
        ax.scatter(xs, ys, s=5, alpha=0.3)

    # Draw the vectors from the origin
    for i, (dx, dy) in enumerate(zip(coeff[:, 0], coeff[:, 1])):
        ax.arrow(0, 0, dx, dy, color='r', alpha=0.5, head_width=0.05, head_length=0.1)
        if labels is not None:
            ax.text(dx, dy, labels[i], color='g', ha='center', va='center')

    ax.set_xlim([-1, 1])
    ax.set_ylim([-1, 1])
    ax.set_xlabel(f"PC1 ({pc_var[0]:.2%} variance)")
    ax.set_ylabel(f"PC2 ({pc_var[1]:.2%} variance)")
    ax.grid()

    return ax

# Share of the total variance captured by each fitted component; used for the
# percentages in the axis labels.
pc_variance = pca_2d.explained_variance_ratio_

# Draw the PC1/PC2 biplot, labelling each arrow with the matching column name
# of gdf_reduced, then title and show the figure.
variable_names = gdf_reduced.columns.values
pca_biplot_with_variance(
    score=pca_2d_result_scaled,
    coeff=coefficients,
    pc_var=pc_variance,
    labels=variable_names,
)
plt.title('PCA Biplot with Variance for the First Two Principal Components')
plt.show()


In [None]:
# Second biplot (PC3 and PC4); all biplots were explored
# NOTE(review): this cell used to read `scaled_data_modified` and
# `gdf_reduced_modified`, neither of which is assigned anywhere in this
# notebook, so it raised NameError on a fresh kernel. It now uses `scaled_data`
# (the standardized matrix the other PCA cells fit on) and `gdf_reduced` (the
# frame whose columns label the PC1/PC2 biplot). If this figure was meant to
# come from a further-reduced frame, build that frame in a cell above and
# substitute it in both places below -- TODO confirm.
pca_4d = PCA(n_components=4)
pca_4d_result = pca_4d.fit_transform(scaled_data)

# Loadings for PC3 and PC4: rows 2 and 3 of the component matrix, transposed
# (one row per variable) and scaled by the square root of their explained variance.
coefficients_3_4 = pca_4d.components_[2:4].T * np.sqrt(pca_4d.explained_variance_[2:4])

def pca_biplot_3_4(score, coeff, pc_var, labels=None, show_scores=False):
    """Draw the loading vectors of the third and fourth principal components.

    Every row of ``coeff`` is drawn as a red arrow from the origin to
    ``(coeff[i, 0], coeff[i, 1])`` on a new 12x8 figure whose axes are both
    fixed to [-1, 1]. No title is set and the figure is not shown.

    Parameters
    ----------
    score : 2-D array
        Projected observations; columns 2 and 3 are read as the x and y
        coordinates. They are only drawn when ``show_scores`` is true.
    coeff : 2-D array
        One row per variable; column 0 is the PC3 loading and column 1 the
        PC4 loading (i.e. already restricted to these two components).
    pc_var : sequence of float
        ``pc_var[2]`` and ``pc_var[3]`` are rendered as percentages in the
        x and y axis labels.
    labels : sequence of str, optional
        ``labels[i]`` is written at the tip of arrow ``i``.
    show_scores : bool, default False
        Also scatter the scores behind the arrows. Points outside the fixed
        [-1, 1] window are not visible. The default keeps the loadings-only
        figure this function has always produced.

    Returns
    -------
    The matplotlib Axes the plot was drawn on.
    """
    xs = score[:, 2]  # PC3
    ys = score[:, 3]  # PC4

    fig, ax = plt.subplots(figsize=(12, 8))

    # Previously `xs` / `ys` were extracted but never used; drawing them is opt-in.
    if show_scores:
        ax.scatter(xs, ys, s=5, alpha=0.3)

    # Draw the vectors from the origin
    for i, (dx, dy) in enumerate(zip(coeff[:, 0], coeff[:, 1])):
        ax.arrow(0, 0, dx, dy, color='r', alpha=0.5, head_width=0.05, head_length=0.1)
        if labels is not None:
            ax.text(dx, dy, labels[i], color='g', ha='center', va='center')

    ax.set_xlim([-1, 1])
    ax.set_ylim([-1, 1])
    ax.set_xlabel(f"PC3 ({pc_var[2]:.2%} variance)")
    ax.set_ylabel(f"PC4 ({pc_var[3]:.2%} variance)")
    ax.grid()

    return ax

# Create the biplot for PC3 and PC4; the labels come from the same frame the
# PCA input was built from, so they line up with the rows of the loadings.
pca_biplot_3_4(pca_4d_result, coefficients_3_4, pca_4d.explained_variance_ratio_, labels=gdf_reduced.columns.values)
plt.title('PCA Biplot for PC3 and PC4')
plt.show()


In [None]:
#Scree plots using gdf_reduced which is the encoded gdf after dropping variables from correlation 

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# Standardize gdf_reduced (this rebinds `scaler`).
scaler = StandardScaler()
gdf_scaled = scaler.fit_transform(gdf_reduced)

# Fit PCA without limiting the number of components (this rebinds `pca`) and
# project the standardized data onto the components.
pca = PCA()
principal_components = pca.fit_transform(gdf_scaled)

# Number of components the fitted model ended up with
num_components = pca.n_components_

# Wrap the scores in a DataFrame with columns PC1, PC2, ... for further use
pc_names = [f'PC{k}' for k in range(1, num_components + 1)]
pca_df = pd.DataFrame(principal_components, columns=pc_names)

# Running total of the explained-variance ratios, component by component
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot cumulative explained variance against the number of components kept
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, num_components + 1), cumulative_variance, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Scree Plot (including one-hot encoded variables)')
ax.grid(True)
plt.show()