# Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random
import matplotlib.cm as cm
import pandas as pd
import os

# Define constants

In [None]:
# Name of the data combination; used as a prefix for every output file.
comb_name = "TEST"

# Folder holding the pre-split train/test arrays (features X, labels y).
_data_dir = "C:/Users/elise/OneDrive - NTNU/Documents/A.NTNU10/forest_classification/data/TEST/"
X_test_path = _data_dir + "HYPSO_final_107bands_X_test_TEST.npy"
X_train_path = _data_dir + "HYPSO_final_107bands_X_train_TEST.npy"
y_test_path = _data_dir + "HYPSO_final_y_test_TEST.npy"
y_train_path = _data_dir + "HYPSO_final_y_train_TEST.npy"

# npz archive with the wavelength of every sensor band.
wavelength_file = "C:/Users/elise/Master/HYPSO/spectral_bands_HYPSO-1_v1.npz"

# Output folder for figures, CSV tables and arrays, plus the text summary file.
dir_path = "C:/Users/elise/OneDrive - NTNU/Documents/A.NTNU10/Code/Preprocessing/comb" + comb_name + "/"
output_file = dir_path + comb_name + "info.txt"

# Human-readable name of each class label.
label_names = {
    0: "Coniferous",
    1: "Deciduous",
}

In [None]:
# Ensure the output directory exists before anything is written under dir_path
# (figures, CSV tables, info file, .npy arrays). Missing parent folders are
# created as well, and exist_ok=True makes re-running this cell harmless.
os.makedirs(dir_path, exist_ok=True)

# Load data

In [None]:
# Read the pre-split feature matrices and label arrays from disk.
X_train = np.load(X_train_path)
X_test = np.load(X_test_path)
y_train = np.load(y_train_path)
y_test = np.load(y_test_path)

# Report every shape as a quick sanity check on the split.
for _name, _arr in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{_name} shape: {_arr.shape}")

In [None]:
# Distinct labels in the training set, the number of samples carrying each one,
# and the number of distinct classes.
unique_labels, count = np.unique(y_train, return_counts=True)
count_classes = len(unique_labels)

print(unique_labels, count, count_classes, sep="\n")

# Generate color map

In [None]:
# One Viridis colour per distinct training label, evenly spaced over [0, 1].
unique_labels = np.unique(y_train)
_colour_positions = np.linspace(0, 1, len(unique_labels))
viridis_colors = cm.viridis(_colour_positions)

print(viridis_colors)

# Load band numbers

In [None]:
# Wavelengths (in nanometers) of all satellite bands, stored under key "arr_0".
all_wavelengths = np.load(wavelength_file)["arr_0"]
print(f"Number of bands: {len(all_wavelengths)}")

# Zero-based indices of the bands dropped from the analysis.
bands_to_remove = [0,1,2,3,4,5, 106, 107, 108, 109, 119, 118, 117]

# Wavelengths of the bands that remain.
wavelengths = np.delete(all_wavelengths, bands_to_remove, axis=0)
print(f"Number of bands after removal: {len(wavelengths)}")

# One-based band numbers of the remaining bands: number the bands 1..N first,
# then drop the same positions.
band_nrs = np.delete(np.arange(1, len(all_wavelengths) + 1), bands_to_remove, axis=0)
print(f"Band numbers after removal: {band_nrs}")

# Plot spectral signatures

In [None]:
def plot_spectral_signatures(input_array, label_array, save_name, labels=(1, 2, 3), pixel_nr=-1):
    """
    Plot the spectrum of one sample per requested label and save the figure.

    For every entry of ``labels`` one sample carrying that label is picked
    (at random, or by position via ``pixel_nr``) and its values are plotted
    against the module-level ``wavelengths``. The figure is written to
    ``{dir_path}{comb_name}spectral_signatures_{save_name}.png`` and shown.

    Args:
        input_array (numpy array): Spectral data, indexed as
            ``input_array[sample, :]``. Presumably (samples, bands) with one
            value per entry of ``wavelengths`` -- confirm against the caller.
        label_array (numpy array): Label of every sample; compared
            element-wise against each entry of ``labels``.
        save_name (str): Suffix inserted into the output file name.
        labels (sequence of int): Label values to plot. A label without any
            matching sample is reported and skipped.
        pixel_nr (int): Position, among the samples matching a label, of the
            sample to plot. The default ``-1`` selects a random match.
    """
    plt.figure(figsize=(8, 5))

    for i, label_x in enumerate(labels):
        class_name = label_names.get(label_x, 'Unknown')

        # Find samples with the specified label
        pixel_indices = np.argwhere(label_array == label_x)

        if pixel_indices.size == 0:
            print(f"No pixels found with label {label_x}, skipping...")
            continue

        # Random match unless the caller pinned a specific position
        if pixel_nr == -1:
            position = random.randint(0, pixel_indices.shape[0] - 1)
        else:
            position = pixel_nr
        indice = pixel_indices[position]

        print(f"Selected pixel for {class_name}: {indice}")

        # argwhere yields an index array, so the lookup keeps an extra axis;
        # flatten it into a plain 1D spectrum
        spectrum = input_array[indice, :].flatten()

        # The colour follows the position within `labels`, not the label value
        plt.plot(
            wavelengths, spectrum, marker='o', linestyle='-',
            color=viridis_colors[i], label=f"{class_name}"
        )

    # Labels and formatting
    plt.xlabel("Wavelength (nm)")
    plt.ylabel("Reflectance")
    plt.title("Spectral Response")
    plt.grid(True)
    plt.legend()
    plt.savefig(f"{dir_path}{comb_name}spectral_signatures_{save_name}.png")
    plt.show()

# Function to compute statistics
def compute_spectral_statistics(input_array, label_array, labels=(0, 1)):
    """
    Compute the per-band mean and standard deviation of every requested class.

    Args:
        input_array (numpy array): Spectral data, indexed as
            ``input_array[mask, :]`` and reduced along axis 0. Presumably
            (samples, bands) -- confirm against the caller.
        label_array (numpy array): Label of every sample; compared
            element-wise against each entry of ``labels``.
        labels (sequence of int): Label values to summarise. A label without
            any matching sample is reported and left out of the result.

    Returns:
        dict: Maps each label that was found to ``{"mean": ..., "std": ...}``,
        both computed over that label's samples along axis 0.
    """
    stats = {}
    for label in labels:
        mask = label_array == label
        if np.sum(mask) == 0:
            # .get() keeps the message from raising KeyError for a label
            # that has no entry in label_names
            print(f"No pixels found for {label_names.get(label, 'Unknown')}, skipping...")
            continue

        spectra = input_array[mask, :]
        stats[label] = {
            "mean": np.mean(spectra, axis=0),
            "std": np.std(spectra, axis=0),
        }

    return stats

In [None]:
# Plot one randomly chosen training spectrum for each of the labels 0 and 1; the figure file name gets the "all" suffix.
plot_spectral_signatures(X_train, y_train, "all", labels=[0, 1])


In [None]:
# Compute the per-class mean and standard deviation of every band
spectral_stats = compute_spectral_statistics(X_train, y_train)

# One table per class: kept in `dataframes`, written to CSV and previewed
dataframes = {}
for label, stats in spectral_stats.items():
    df = pd.DataFrame({
        "Wavelength (nm)": wavelengths,
        "Mean Reflectance": stats["mean"],
        "Std Dev": stats["std"]
    })
    dataframes[label] = df
    # Save info in file
    df.to_csv(f"{dir_path}{comb_name}stats_{label_names[label]}.csv", index=False)
    print(f"Spectral Statistics for {label_names[label]}:")
    print(df.head(), "\n")

# One colour per named class, shared by both figures below
viridis_colors = cm.viridis(np.linspace(0, 1, len(label_names)))

# Plot the mean spectral signatures with a +/- one standard deviation band
plt.figure(figsize=(8, 5))
for i, (label, stats) in enumerate(spectral_stats.items()):
    plt.plot(
        wavelengths, stats["mean"], marker='o', linestyle='-',
        color=viridis_colors[i], label=f"{label_names[label]}"
    )
    plt.fill_between(wavelengths, stats["mean"] - stats["std"], stats["mean"] + stats["std"], color=viridis_colors[i], alpha=0.2)

plt.xlabel("Wavelength (nm)")
plt.ylabel("Reflectance")
plt.title("Mean Spectral Response with Variance")
plt.grid(True)
plt.legend()
plt.savefig(f"{dir_path}{comb_name}spectral_comparison.png")
plt.show()

# Standard deviation per band, one subplot row per class. The row count
# follows the number of classes actually present instead of a fixed 2.
plt.figure()
plt.suptitle('Variance of bands within each class')
for i, (label, stats) in enumerate(spectral_stats.items()):
    plt.subplot(len(spectral_stats), 1, i + 1)
    plt.plot(
        wavelengths, stats["std"], marker='o', linestyle='-',
        color=viridis_colors[i], label=f"{label_names[label]}"
    )
    plt.title(f'{label_names[label]}')
    plt.xlabel("Wavelength (nm)")
    plt.ylabel("Standard Deviation")
plt.subplots_adjust(hspace=0.5)  # Add more space between subplots

plt.savefig(f"{dir_path}{comb_name}variance_bands_within_class.png")
plt.show()


In [None]:
import os
import numpy as np
import pickle

sensor_name = "Hypso"

# Bundle the wavelengths and the per-class spectral statistics
data_to_save = {
    "wavelengths": wavelengths,  # Adjust this for different sensors
    "spectral_stats": spectral_stats
}

# Save in the "Dataset" folder. Unlike dir_path, this folder is not created
# anywhere else, so create it here to keep open() from failing.
dataset_dir = "C:/Users/elise/OneDrive - NTNU/Documents/A.NTNU10/Code/Dataset/"
os.makedirs(dataset_dir, exist_ok=True)
with open(f"{dataset_dir}spectral_data_{sensor_name}_{comb_name}.pkl", "wb") as f:
    pickle.dump(data_to_save, f)


# Perform PCA

In [None]:
from sklearn.decomposition import PCA

# PCA with default arguments (no explicit number of components requested)
pca = PCA()

# Fit on the training data and keep the training samples projected onto the
# principal components; later cells index its columns by component.
pca_components = pca.fit_transform(X_train)

# Get absolute contributions (loadings): later cells read row k as the
# per-band loadings of principal component k+1.
contribution = np.abs(pca.components_)


# Plot contributions

In [None]:
# Set Viridis colormap
colors = plt.cm.viridis(np.linspace(0, 1, 4))

# One stem plot of the absolute loadings per band for each of the first four
# principal components; each figure is saved as pc<N>.png and shown.
for pc_nr in range(1, 5):
    plt.figure(figsize=(10, 5))
    plt.stem(
        band_nrs,
        contribution[pc_nr - 1, :],
        label=f"PC{pc_nr}",
        linefmt='-',
        markerfmt='o',
        basefmt='b-',
    )
    plt.title(f"Loadings for Principal Component {pc_nr}")
    plt.xlabel("Spectral Band")
    plt.ylabel("Contribution")
    plt.legend()
    plt.savefig(f"{dir_path}{comb_name}pc{pc_nr}.png")
    plt.show()

# Importance of PCs

In [None]:
# Split the PCA-projected training samples by class label, keeping row order.
# Each list holds one row of pca_components per sample of that class.
pca_components_0 = [row for row, lab in zip(pca_components, y_train) if lab == 0]
pca_components_1 = [row for row, lab in zip(pca_components, y_train) if lab == 1]



In [None]:
# Typical score of each PC within each class. The median is used instead of
# the mean to be more robust to noise (the variables keep the "_mean" suffix).
pca_components_1_mean = np.median(np.array(pca_components_1), axis=0)
pca_components_0_mean = np.median(np.array(pca_components_0), axis=0)

# Absolute difference between the class medians, per PC
tot_diff = np.abs(pca_components_1_mean - pca_components_0_mean)

# Variance of every PC within each class, computed once and reused below
pc_var_0 = np.var(np.array(pca_components_0), axis=0)
pc_var_1 = np.var(np.array(pca_components_1), axis=0)

# Here we see that the variance within a class mostly lies in the first PCs, this should maybe be incorporated into the decision..
# Two rows, one per class (the grid used to reserve a third row that was never drawn).
plt.figure()
plt.suptitle('Variance of principal components within each class')
plt.subplot(2, 1, 1)
plt.plot(pc_var_0, color=viridis_colors[0])
plt.title(f'{label_names[0]}')
plt.yscale('log')
plt.subplot(2, 1, 2)
plt.plot(pc_var_1, color=viridis_colors[1])
plt.title(f'{label_names[1]}')
plt.yscale('log')
plt.subplots_adjust(hspace=0.5)  # keep the second title clear of the first axis
plt.savefig(f"{dir_path}{comb_name}variance_within_class.png")
plt.show()

# Within-class variance relative to the absolute class median, in percent.
# NOTE(review): a median of exactly zero makes the corresponding entry inf/nan.
percent_var_0 = pc_var_0 / np.abs(np.array(pca_components_0_mean)) * 100
percent_var_1 = pc_var_1 / np.abs(np.array(pca_components_1_mean)) * 100

print(f"Percent variance compared to mean per PC for {label_names[1]}: {percent_var_1}")
print(f"Percent variance compared to mean per PC for {label_names[0]}: {percent_var_0}")

# Class separation of the first ten PCs
plt.figure()
plt.title('PCs weighted based on importance for classifying tree species')
plt.stem(tot_diff[:10], linefmt='-', markerfmt='o', basefmt='b-')
plt.xlabel("Principal Component")
plt.ylabel("Weighted Importance")
plt.savefig(f"{dir_path}{comb_name}pcs_importance.png")
plt.show()

In [None]:
# Explained variance ratio
explained_variance = pca.explained_variance_ratio_

# Explained variance of the first 15 PCs
plt.figure(figsize=(8, 5))
plt.stem(range(len(explained_variance[:15])), explained_variance[:15], linefmt='-', markerfmt='o', basefmt='b-')
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance")
plt.title("Explained Variance by Principal Components")
plt.grid()
plt.savefig(f"{dir_path}{comb_name}explained_variance.png")
plt.show()

# Cumulative explained variance over all PCs
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(explained_variance), marker='o', color=viridis_colors[0])
plt.xlabel("Principal Component")
plt.ylabel("Cumulative Explained Variance")
plt.title("Cumulative Explained Variance by Principal Components")
plt.grid()
plt.savefig(f"{dir_path}{comb_name}cumulative_explained_variance.png")
plt.show()

# Same curve restricted to the first 15 PCs
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(explained_variance)[:15], marker='o', color=viridis_colors[0])
plt.xlabel("Principal Component")
plt.ylabel("Cumulative Explained Variance")
plt.title("Cumulative Explained Variance by Principal Components")
plt.grid()
plt.savefig(f"{dir_path}{comb_name}cumulative_explained_variance_15.png")
plt.show()


constant_idx = 10  # Number of components to keep for the PCA
# Compute cumulative explained variance
var_threshold = 0.95
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# argmax returns the first index where the threshold is reached; +1 turns that
# zero-based index into a component count
idx_threshold = np.argmax(cumulative_variance >= var_threshold) + 1
print(f"{var_threshold*100}% of variance can be explained with {idx_threshold} principal components")

# Write to file
with open(output_file, "w") as file:
    file.write(f'\n\n{var_threshold*100}% of variance can be explained with {idx_threshold} principal components:\n')
    for i in range(idx_threshold):
        file.write(f"{i+1}. PC {i}\n")  # Principal components are naturally ordered

# Project the test data once and slice it for both reduced variants
pca_X_test = pca.transform(X_test)

# Select the top `idx_threshold` principal components. The training side must
# come from the PCA projection (pca_components), not from the raw X_train
# bands, so that train and test features live in the same space.
pca_X_train_threshold = pca_components[:, :idx_threshold]
print(f"Shape of original train data: {X_train.shape}")
print(f"Shape of reduced train data for threshold idx: {pca_X_train_threshold.shape}")

pca_X_test_threshold = pca_X_test[:, :idx_threshold]  # Same for test data
print(f"Shape of original test data: {X_test.shape}")
print(f"Shape of reduced test data for threshold idx: {pca_X_test_threshold.shape}")

# Fixed-size variant: the first `constant_idx` principal components
pca_X_train_constant = pca_components[:, :constant_idx]
print(f"Shape of reduced train data for constant idx: {pca_X_train_constant.shape}")

pca_X_test_constant = pca_X_test[:, :constant_idx]
print(f"Shape of reduced test data for constant idx: {pca_X_test_constant.shape}")



In [None]:
# Rank the PCs by the between-class difference of their median scores:
# argsort orders ascending, so flip it to get the largest difference first.
top_pcs = np.flip(np.argsort(tot_diff))

# Show the five best-separating PCs (zero-based component indices)
print("Top 5 Principal Components for Class Differentiation:", top_pcs[:5])

In [None]:
# 2D scatter of the two PCs that separate the classes best, coloured by label
first_pc, second_pc = top_pcs[0], top_pcs[1]
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    pca_components[:, first_pc],
    pca_components[:, second_pc],
    c=y_train,
    cmap=cm.colors.ListedColormap([viridis_colors[0], viridis_colors[1]]),
    alpha=0.7,
)
cbar = plt.colorbar(scatter, ticks=[0, 1])
cbar.ax.set_yticklabels([label_names[0], label_names[1]])
# Axis labels use one-based component numbers
plt.xlabel(f"Principal Component {first_pc+1}")
plt.ylabel(f"Principal Component {second_pc+1}")
plt.title("Scatter Plot of the Two Most Important PCs")
plt.savefig(f"{dir_path}{comb_name}scatter_2D.png")
plt.show()

# 3D scatter that adds the third best-separating PC
third_pc = top_pcs[2]
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    pca_components[:, first_pc],
    pca_components[:, second_pc],
    pca_components[:, third_pc],
    c=y_train,
    cmap=cm.colors.ListedColormap([viridis_colors[0], viridis_colors[1]]),
    alpha=0.7,
)
cbar = plt.colorbar(scatter, ticks=[0, 1])
cbar.ax.set_yticklabels([label_names[0], label_names[1]])
ax.set_xlabel(f"Principal Component {first_pc+1}")
ax.set_ylabel(f"Principal Component {second_pc+1}")
ax.set_zlabel(f"Principal Component {third_pc+1}")
plt.title("Scatter Plot of the Three Most Important PCs")
plt.savefig(f"{dir_path}{comb_name}scatter_3D.png")
plt.show()

# Importance of bands

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 1. Normalize the absolute loadings of each PC so that every row sums to 1
contribution_normalized = contribution / contribution.sum(axis=1, keepdims=True)

# 2. Calculate the weighted contribution of each band:
# every PC's normalized loadings are weighted by that PC's explained variance
# ratio (pca.explained_variance_ratio_) and summed over the PCs
weighted_wavelengths = pca.explained_variance_ratio_ @ contribution_normalized

# 3. Sort bands in descending order of weighted contribution
sorted_indices = np.argsort(weighted_wavelengths)[::-1]

# Calculate the cumulative weighted contribution (in that descending order)
total_weight = np.sum(weighted_wavelengths)
cumulative_weight = np.cumsum(weighted_wavelengths[sorted_indices])

# 4. Determine the number of bands required to reach the desired threshold (80% here).
# NOTE: var_threshold and idx_threshold are reassigned here, replacing the
# values computed for the principal components in the explained-variance cell.
var_threshold = 0.8
idx_threshold = np.searchsorted(cumulative_weight, var_threshold * total_weight) + 1

# Get the indices of the selected bands
selected_indices = sorted_indices[:idx_threshold]
band_indices = band_nrs[selected_indices]  # one-based band numbers of the selected bands (band_nrs is built with "+ 1")

print(f"Band indices: {band_indices}")
print(f"{var_threshold*100}% of variance can be explained with {idx_threshold} bands")

# 5. Reduce the training and test data to only the selected bands
# (the bands are selected along axis 1, i.e. the columns of the data matrices)
reduced_X_train = X_train[:, selected_indices]
reduced_X_test = X_test[:, selected_indices]

print(f"Shape of original train data: {X_train.shape}")
print(f"Shape of reduced train data: {reduced_X_train.shape}")
print(f"Shape of original test data: {X_test.shape}")
print(f"Shape of reduced test data: {reduced_X_test.shape}")

# 6. Plot the weighted contribution of each selected band (highest first) and
# the cumulative contribution; both figures are saved but not shown explicitly
plt.figure()
plt.stem(weighted_wavelengths[sorted_indices][:idx_threshold])
plt.ylabel('Explained variance')
plt.xlabel('Band')
plt.savefig(f"{dir_path}{comb_name}explained_variance_per_band.png")

plt.figure()
plt.plot(cumulative_weight[:idx_threshold], marker='o')
plt.ylabel('Cumulative explained variance')
plt.xlabel('Band')
plt.savefig(f"{dir_path}{comb_name}cumulative_explained_variance_bands.png")


In [None]:
# Append the selected bands to the info file: rank, one-based band number and
# the wavelength looked up in the full (unfiltered) wavelength table.
with open(output_file, "a") as file:
    file.write(f'\n\n{var_threshold*100}% of variance can be explained with {idx_threshold} bands:\n')
    file.writelines(
        f"{rank}. Band {band}, {all_wavelengths[band-1]} nm\n"
        for rank, band in enumerate(band_indices, start=1)
    )

In [None]:
# Reference plot: the first training sample with label 1, all bands present
plot_spectral_signatures(X_train, y_train, "with_bands", labels=[1], pixel_nr=0)

# Create a mask over the band axis (axis 1 of X_train)
mask = np.ones(X_train.shape[1], dtype=bool)  # True for all bands initially
mask[selected_indices] = False  # Selected bands become False, so True now marks the bands that were NOT selected

# Copy data so that X_train itself is left untouched
reduced_data = np.copy(X_train)

# Set non-selected bands to zero; the array keeps its original shape
reduced_data[:, mask] = 0  

print(f"Selected indices: {selected_indices}")
print(f"Shape of original train data: {X_train.shape}")
print(f"Shape of reduced train data: {reduced_data.shape}")


# Same sample again, now with every non-selected band zeroed out
plot_spectral_signatures(reduced_data, y_train, "removed_bands", labels=[1], pixel_nr=0)

# Save data

In [None]:
# Print the shape of every dataset variant before saving
print(f"Shape of original train data: {X_train.shape}")
print(f"Shape of original test data: {X_test.shape}")

# Fixed number of principal components (constant_idx)
print(f"Shape of pca {constant_idx} train data: {pca_X_train_constant.shape}")
print(f"Shape of pca {constant_idx} test data: {pca_X_test_constant.shape}")
# Principal components needed to reach the variance threshold
print(f"Shape of pca threshold train data: {pca_X_train_threshold.shape}")
print(f"Shape of pca threshold test data: {pca_X_test_threshold.shape}")
# Band-selected data
print(f"Shape of reduced train data: {reduced_X_train.shape}")
print(f"Shape of reduced test data: {reduced_X_test.shape}")



In [None]:
# Save to numpy arrays

# Fixed number of principal components
np.save(f"{dir_path}{comb_name}_X_train_pca_{constant_idx}.npy", pca_X_train_constant)
np.save(f"{dir_path}{comb_name}_X_test_pca_{constant_idx}.npy", pca_X_test_constant)

# Principal components needed to reach the variance threshold. The component
# count in the file name is read from the array itself: idx_threshold has been
# overwritten by the band-selection cell and now holds a number of bands.
n_threshold_pcs = pca_X_train_threshold.shape[1]
np.save(f"{dir_path}{comb_name}_X_train_pca_threshold_{n_threshold_pcs}.npy", pca_X_train_threshold)
np.save(f"{dir_path}{comb_name}_X_test_pca_threshold_{n_threshold_pcs}.npy", pca_X_test_threshold)

# Band-selected data (80% weighted-contribution threshold)
np.save(f"{dir_path}{comb_name}_X_train_reduced_80.npy", reduced_X_train)
np.save(f"{dir_path}{comb_name}_X_test_reduced_80.npy", reduced_X_test)

# Labels are stored unchanged next to the feature arrays
np.save(f"{dir_path}{comb_name}_y_train.npy", y_train)
np.save(f"{dir_path}{comb_name}_y_test.npy", y_test)

print("Data saved successfully!")
print(f"Data saved to {dir_path}{comb_name}")
