In [13]:
import os
from pathlib import Path
import rasterio
import numpy as np

for split in ['train', 'val', 'test']:
    # Define paths
    train_labels_dir = Path(f"Data/{split}_labels")
    train_images_dir = Path(f"Data/{split}_images")
    # Initialize min/max/mean values
    label_min, label_max = float('inf'), float('-inf')
    image_min, image_max = float('inf'), float('-inf')
    label_sum, label_count = 0, 0
    image_sum, image_count = 0, 0

    # Process labels
    for label_file in train_labels_dir.glob('*.tiff'):
        with rasterio.open(label_file) as src:
            label = src.read()
            label_min = min(label_min, np.min(label))
            label_max = max(label_max, np.max(label))
            label_sum += np.sum(label)
            label_count += label.size

    # Process images 
    for image_file in train_images_dir.glob('*.tiff'):
        with rasterio.open(image_file) as src:
            data = src.read()
            image_min = min(image_min, np.min(data))
            image_max = max(image_max, np.max(data))
            image_sum += np.sum(data)
            image_count += data.size

    label_mean = label_sum / label_count
    image_mean = image_sum / image_count

    print(f"Label value range: [{label_min}, {label_max}], mean: {label_mean:.4f}")
    print(f"Image value range: [{image_min}, {image_max}], mean: {image_mean:.4f}")


  dataset = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


Label value range: [-1.0, 5.9740142822265625], mean: -0.9317
Image value range: [-9999.0, 16348.0], mean: 17.1767
Label value range: [-1.0, 5.9740142822265625], mean: -0.9333
Image value range: [-9999.0, 8803.0], mean: 17.0648
Label value range: [-1.0, 5.9740142822265625], mean: -0.9328
Image value range: [-9999.0, 9562.0], mean: 17.8119


In [14]:
label[label>-1].shape



(834,)

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Read the CSV file
results_path = 'check_points/Portugal-lr_0.001-xnormby1.0-ynormby1.0/run_2024-12-07->11:43:45/test_results.csv'
df = pd.read_csv(results_path)
df['Actual'] = df['Actual']
df['Predicted'] = df['Predicted']
# Calculate absolute error (loss)
df['Error'] = np.abs(df['Actual'] - df['Predicted'])

# Create a figure with subplots
fig = plt.figure(figsize=(15, 10))
gs = fig.add_gridspec(2, 2)

# 1. Scatter plot of Predicted vs Actual
ax1 = fig.add_subplot(gs[0, 0])
ax1.scatter(df['Actual'], df['Predicted'], alpha=0.5)
ax1.plot([df['Actual'].min(), df['Actual'].max()], 
         [df['Actual'].min(), df['Actual'].max()], 
         'r--', label='Perfect Prediction')
ax1.set_xlabel('Actual Biomass')
ax1.set_ylabel('Predicted Biomass')
ax1.set_title('Predicted vs Actual Biomass')
ax1.legend()

# 2. Histogram of Actual Biomass
ax2 = fig.add_subplot(gs[0, 1])
sns.histplot(data=df, x='Actual', bins=30, ax=ax2)
ax2.set_xlabel('Actual Biomass')
ax2.set_ylabel('Count')
ax2.set_title('Distribution of Actual Biomass')

# 3. Error vs Actual Biomass
ax3 = fig.add_subplot(gs[1, 0])
ax3.scatter(df['Actual'], df['Error'], alpha=0.5)
ax3.set_xlabel('Actual Biomass')
ax3.set_ylabel('Absolute Error')
ax3.set_title('Error vs Actual Biomass')

# 4. Histogram of Errors
ax4 = fig.add_subplot(gs[1, 1])
sns.histplot(data=df, x='Error', bins=30, ax=ax4)
ax4.set_xlabel('Absolute Error')
ax4.set_ylabel('Count')
ax4.set_title('Distribution of Prediction Errors')

# Add overall title and adjust layout
plt.suptitle('Biomass Prediction Analysis', fontsize=16)
plt.tight_layout()

# Save the plot
plot_path = results_path.replace('test_results.csv', 'analysis_plot.png')
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.close()

# Print some basic statistics
print("\nStatistics:")
print(f"Mean Absolute Error: {df['Error'].mean():.4f}")
print(f"Standard Deviation of Error: {df['Error'].std():.4f}")
print(f"Mean Actual Biomass: {df['Actual'].mean():.4f}")
print(f"Mean Predicted Biomass: {df['Predicted'].mean():.4f}")
# Calculate R-squared
ss_res = ((df['Actual'] - df['Predicted']) ** 2).sum()
ss_tot = ((df['Actual'] - df['Actual'].mean()) ** 2).sum()
r2 = 1 - (ss_res / ss_tot)
print(f"R-squared: {r2:.4f}")



Statistics:
Mean Absolute Error: 1.1052
Standard Deviation of Error: 0.7157
Mean Actual Biomass: 3.0305
Mean Predicted Biomass: 3.7215
R-squared: 0.3547


In [6]:
import os
import numpy as np
import rasterio
from collections import Counter

# Directory containing training labels
train_labels_dir = "Data/balanced_train_labels"

# List to store biomass values
biomass_values = []

# Read all label files and collect biomass values
print("Collecting biomass values from training labels...")
for filename in os.listdir(train_labels_dir):
    if filename.endswith(".tiff"):
        filepath = os.path.join(train_labels_dir, filename)
        with rasterio.open(filepath) as src:
            data = src.read(1)  # Read first band
            # Collect non-zero and non-negative biomass values 
            valid_biomass = data[data > 0]
            biomass_values.extend(valid_biomass.flatten())

# Count occurrences of each biomass value
biomass_counts = Counter(biomass_values)

# Print counts sorted by biomass value
print("\nBiomass value distribution in training labels:")
print("Biomass Value | Count")
print("-" * 25)
for value in sorted(biomass_counts.keys()):
    print(f"{value:12.1f} | {biomass_counts[value]:6d}")

# Print total number of valid biomass pixels
total_pixels = sum(biomass_counts.values())
print(f"\nTotal number of valid biomass pixels: {total_pixels}")

# Print minimum value count
min_count = min(biomass_counts.values())
min_biomass = min(biomass_counts.keys())
print(f"\nMinimum count: {min_count} (for biomass value {min_biomass})")

# Create bins with resolution of 1
min_biomass_value = int(min(biomass_counts.keys()))
max_biomass_value = int(max(biomass_counts.keys()))
bins = range(min_biomass_value, max_biomass_value + 2)  # +2 to include the last value

# Count values in each bin
binned_counts = np.zeros(len(bins)-1)
for value, count in biomass_counts.items():
    bin_index = int(value) - min_biomass_value
    binned_counts[bin_index] += count

# Print binned distribution
print("\nBiomass value distribution by integer ranges:")
print("Range | Count")
print("-" * 25)
for i in range(len(bins)-1):
    range_start = bins[i]
    range_end = bins[i+1]
    count = int(binned_counts[i])
    if count > 0:  # Only print ranges that have values
        print(f"{range_start}-{range_end} | {count:6d}")




Collecting biomass values from training labels...

Biomass value distribution in training labels:
Biomass Value | Count
-------------------------
         0.7 |      3
         0.7 |      1
         0.7 |     54
         0.8 |      9
         0.8 |    353
         0.8 |     90
         0.8 |   1052
         0.9 |     25
         0.9 |   1356
         0.9 |    712
         0.9 |     56
         0.9 |   3164
         0.9 |      2
         0.9 |     12
         1.0 |    233
         1.0 |   4814
         1.0 |      6
         1.0 |     44
         1.0 |    138
         1.0 |    703
         1.0 |      2
         1.0 |     31
         1.1 |    850
         1.1 |     78
         1.1 |      1
         1.1 |     64
         1.1 |    940
         1.1 |      5
         1.2 |    137
         1.2 |    808
         1.2 |      1
         1.2 |     16
         1.2 |    311
         1.2 |    588
         1.2 |      2
         1.3 |     87
         1.3 |    796
         1.3 |      1
         1.3 |    