In [1]:
import os
import numpy as np
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Path to folder containing the images
image_folder = "earth-terrain-height-and-segmentation-map-images"

# Define RGB-to-label mapping for segmentation map
terrain_classes = {
    (17, 141, 215): 0,  # Water
    (225, 227, 155): 1,  # Grassland
    (127, 173, 123): 2,  # Forest
    (185, 122, 87): 3,  # Hills
    (230, 200, 181): 4,  # Desert
    (150, 150, 150): 5,  # Mountain
    (193, 190, 175): 6   # Tundra
}

# Helper function to load and preprocess a single image set
def load_and_preprocess(base_name, image_size=(64, 64)):
    # Build file paths
    terrain_path = os.path.join(image_folder, f'{base_name}_t.png')
    height_path = os.path.join(image_folder, f'{base_name}_h.png')
    segmentation_path = os.path.join(image_folder, f'{base_name}_i2.png')
    
    # Load images
    terrain_image = imread(terrain_path)
    height_image = imread(height_path)
    segmentation_image = imread(segmentation_path)
    
    # Resize images
    terrain_image = resize(terrain_image, image_size, anti_aliasing=True, preserve_range=True).astype('uint8')
    height_image = resize(height_image, image_size, anti_aliasing=True, preserve_range=True).astype('uint16')
    segmentation_image = resize(segmentation_image, image_size, anti_aliasing=True, preserve_range=True).astype('uint8')
    
    # Convert segmentation map to labels
    labels = np.apply_along_axis(
        lambda rgb: terrain_classes.get(tuple(rgb), -1), 2, segmentation_image
    ).flatten()
    
    # Filter out invalid labels (-1)
    valid_idx = labels != -1
    
    # Flatten and filter features
    terrain_flat = terrain_image.reshape(-1, terrain_image.shape[-1])[valid_idx]
    height_flat = height_image.flatten()[valid_idx]
    features = np.hstack([terrain_flat, height_flat.reshape(-1, 1)])
    
    return features, labels[valid_idx]

# Function to process images in batches
def process_in_batches(batch_start, batch_end, image_size=(64, 64)):
    batch_features = []
    batch_labels = []
    for i in range(batch_start, batch_end + 1):
        base_name = f"{str(i).zfill(4)}"
        try:
            # Load and preprocess
            features, labels = load_and_preprocess(base_name, image_size=image_size)
            batch_features.append(features)
            batch_labels.append(labels)
        except FileNotFoundError:
            print(f"Image set {base_name} not found, skipping.")
        except Exception as e:
            print(f"Error processing {base_name}: {e}")
    # Combine batch into numpy arrays
    return np.vstack(batch_features), np.hstack(batch_labels)

# Process images in manageable batches
batch_size = 500  # Number of images per batch
total_images = 1000  # Total number of images
X = []
y = []

for batch_start in range(1, total_images + 1, batch_size):
    batch_end = min(batch_start + batch_size - 1, total_images)
    print(f"Processing batch: {batch_start} to {batch_end}")
    batch_X, batch_y = process_in_batches(batch_start, batch_end)
    X.append(batch_X)
    y.append(batch_y)

# Convert to single numpy arrays
X = np.vstack(X)
y = np.hstack(y)

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Processing batch: 1 to 500
Processing batch: 501 to 1000


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Train Random Forest Classifier
print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# # Evaluate model
# print("Random Forest Classification Report:")
# print(classification_report(y_test, rf_predictions))

print(classification_report(y_test, rf_predictions, target_names=[
    "Water", "Grassland", "Forest", "Hills", "Desert", "Mountain", "Tundra"
]))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, rf_predictions)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, rf_predictions, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, rf_predictions, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, rf_predictions, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, rf_predictions, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, rf_predictions, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, rf_predictions, average='macro')))

Training Random Forest...
              precision    recall  f1-score   support

       Water       1.00      1.00      1.00    216394
   Grassland       0.99      0.98      0.99     78798
      Forest       0.97      0.99      0.98    130847
       Hills       0.96      0.98      0.97     52089
      Desert       0.98      0.96      0.97     38999
    Mountain       1.00      1.00      1.00     20652
      Tundra       0.97      0.89      0.93     21534

    accuracy                           0.98    559313
   macro avg       0.98      0.97      0.97    559313
weighted avg       0.98      0.98      0.98    559313

Confusion Matrix:
[[216020     13    288      0      0      0     73]
 [     8  77355   1069     21    191      0    154]
 [    27    692 129323    483      0      1    321]
 [     0     13    453  50973    561     14     75]
 [     1    135     50   1482  37270      0     61]
 [     0      0      1     41      0  20610      0]
 [     5     35   1993    240     92      9  19

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Function to train and evaluate Random Forest with a specific depth level
def Random_Forest(depth_level):
    print(f"=== Random Forest with max_depth={depth_level} ===")
    
    # Train Random Forest Classifier
    print("Training Random Forest...")
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=depth_level, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    rf_predictions = rf_model.predict(X_test)
    
    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, rf_predictions, target_names=[
        "Water", "Grassland", "Forest", "Hills", "Desert", "Mountain", "Tundra"
    ]))
    
    # Confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, rf_predictions))
    
    # Evaluation metrics
    print('\nOverall Metrics:')
    print('Accuracy: {:.2f}'.format(accuracy_score(y_test, rf_predictions)))
    print('Micro Precision: {:.2f}'.format(precision_score(y_test, rf_predictions, average='micro')))
    print('Micro Recall: {:.2f}'.format(recall_score(y_test, rf_predictions, average='micro')))
    print('Micro F1-score: {:.2f}'.format(f1_score(y_test, rf_predictions, average='micro')))
    print('Macro Precision: {:.2f}'.format(precision_score(y_test, rf_predictions, average='macro')))
    print('Macro Recall: {:.2f}'.format(recall_score(y_test, rf_predictions, average='macro')))
    print('Macro F1-score: {:.2f}'.format(f1_score(y_test, rf_predictions, average='macro')))




In [4]:
Random_Forest(3)  # Train and evaluate with depth=3

=== Random Forest with max_depth=3 ===
Training Random Forest...

Classification Report:
              precision    recall  f1-score   support

       Water       1.00      0.99      1.00    216394
   Grassland       0.74      0.98      0.84     78798
      Forest       0.89      0.96      0.92    130847
       Hills       0.60      0.90      0.72     52089
      Desert       0.89      0.22      0.36     38999
    Mountain       0.98      0.39      0.56     20652
      Tundra       0.00      0.00      0.00     21534

    accuracy                           0.86    559313
   macro avg       0.73      0.64      0.63    559313
weighted avg       0.85      0.86      0.83    559313


Confusion Matrix:
[[214969     45   1380      0      0      0      0]
 [     0  77498   1183    115      2      0      0]
 [     1   3449 125727   1655      0     15      0]
 [     0      4   4791  47051    212     31      0]
 [     0  16578   1638  12101   8682      0      0]
 [     0      0   2377   9646    50

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Micro F1-score: 0.86
Macro Precision: 0.73
Macro Recall: 0.64
Macro F1-score: 0.63


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
Random_Forest(4)

=== Random Forest with max_depth=4 ===
Training Random Forest...

Classification Report:
              precision    recall  f1-score   support

       Water       1.00      0.99      1.00    216394
   Grassland       0.87      0.97      0.92     78798
      Forest       0.92      0.96      0.94    130847
       Hills       0.75      0.90      0.82     52089
      Desert       0.71      0.72      0.71     38999
    Mountain       1.00      0.92      0.96     20652
      Tundra       0.95      0.00      0.01     21534

    accuracy                           0.91    559313
   macro avg       0.89      0.78      0.77    559313
weighted avg       0.92      0.91      0.90    559313


Confusion Matrix:
[[215031     19   1318      0     26      0      0]
 [     1  76236    927    101   1533      0      0]
 [     1   3586 125376   1875      9      0      0]
 [     0     11   2139  47115   2780     40      4]
 [     0   4260    190   6546  28002      0      1]
 [     0      0   1509     44      

In [10]:
# Test with other depth values for random forest. Overfitting starts to really take around depth level 7 and 8.
# Further down its get more and more precise.
Random_Forest(5) 
Random_Forest(10) 
Random_Forest(20) 

=== Random Forest with max_depth=5 ===
Training Random Forest...

Classification Report:
              precision    recall  f1-score   support

       Water       1.00      1.00      1.00    216394
   Grassland       0.95      0.97      0.96     78798
      Forest       0.94      0.98      0.96    130847
       Hills       0.80      0.93      0.86     52089
      Desert       0.87      0.79      0.83     38999
    Mountain       1.00      1.00      1.00     20652
      Tundra       0.96      0.51      0.67     21534

    accuracy                           0.95    559313
   macro avg       0.93      0.88      0.90    559313
weighted avg       0.95      0.95      0.95    559313


Confusion Matrix:
[[215897     13    389      0      4      0     91]
 [     3  76134   1090     88   1248      0    235]
 [    11   1243 127684   1840      2      1     66]
 [     0      9   1143  48421   2426     27     63]
 [     1   1754    200   6350  30641      0     53]
 [     0      0      4     45      