# Data Exploration for Landcover Classification

This notebook explores both the LiDAR point cloud data and the multispectral orthophoto data used for landcover classification in the Wädenswil region.

**Contents:**
1. LiDAR Point Cloud Analysis
2. Multispectral Orthophoto Analysis
3. Class Distribution and Feature Correlations

In [None]:
# Import required libraries
import json
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# NOTE: this suppresses every warning category for the whole kernel session,
# which keeps cell output compact but also hides deprecation and runtime
# warnings. Comment it out when debugging unexpected results.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Define paths
# The data root can be overridden with the LANDCOVER_DATA_DIR environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
DATA_DIR = Path(os.environ.get('LANDCOVER_DATA_DIR', r'A:\STUDIUM\05_Herbstsemester25\PA2\data'))
PROCESSED_DIR = DATA_DIR / 'processed' / 'pointnet_tiles'
TRAINING_DATA_DIR = DATA_DIR / 'aerial' / 'training_data'

In [None]:
# Calculate class distribution
# Reads `all_labels` (combined train/val/test labels) and `metadata` (parsed
# metadata.json); the cells that load the dataset and the metadata must have
# been executed before this one.
unique, counts = np.unique(all_labels, return_counts=True)
total = all_labels.size

separator = "-" * 70
print("\nClass Distribution (Complete Dataset):")
print(separator)
print(f"{'Class ID':<10} {'Class Name':<20} {'Points':<15} {'Percentage':<10}")
print(separator)

for class_id, count in zip(unique, counts):
    # Metadata keys are strings (JSON object keys), hence str(int(...)).
    # A label ID that is missing from the metadata mapping gets a placeholder
    # name instead of aborting the whole table with a KeyError.
    class_name = metadata['classes'].get(str(int(class_id)), 'Unknown')
    percentage = (count / total) * 100
    print(f"{int(class_id):<10} {class_name:<20} {count:<15,} {percentage:>6.2f}%")

print(separator)
print(f"{'Total':<10} {'':<20} {total:<15,} {100.00:>6.2f}%")
print(separator)

# np.max / np.min raise ValueError on an empty array, so the ratio is only
# computed when at least one label is present.
if counts.size > 0:
    imbalance_ratio = np.max(counts) / np.min(counts)
    print(f"\nClass Imbalance Ratio: {imbalance_ratio:.1f}:1")
else:
    imbalance_ratio = float('nan')
    print("\nClass Imbalance Ratio: n/a (no labels loaded)")

### 1.3 Class Distribution

In [None]:
# Read the per-split feature arrays (train, val, test) from PROCESSED_DIR
train_features, val_features, test_features = [
    np.load(PROCESSED_DIR / file_name)
    for file_name in ('train_features.npy', 'val_features.npy', 'test_features.npy')
]

# Read the matching per-split label arrays
train_labels, val_labels, test_labels = [
    np.load(PROCESSED_DIR / file_name)
    for file_name in ('train_labels.npy', 'val_labels.npy', 'test_labels.npy')
]

# Stack the three splits along the first axis into one combined dataset
all_features = np.concatenate((train_features, val_features, test_features), axis=0)
all_labels = np.concatenate((train_labels, val_labels, test_labels), axis=0)

# Report the dimensions of the combined arrays
print(f"Combined dataset shape: {all_features.shape}")
print(f"Total tiles: {all_features.shape[0]:,}")
print(f"Points per tile: {all_features.shape[1]:,}")
print(f"Features per point: {all_features.shape[2]}")
print(f"Total points: {all_labels.size:,}")

### 1.2 Load Complete Dataset

In [None]:
# Load metadata
# JSON text is UTF-8 by specification. The encoding is passed explicitly so
# the file is decoded identically on every platform instead of falling back
# to the locale default (e.g. a legacy code page on Windows), which could
# garble non-ASCII characters in class or feature names.
with open(PROCESSED_DIR / 'metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Total tile count is the sum of the three split sizes stored in the metadata.
total_tiles = (
    metadata['num_train_tiles']
    + metadata['num_val_tiles']
    + metadata['num_test_tiles']
)

print("=" * 60)
print("LIDAR DATASET METADATA")
print("=" * 60)
print(f"Tile Size: {metadata['tile_size']}m × {metadata['tile_size']}m")
print(f"Points per Tile: {metadata['points_per_tile']:,}")
print(f"\nTotal Tiles: {total_tiles:,}")
print(f"  - Training:   {metadata['num_train_tiles']:,}")
print(f"  - Validation: {metadata['num_val_tiles']:,}")
print(f"  - Test:       {metadata['num_test_tiles']:,}")
print(f"\nClasses: {metadata['num_classes']}")
# `classes` maps class ID (string key) to a human-readable class name.
for class_id, class_name in metadata['classes'].items():
    print(f"  {class_id}: {class_name}")
print(f"\nFeatures: {len(metadata['feature_names'])}")
for feature in metadata['feature_names']:
    print(f"  - {feature}")
print("=" * 60)

## 1. LiDAR Point Cloud Data Analysis

### 1.1 Dataset Overview