# CSIRO Image2Biomass Prediction - Complete Solution

**Author:** Manish Kumar Singh  
**Competition:** CSIRO - Image2Biomass Prediction  
**Objective:** Predict pasture biomass from images using deep learning

---

## Table of Contents
1. Introduction & Competition Overview
2. Imports & Environment Setup
3. Data Loading & Exploration
4. Data Understanding (Long Format)
5. Image Visualization
6. Data Preprocessing (Pivot to Wide Format)
7. Train/Validation Split
8. Data Augmentation & Generators
9. Weighted R-Squared Metric
10. EfficientNetB0 Model Architecture
11. Training Configuration
12. Model Training
13. Training History Visualization
14. Validation Evaluation
15. Test Predictions & Submission
16. Predictions vs Ground Truth
17. Final Summary

---

## 1. Introduction

The CSIRO Image2Biomass Prediction competition challenges participants to estimate pasture biomass using drone and ground imagery. This helps improve farm efficiency, animal welfare, and soil sustainability.

### Evaluation Metric: Weighted R-Squared

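The competition uses a weighted version of the R-squared coefficient. In this notebook it is computed as the weight-normalized average of the per-target R-squared scores:

$$R^2_{\text{weighted}} = \frac{\sum_{t} w_t \, R^2_t}{\sum_{t} w_t}$$

where $R^2_t$ is the R-squared score of target $t$ and $w_t$ is the weight of that target.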

### Target Variables & Weights:

| Target | Weight |
|---------|--------|
| Dry_Green_g | 0.1 |
| Dry_Dead_g | 0.1 |
| Dry_Clover_g | 0.1 |
| GDM_g | 0.2 |
| Dry_Total_g | 0.5 |

**Important:** The train.csv is in LONG format - each image has 5 rows (one per target).

---

## 2. Imports & Environment Setup

In [None]:
# Step 1: Import all necessary libraries
import os
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Step 2: Silence library warnings and choose the default plot appearance
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Step 3: Seed every random number generator the notebook relies on
SEED = 42
for apply_seed in (np.random.seed, tf.random.set_seed, random.seed):
    apply_seed(SEED)

# Step 4: Report library versions and whether a GPU is visible
banner = "=" * 60
print(banner)
print("ENVIRONMENT SETUP")
print(banner)
print(f"TensorFlow: {tf.__version__}")
print(f"NumPy: {np.__version__}")
print(f"Pandas: {pd.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU Available: {len(gpu_devices) > 0}")
print(banner)

---

## 3. Data Loading & Exploration

In [None]:
# Step 1: Define data path (root folder of the competition dataset)
DATA_PATH = '/kaggle/input/csiro-biomass'

# Step 2: Load CSV files (training rows, test rows, submission template)
train_df = pd.read_csv(f"{DATA_PATH}/train.csv")
test_df = pd.read_csv(f"{DATA_PATH}/test.csv")
sample_submission = pd.read_csv(f"{DATA_PATH}/sample_submission.csv")

# Step 3: Display basic information
print("="*60)
print("DATASET INFORMATION")
print("="*60)
print(f"\nTrain shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

print("\nTrain columns:")
print(train_df.columns.tolist())

print("\nFirst 10 rows of training data:")
display(train_df.head(10))

print("\nData info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- that would append a stray "None" line.
train_df.info()

---

## 4. Data Understanding (Long Format)

In [None]:
# Step 1: Inspect how the long-format table is organised
section_rule = "=" * 60
print(section_rule)
print("DATA STRUCTURE ANALYSIS")
print(section_rule)

# Derive image_id (the file name without its extension) from image_path
for frame in (train_df, test_df):
    frame['image_id'] = frame['image_path'].apply(lambda p: Path(p).stem)

# Number of distinct images in each table
n_train_images = train_df['image_id'].nunique()
n_test_images = test_df['image_id'].nunique()

print(f"\nUnique training images: {n_train_images}")
print(f"Unique test images: {n_test_images}")
print(f"Total training rows: {len(train_df)} (should be {n_train_images} x 5 targets)")

# Which target names occur in the training table
print(f"\nTarget names:")
print(train_df['target_name'].unique())

# All rows belonging to the first image, as an illustration of the layout
print(f"\nExample: All targets for one image:")
sample_image = train_df['image_id'].iloc[0]
display(train_df.loc[train_df['image_id'] == sample_image,
                     ['image_id', 'target_name', 'target']])

# Target column order used for the model outputs, and the metric weights
TARGET_COLS = [
    'Dry_Clover_g',
    'Dry_Dead_g',
    'Dry_Green_g',
    'GDM_g',
    'Dry_Total_g',
]
TARGET_WEIGHTS = {
    'Dry_Green_g': 0.1,
    'Dry_Dead_g': 0.1,
    'Dry_Clover_g': 0.1,
    'GDM_g': 0.2,
    'Dry_Total_g': 0.5,
}

print(f"\nTarget columns: {TARGET_COLS}")
print(f"Target weights: {TARGET_WEIGHTS}")

---

## 5. Image Visualization

In [None]:
# Step 1: Keep one row per image so each picture can be drawn at most once
unique_images = train_df.drop_duplicates(subset=['image_id'])

# Step 2: Draw six images at random (seeded, so the selection is repeatable)
sample_images = unique_images.sample(6, random_state=SEED)

# Step 3: Show them on a 2x3 grid, titled with their total biomass
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for panel, (_, record) in zip(axes, sample_images.iterrows()):
    image_id = record['image_id']
    img_path = f"{DATA_PATH}/{record['image_path']}"

    if not os.path.exists(img_path):
        # Placeholder panel when the file is not on disk
        panel.text(0.5, 0.5, f"Image not found\n{image_id}",
                   ha='center', va='center')
    else:
        panel.imshow(mpimg.imread(img_path))

        # Look up the Dry_Total_g row of this image in the long table
        is_total_row = ((train_df['image_id'] == image_id) &
                        (train_df['target_name'] == 'Dry_Total_g'))
        total_biomass = train_df.loc[is_total_row, 'target'].values[0]

        panel.set_title(f"{image_id}\nTotal Biomass: {total_biomass:.1f}g",
                        fontsize=10, fontweight='bold')
    panel.axis('off')

plt.suptitle("Sample Training Images", fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

---

## 6. Data Preprocessing (Pivot to Wide Format)

In [None]:
# Step 1: Reshape the training table from long to wide
# Goal: a single row per image carrying its 5 targets as separate columns
pivot_keys = ['image_id', 'image_path']
train_wide = train_df.pivot_table(
    values='target',
    index=pivot_keys,
    columns='target_name',
).reset_index()

header_rule = "=" * 60
print(header_rule)
print("DATA PREPROCESSING")
print(header_rule)
print(f"\nWide format shape: {train_wide.shape}")
print(f"Expected: ({n_train_images}, 7) - image_id, image_path, and 5 targets")

print("\nWide format columns:")
print(train_wide.columns.tolist())

print("\nFirst 5 rows:")
display(train_wide.head())

# Step 2: Count missing values per target column
print("\nMissing values in targets:")
target_frame = train_wide[TARGET_COLS]
print(target_frame.isnull().sum())

# Step 3: Summary statistics of the targets
print("\nTarget statistics:")
display(train_wide[TARGET_COLS].describe())

---

## 7. Train/Validation Split

In [None]:
# Step 1: Hold out a share of the images for validation
VAL_SPLIT = 0.2
IMG_SIZE = 224
BATCH_SIZE = 16

split_options = {
    'test_size': VAL_SPLIT,
    'random_state': SEED,
    'shuffle': True,
}
train_data, val_data = train_test_split(train_wide, **split_options)

split_rule = "=" * 60
print(split_rule)
print("TRAIN/VALIDATION SPLIT")
print(split_rule)
print(f"\nTraining samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Image size: {IMG_SIZE}x{IMG_SIZE}")
print(f"Batch size: {BATCH_SIZE}")

---

## 8. Data Augmentation & Generators

In [None]:
# Step 1: Image generators -- random augmentation for training,
# plain rescaling for validation
augmentation_options = {
    'rescale': 1./255,
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'vertical_flip': True,
    'fill_mode': 'nearest',
}
train_gen = ImageDataGenerator(**augmentation_options)

val_gen = ImageDataGenerator(rescale=1./255)

# Step 2: Stream batches straight from the wide dataframes.
# Settings shared by the training and validation flows:
shared_flow_options = {
    'directory': DATA_PATH,
    'x_col': 'image_path',
    'y_col': TARGET_COLS,
    'target_size': (IMG_SIZE, IMG_SIZE),
    'class_mode': 'raw',
    'batch_size': BATCH_SIZE,
    'seed': SEED,
}

# Training batches are shuffled
train_flow = train_gen.flow_from_dataframe(
    dataframe=train_data,
    shuffle=True,
    **shared_flow_options,
)

# Validation batches keep the dataframe order
val_flow = val_gen.flow_from_dataframe(
    dataframe=val_data,
    shuffle=False,
    **shared_flow_options,
)

generator_rule = "=" * 60
print(generator_rule)
print("DATA GENERATORS")
print(generator_rule)
print(f"\nData generators created successfully")
print(f"Training batches per epoch: {len(train_flow)}")
print(f"Validation batches per epoch: {len(val_flow)}")

---

## 9. Weighted R-Squared Metric

In [None]:
# Step 1: Define weighted R-squared function
def weighted_r2_score(y_true, y_pred, target_cols, target_weights):
    """
    Compute the weight-normalized average of per-target R-squared scores.

    Column ``i`` of ``y_true`` / ``y_pred`` is scored against the name
    ``target_cols[i]`` and weighted with ``target_weights[target_cols[i]]``.

    Args:
        y_true: True values, indexed as ``y_true[:, i]`` (n_samples, n_targets)
        y_pred: Predicted values, indexed as ``y_pred[:, i]`` (n_samples, n_targets)
        target_cols: List of target column names, in column order
        target_weights: Dictionary mapping each target name to its weight

    Returns:
        weighted_r2: Sum of (R-squared * weight) divided by the sum of weights
        individual_r2: NumPy array with one R-squared score per target
    """
    # One (score, weight) pair per target, evaluated in column order
    scored = [
        (r2_score(y_true[:, position], y_pred[:, position]), target_weights[name])
        for position, name in enumerate(target_cols)
    ]

    individual_r2 = np.array([score for score, _ in scored])
    weights = np.array([weight for _, weight in scored])

    # Normalizing by the weight total keeps the result an average even when
    # the weights do not sum to one
    weighted_r2 = (individual_r2 * weights).sum() / weights.sum()

    return weighted_r2, individual_r2

# Step 2: Smoke-test the metric on synthetic data
# (random "truth" plus a small positive random offset as the "prediction")
metric_rule = "=" * 60
print(metric_rule)
print("WEIGHTED R-SQUARED METRIC")
print(metric_rule)
test_true = np.random.rand(100, 5)
test_noise = np.random.rand(100, 5) * 0.1
test_pred = test_true + test_noise
test_wr2, test_r2 = weighted_r2_score(test_true, test_pred, TARGET_COLS, TARGET_WEIGHTS)
formatted_scores = [f'{score:.4f}' for score in test_r2]
print(f"\nMetric test successful")
print(f"Test weighted R-squared: {test_wr2:.4f}")
print(f"Individual R-squared scores: {formatted_scores}")

---

## 10. EfficientNetB0 Model Architecture

In [None]:
# Step 1: Build model
def build_model(input_shape=(224, 224, 3), num_outputs=5):
    """
    Build EfficientNetB0 model for multi-output regression.

    The network is a frozen ImageNet-pretrained EfficientNetB0 backbone
    followed by a small dense regression head with one linear output per
    target. Inputs are expected in the [0, 1] range, as produced by the
    ``ImageDataGenerator(rescale=1./255)`` generators used in this notebook;
    they are scaled back to [0, 255] before reaching the backbone.

    Args:
        input_shape: Input image shape (height, width, channels)
        num_outputs: Number of target outputs

    Returns:
        Compiled Keras model (Adam, lr=1e-3, MSE loss, MAE/MSE metrics)
    """
    # Load pre-trained base model (convolutional part only, no classifier)
    base = EfficientNetB0(
        include_top=False,
        input_shape=input_shape,
        weights='imagenet'
    )

    # Freeze base model so only the custom head is trained
    base.trainable = False

    inputs = layers.Input(shape=input_shape)

    # Keras EfficientNet applies its own input normalization and expects raw
    # pixel values in [0, 255]. The generators in this notebook already divide
    # by 255, so undo that here; otherwise the pretrained backbone receives
    # inputs ~255x smaller than the range it was trained on.
    x = layers.Rescaling(255.0)(inputs)

    # training=False keeps the backbone's BatchNormalization layers in
    # inference mode while the head is being trained
    x = base(x, training=False)

    # Build custom top: pooled features -> two dense layers -> linear outputs
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(num_outputs, activation='linear')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    # Compile
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss='mse',
        metrics=['mae', 'mse']
    )

    return model

# Step 2: Instantiate the network with one output per target column
model_rule = "=" * 60
print(model_rule)
print("MODEL ARCHITECTURE")
print(model_rule)
n_targets = len(TARGET_COLS)
model = build_model(num_outputs=n_targets)
print(f"\nModel created successfully")
print(f"Total parameters: {model.count_params():,}")
print(f"\nModel summary:")
model.summary()

---

## 11. Training Configuration

In [None]:
# Step 1: Define training parameters
EPOCHS = 30

# Step 2: Define callbacks
callbacks_list = [
    EarlyStopping(
        monitor='val_loss',
        patience=7,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    ),
    ModelCheckpoint(
        'best_model.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )
]

print("="*60)
print("TRAINING CONFIGURATION")
print("="*60)
print(f"\nMaximum epochs: {EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: 1e-3")
print(f"\nCallbacks:")
print(f"  - EarlyStopping (patience=7)")
print(f"  - ReduceLROnPlateau (patience=3, factor=0.5)")
print(f"  - ModelCheckpoint (save best model)")

---

## 12. Model Training

In [None]:
# Step 1: Train model
print("="*60)
print("MODEL TRAINING")
print("="*60)
print(f"\nTraining started...\n")

history = model.fit(
    train_flow,
    validation_data=val_flow,
    epochs=EPOCHS,
    callbacks=callbacks_list,
    verbose=1
)

print("\n" + "="*60)
print("TRAINING COMPLETED")
print("="*60)
print(f"\nEpochs trained: {len(history.history['loss'])}")
print(f"Best validation loss: {min(history.history['val_loss']):.4f}")
print(f"Best validation MAE: {min(history.history['val_mae']):.4f}")

---

## 13. Training History Visualization

In [None]:
# Step 1: Plot training history
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot loss
axes[0].plot(history.history['loss'], label='Train Loss', linewidth=2)
axes[0].plot(history.history['val_loss'], label='Val Loss', linewidth=2)
axes[0].set_title('Model Loss (MSE)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Epoch', fontsize=12)
axes[0].set_ylabel('Loss', fontsize=12)
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Plot MAE
axes[1].plot(history.history['mae'], label='Train MAE', linewidth=2)
axes[1].plot(history.history['val_mae'], label='Val MAE', linewidth=2)
axes[1].set_title('Model MAE', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Epoch', fontsize=12)
axes[1].set_ylabel('MAE', fontsize=12)
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

## 14. Validation Evaluation

In [None]:
# Step 1: Generate predictions on validation set
print("="*60)
print("VALIDATION EVALUATION")
print("="*60)
print(f"\nGenerating predictions...\n")

val_predictions = model.predict(val_flow, verbose=1)
val_true = val_data[TARGET_COLS].values

# Step 2: Calculate metrics
weighted_r2, individual_r2 = weighted_r2_score(val_true, val_predictions, TARGET_COLS, TARGET_WEIGHTS)

print("\n" + "="*60)
print("VALIDATION RESULTS")
print("="*60)
print(f"\nWeighted R-Squared Score: {weighted_r2:.4f}")
print("\nIndividual Target Performance:\n")
print(f"{'Target':<20} {'R-Squared':<12} {'MAE':<12} {'Weight':<10}")
print("-" * 60)
for i, col in enumerate(TARGET_COLS):
    r2 = individual_r2[i]
    mae = mean_absolute_error(val_true[:, i], val_predictions[:, i])
    weight = TARGET_WEIGHTS[col]
    print(f"{col:<20} {r2:<12.4f} {mae:<12.2f} {weight:<10}")

---

## 15. Test Predictions & Submission

In [None]:
# Step 1: Prepare test data
print("="*60)
print("TEST PREDICTIONS")
print("="*60)
print(f"\nPreparing test data...\n")

# Get unique test images (test.csv may list an image more than once)
test_unique = test_df.drop_duplicates(subset=['image_id'])

# Create test generator (same 1/255 rescaling as training and validation)
test_gen = ImageDataGenerator(rescale=1./255)
test_flow = test_gen.flow_from_dataframe(
    dataframe=test_unique,
    directory=DATA_PATH,
    x_col='image_path',
    y_col=None,
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False
)

# Step 2: Generate predictions
print("Generating predictions...\n")
test_predictions = model.predict(test_flow, verbose=1)

# Step 3: Create submission
print("\nCreating submission file...")

# Map image_id -> prediction vector using the files the generator actually
# yielded. flow_from_dataframe can skip rows whose image file is not found,
# so pairing predictions with test_unique by position could shift them onto
# the wrong images. image_id is the file stem, exactly as derived for
# test_df['image_id'].
image_to_pred = {
    Path(fname).stem: pred
    for fname, pred in zip(test_flow.filenames, test_predictions)
}

# Fill submission
submission = sample_submission.copy()
# Make the target column float up front so float predictions can be written
# into it without relying on implicit dtype upcasting
submission['target'] = submission['target'].astype(float)

rows_without_prediction = 0
for idx, sample_id in submission['sample_id'].items():
    # sample_id has the form "<image_id>__<target_name>"; split on the last
    # separator so the target name is always the final part
    img_id, target_name = sample_id.rsplit('__', 1)

    pred = image_to_pred.get(img_id)
    if pred is None:
        # No prediction for this image: keep the template value, but count it
        rows_without_prediction += 1
        continue

    target_idx = TARGET_COLS.index(target_name)
    submission.loc[idx, 'target'] = float(pred[target_idx])

if rows_without_prediction:
    print(f"Warning: {rows_without_prediction} submission rows have no "
          f"prediction and keep their sample_submission value")

# Step 4: Save submission
submission.to_csv('submission.csv', index=False)

print("\n" + "="*60)
print("SUBMISSION CREATED")
print("="*60)
print(f"\nFile: submission.csv")
print(f"Shape: {submission.shape}")
print(f"\nPreview:")
display(submission.head(10))
print(f"\nStatistics:")
display(submission['target'].describe())

---

## 16. Predictions vs Ground Truth

In [None]:
# Step 1: One true-vs-predicted scatter panel per target (2x3 grid)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for i, col in enumerate(TARGET_COLS):
    ax = axes[i]
    truth = val_true[:, i]
    predicted = val_predictions[:, i]

    # Scatter of validation samples
    ax.scatter(truth, predicted, alpha=0.6, s=40)

    # Diagonal y = x spanning the combined value range of both axes
    min_val = min(truth.min(), predicted.min())
    max_val = max(truth.max(), predicted.max())
    diagonal = [min_val, max_val]
    ax.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect Prediction')

    # Per-target metrics shown in the panel title
    r2 = individual_r2[i]
    mae = mean_absolute_error(truth, predicted)

    ax.set_xlabel('True Values (g)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Predicted Values (g)', fontsize=12, fontweight='bold')
    ax.set_title(f'{col}\nR-Squared = {r2:.4f}, MAE = {mae:.2f}g',
                 fontsize=13, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

# The grid has six panels but only five are used; drop the last one
fig.delaxes(axes[5])

plt.suptitle('Validation: Predictions vs Ground Truth', fontsize=18, fontweight='bold')
plt.tight_layout()
plt.show()

---

## 17. Final Summary

In [None]:
# Step 1: Print a recap of the model, data, training run and results
heavy_rule = "=" * 70
table_rule = '-' * 66
summary_logs = history.history

print("\n" + heavy_rule)
print(" " * 15 + "CSIRO IMAGE2BIOMASS PREDICTION - FINAL RESULTS")
print(heavy_rule)

# Model
print(f"\nMODEL INFORMATION:")
print(f"  Architecture: EfficientNetB0 (Transfer Learning)")
print(f"  Total Parameters: {model.count_params():,}")
print(f"  Input Size: {IMG_SIZE}x{IMG_SIZE}x3")
print(f"  Output Targets: {len(TARGET_COLS)}")

# Data
print(f"\nDATASET INFORMATION:")
print(f"  Training Images: {len(train_data)}")
print(f"  Validation Images: {len(val_data)}")
print(f"  Test Images: {len(test_unique)}")
print(f"  Batch Size: {BATCH_SIZE}")

# Training run
print(f"\nTRAINING INFORMATION:")
print(f"  Epochs Trained: {len(summary_logs['loss'])}")
print(f"  Best Val Loss: {min(summary_logs['val_loss']):.4f}")
print(f"  Best Val MAE: {min(summary_logs['val_mae']):.4f}")

# Validation scores, overall and per target
print(f"\nVALIDATION PERFORMANCE:")
print(f"  Weighted R-Squared Score: {weighted_r2:.4f}")
print(f"\n  Individual Target Performance:")
print(f"  {table_rule}")
print(f"  {'Target':<20} {'R-Squared':<12} {'MAE (g)':<15} {'Weight':<10}")
print(f"  {table_rule}")
for i, col in enumerate(TARGET_COLS):
    r2 = individual_r2[i]
    weight = TARGET_WEIGHTS[col]
    mae = mean_absolute_error(val_true[:, i], val_predictions[:, i])
    print(f"  {col:<20} {r2:<12.4f} {mae:<15.2f} {weight:<10}")

# Submission file
print(f"\nSUBMISSION:")
print(f"  File: submission.csv")
print(f"  Shape: {submission.shape}")
print(f"  Status: Ready for submission")

print(f"\n" + heavy_rule)
print(" " * 20 + "NOTEBOOK EXECUTION COMPLETE")
print(heavy_rule)

print(f"\nNext Steps:")
for next_step in (
    f"  1. Download submission.csv",
    f"  2. Submit to Kaggle competition",
    f"  3. Try fine-tuning hyperparameters",
    f"  4. Experiment with different architectures",
    f"  5. Consider ensemble methods\n",
):
    print(next_step)