
# 🌾 CSIRO Image2Biomass Prediction - Kaggle Competition

**Author:** Manish Kumar Singh  
**Competition:** [CSIRO - Image2Biomass Prediction](https://www.kaggle.com/competitions/csiro-biomass)  
**Objective:** Predict pasture biomass from drone and ground images using deep learning.

---

## 📋 Table of Contents
1. Introduction  
2. Imports & Setup  
3. Data Loading and Exploration  
4. Image Visualization  
5. Data Preprocessing  
6. Custom Weighted R² Metric  
7. Model Architecture (EfficientNetB0)  
8. Training Configuration and Callbacks  
9. Model Training  
10. Model Evaluation  
11. Inference and Submission Creation  
12. Visualization: Predictions vs Ground Truth  
13. Final Results and Insights  
14. Appendix and References  



# 1. Introduction

The **CSIRO Image2Biomass Prediction** competition challenges participants to estimate pasture biomass using drone and ground imagery.  
Accurate predictions help improve **farm efficiency**, **animal welfare**, and **soil sustainability**.

### Evaluation Metric: Weighted R²
The competition uses a weighted version of R² (the coefficient of determination):

$$
R^2 = 1 - \frac{\sum w_i(y_i - \hat{y}_i)^2}{\sum w_i(y_i - \bar{y})^2}
$$

| Target | Weight |
|---------|---------|
| Dry_Green_g | 0.1 |
| Dry_Dead_g | 0.1 |
| Dry_Clover_g | 0.1 |
| GDM_g | 0.2 |
| Dry_Total_g | 0.5 |


In [None]:

import os, cv2, random, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Silence library warnings so the notebook output stays readable, and use one
# seaborn theme for every plot.
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

# Seed every random number generator imported by this notebook. The stdlib
# `random` module is imported above but was previously left unseeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print(f"✅ TensorFlow {tf.__version__} | NumPy {np.__version__} | Pandas {pd.__version__}")


In [None]:

# Root folder of the competition dataset.
DATA_PATH = '/kaggle/input/csiro-biomass'

# Read the three competition CSV files in one pass; each lives directly under
# DATA_PATH and is named "<stem>.csv".
train_df, test_df, sample_submission = (
    pd.read_csv(f"{DATA_PATH}/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

# Report the (rows, columns) of the train and test tables.
for label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{label} shape: {frame.shape}")

# Last expression of the cell: shows the first rows of the training table.
train_df.head()


In [None]:

import matplotlib.image as mpimg

def visualize_samples(df, img_dir, num=6):
    """Plot a random selection of images from ``img_dir`` in a two-row grid.

    Args:
        df: DataFrame with an ``image_id`` column. When a ``biomass`` column
            is present, its value is appended to each image title.
        img_dir: Directory that contains the image files.
        num: Number of images requested. Capped at ``len(df)`` so a small
            DataFrame does not make ``DataFrame.sample`` raise.

    Returns:
        None. The figure is shown with ``plt.show()``. Nothing is drawn when
        there is no image to display.
    """
    count = min(num, len(df))
    if count < 1:
        return

    sample = df.sample(count, random_state=SEED)
    has_target = 'biomass' in df.columns
    # Round the column count up so an odd (or single) image count still fits
    # in the 2-row grid; ``num // 2`` produced too few (or zero) columns.
    cols = (count + 1) // 2

    plt.figure(figsize=(15, 6))
    for i, row in enumerate(sample.itertuples()):
        name = str(row.image_id)
        # Append ".jpg" only when the id does not already carry an image
        # extension, so ids stored with or without one both resolve.
        if not name.lower().endswith(('.jpg', '.jpeg', '.png')):
            name = f"{name}.jpg"
        img = mpimg.imread(os.path.join(img_dir, name))
        plt.subplot(2, cols, i + 1)
        plt.imshow(img)
        if has_target:
            title = f"{row.image_id}\nBiomass: {row.biomass:.2f}"
        else:
            title = row.image_id
        plt.title(title)
        plt.axis('off')
    plt.suptitle("Sample Training Images", fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

visualize_samples(train_df, f"{DATA_PATH}/train_images")


In [None]:

IMG_SIZE = 224      # Images are resized to IMG_SIZE x IMG_SIZE pixels.
BATCH_SIZE = 32
VAL_SPLIT = 0.2     # Fraction of the training table held out for validation.


def _add_filename_column(df):
    """Return a copy of ``df`` with a ``filename`` column for Keras.

    ``flow_from_dataframe`` uses the ``x_col`` values verbatim as file names
    relative to ``directory``. The visualization cell builds paths as
    ``<image_id>.jpg``, so ids without an image extension get ".jpg"
    appended here; ids that already end in one are kept unchanged.
    """
    out = df.copy()
    ids = out['image_id'].astype(str)
    has_ext = ids.str.contains(r'\.(?:jpe?g|png)$', case=False, regex=True)
    out['filename'] = ids.where(has_ext, ids + '.jpg')
    return out


# Same rows and random_state as before, so the split itself is unchanged; the
# frames only gain the extra ``filename`` column.
train_data, val_data = train_test_split(
    _add_filename_column(train_df), test_size=VAL_SPLIT, random_state=SEED
)

# Training images: scaled to [0, 1] and randomly augmented.
train_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

# Validation images: scaled only, no augmentation.
val_gen = ImageDataGenerator(rescale=1./255)

train_flow = train_gen.flow_from_dataframe(
    dataframe=train_data,
    directory=f"{DATA_PATH}/train_images",
    x_col='filename',
    y_col='biomass',
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode='raw',
    batch_size=BATCH_SIZE,
    seed=SEED
)

# shuffle=False keeps batches in ``val_data`` row order, so predictions made
# on ``val_flow`` can be compared row-by-row with the validation labels.
val_flow = val_gen.flow_from_dataframe(
    dataframe=val_data,
    directory=f"{DATA_PATH}/train_images",
    x_col='filename',
    y_col='biomass',
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode='raw',
    batch_size=BATCH_SIZE,
    shuffle=False
)


In [None]:

def weighted_r2(y_true, y_pred, weights=None):
    """Compute the weighted coefficient of determination (R²).

    ``R² = 1 - sum(w * (y - y_hat)²) / sum(w * (y - y_bar)²)`` where
    ``y_bar`` is the weighted mean of ``y_true``.

    Inputs are converted to flat float arrays first. This accepts plain
    Python lists and prevents an ``(n,)`` label vector from silently
    broadcasting against ``(n, 1)`` predictions into an ``(n, n)`` matrix.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values with the same number of elements as
            ``y_true``, or a single value used as a constant prediction.
        weights: Optional per-element weights with the same number of
            elements as ``y_true``. Defaults to uniform weights.

    Returns:
        The weighted R² as a float. A small epsilon in the denominator
        avoids division by zero when ``y_true`` is constant.

    Raises:
        ValueError: If ``y_pred`` or ``weights`` does not match ``y_true``
            in size.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_pred.size not in (1, y_true.size):
        raise ValueError(
            f"y_pred has {y_pred.size} elements but y_true has {y_true.size}"
        )

    if weights is None:
        weights = np.ones_like(y_true)
    else:
        weights = np.asarray(weights, dtype=float).ravel()
        if weights.shape != y_true.shape:
            raise ValueError(
                f"weights has {weights.size} elements but y_true has {y_true.size}"
            )

    y_true_mean = np.average(y_true, weights=weights)
    ss_res = np.sum(weights * (y_true - y_pred) ** 2)
    ss_tot = np.sum(weights * (y_true - y_true_mean) ** 2)
    return 1 - ss_res / (ss_tot + 1e-8)


In [None]:

def build_model(input_shape=(224,224,3), input_scale=255.0):
    """Build and compile a frozen-EfficientNetB0 regression model.

    Keras' EfficientNet has its preprocessing built in and expects raw pixel
    values in [0, 255]. The data generators in this notebook use
    ``rescale=1./255`` and therefore feed [0, 1] images, so the model
    multiplies its input by ``input_scale`` before the backbone to restore
    the expected range.

    Args:
        input_shape: ``(height, width, channels)`` of the input images.
        input_scale: Factor applied to incoming pixels before the backbone.
            Keep the default for [0, 1] inputs; pass ``1.0`` if the input
            pipeline already delivers [0, 255] pixels.

    Returns:
        A compiled Keras model with one linear output, trained with MAE loss.
    """
    inputs = layers.Input(shape=input_shape)
    x = layers.Rescaling(input_scale)(inputs)

    base = EfficientNetB0(include_top=False, input_shape=input_shape, weights='imagenet')
    # Freeze the ImageNet backbone; only the regression head is trained.
    base.trainable = False
    # training=False keeps the frozen backbone's BatchNorm layers in
    # inference mode.
    x = base(x, training=False)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    output = layers.Dense(1, activation='linear')(x)
    model = models.Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
    return model

model = build_model()
model.summary()


In [None]:

# Stop once validation loss has not improved for 5 epochs and restore the
# weights from the best epoch.
stop_early = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Halve the learning rate after 3 epochs without validation improvement.
shrink_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
# Write the model to disk whenever validation loss reaches a new best.
keep_best = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

callbacks = [stop_early, shrink_lr, keep_best]

# Train for at most 25 epochs; early stopping may end the run sooner.
history = model.fit(train_flow, validation_data=val_flow, epochs=25, callbacks=callbacks)


In [None]:

# Draw the per-epoch training and validation loss curves on one axis.
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Val')):
    plt.plot(history.history[history_key], label=curve_label)

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('MAE')  # The model is compiled with loss='mae'.
plt.legend()
plt.show()


In [None]:

# ``flow_from_dataframe`` uses the ``x_col`` values verbatim as file names.
# The visualization cell builds paths as "<image_id>.jpg", so ids without an
# image extension get ".jpg" appended; ids that already have one are kept.
test_files = test_df.copy()
test_ids = test_files['image_id'].astype(str)
has_ext = test_ids.str.contains(r'\.(?:jpe?g|png)$', case=False, regex=True)
test_files['filename'] = test_ids.where(has_ext, test_ids + '.jpg')

# Same [0, 1] scaling as the training/validation generators, no augmentation.
test_gen = ImageDataGenerator(rescale=1./255)
test_flow = test_gen.flow_from_dataframe(
    dataframe=test_files,
    directory=f"{DATA_PATH}/test_images",
    x_col='filename',
    y_col=None,
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False  # keep predictions in the same order as the test rows
)

preds = model.predict(test_flow)

# Fail with a clear message instead of writing a misaligned submission:
# missing image files shorten ``preds`` relative to the test table.
if len(preds) != len(test_files):
    raise ValueError(
        f"Predicted {len(preds)} images but test.csv lists {len(test_files)}; "
        "some test image files were not found."
    )
if len(preds) != len(sample_submission):
    raise ValueError(
        f"Got {len(preds)} predictions but sample_submission has "
        f"{len(sample_submission)} rows; predictions cannot be aligned with sample_id."
    )

submission = pd.DataFrame({
    'sample_id': sample_submission['sample_id'],
    'target': preds.flatten()
})
submission.to_csv('submission.csv', index=False)
print("✅ Submission file created: submission.csv")
