### CSIRO - Image2Biomass Prediction
URL: https://www.kaggle.com/competitions/csiro-biomass/overview

### Architecture

```mermaid
graph LR
    LoadData --> Preprocess
    Preprocess --> ImageFeatures["Image feature extraction (CNN)"]
    ImageFeatures --> TabularFeaturePrep["Tabular feature preparation"]
    TabularFeaturePrep --> FeatureCombination["Feature combination"]
    FeatureCombination --> LightGBM
    LightGBM --> Inference
```

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import torch
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor, log_evaluation
from torchvision import models, transforms
from PIL import Image
from pathlib import Path

# Query CUDA availability once, report it, and pick the matching torch device string.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = "cuda" if cuda_available else "cpu"

In [None]:
# Load the three CSV files: training rows, test rows and the sample submission.
csv_paths = ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
train_df, test_df, sample_submission = (pd.read_csv(path) for path in csv_paths)
# Preview the first six training rows.
train_df.head(6)

In [None]:
# data check: count the missing values in each column of train_df
train_df.isnull().sum()

In [None]:
# Inspect only the rows whose target_name is 'Dry_Clover_g'.
is_clover = train_df['target_name'].eq('Dry_Clover_g')
clover_g = train_df.loc[is_clover]
clover_g.head(6)

In [None]:
# Columns left out of feature_cols; every other train_df column is kept, in its
# original order. A set gives O(1) membership tests and cannot hold the same name twice.
excluded_cols = {'sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'target_name'}
feature_cols = [c for c in train_df.columns if c not in excluded_cols]

feature_cols

In [None]:
# Draw one histogram per column in feature_cols, stacked vertically.
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the loop
# also works when feature_cols has exactly one entry (with the default squeeze a
# single bare Axes object is returned, which cannot be zipped).
fig, axes = plt.subplots(
    len(feature_cols),
    1,
    figsize=(6, 3 * len(feature_cols)),
    sharex=False,
    squeeze=False,
)

for ax, col in zip(axes.ravel(), feature_cols):
    train_df[col].plot.hist(ax=ax, bins=40, color="steelblue", alpha=0.75)
    ax.set_title(f"{col} histogram")
    # Label the x-axis with the plotted column: not every column here is a biomass value.
    ax.set_xlabel(col)
    ax.set_ylabel("count")

plt.tight_layout()

In [None]:
# Check the dtype pandas assigned to the sample_id column.
train_df['sample_id'].dtype

In [None]:
# Summary statistics of train_df as reported by DataFrame.describe().
train_df.describe()

In [None]:
# Parse Sampling_Date (format YYYY/MM/DD) into datetimes, then expose its
# year / month / day components as separate columns.
parsed_dates = pd.to_datetime(train_df["Sampling_Date"], format="%Y/%m/%d")
train_df["Sampling_Date"] = parsed_dates
for part in ("year", "month", "day"):
    train_df[part] = getattr(parsed_dates.dt, part)

# Preview the newly added columns.
train_df[["year", "month", "day"]].head()

In [None]:
# Preview the first five rows of train_df.
train_df.head(5)

In [None]:
# Baseline: LightGBM regression on the tabular columns only (no image features).
tabular_feats = ["Pre_GSHH_NDVI", "Height_Ave_cm", "State", "Species", "year", "month", "day"]

target_col = "target"

# Categorical features, stored with pandas "category" dtype for LightGBM.
cat_cols = ["State", "Species"]
train_df[cat_cols] = train_df[cat_cols].astype("category")

X = train_df[tabular_feats]
y = train_df[target_col]

# Random 80/20 hold-out split with a fixed seed.
# NOTE(review): rows are split independently. If several rows share one image_path,
# the same image can land in both train and validation and inflate the score —
# TODO confirm and consider a grouped split.
# NOTE(review): target_name is not part of tabular_feats. If train_df mixes rows for
# different target_name values, the model cannot tell them apart — TODO confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=20,
    subsample=0.8,
    # Row subsampling is only applied when subsample_freq > 0; with the default of 0
    # the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
)

# Fit on the training split, logging validation RMSE every 10 iterations.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="rmse",
    categorical_feature=cat_cols,
    callbacks=[log_evaluation(period=10)],
)

# Predict on the held-out validation split (not the test set).
valid_pred = model.predict(X_val)

In [None]:
# Image preprocessing: resize to 224x224, convert to a tensor, then normalize each
# channel with the ImageNet mean / std values below.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Pretrained ResNet-18 (IMAGENET1K_V1 weights). Every child module except the last
# one is kept, moved to `device`, and switched to eval mode for feature extraction.
backbone = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
backbone_layers = list(backbone.children())
feature_extractor = torch.nn.Sequential(*backbone_layers[:-1]).to(device)
feature_extractor.eval()

def extract_features(img_path: Path) -> torch.Tensor:
    """Return the CNN feature vector for a single image file.

    The image is opened, converted to RGB, passed through the module-level
    ``transform`` and then through ``feature_extractor`` with gradient
    tracking disabled.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The extractor output for this image as a flattened 1-D CPU tensor.
    """
    # The context manager closes the underlying file handle as soon as the pixel
    # data has been converted, instead of leaving that to garbage collection.
    with Image.open(img_path) as img:
        rgb = img.convert("RGB")
    # Add a leading batch dimension of 1 and move the input to the model's device.
    x = transform(rgb).unsqueeze(0).to(device)
    # Tensors produced under no_grad carry no autograd history, so no detach() is needed.
    with torch.no_grad():
        feat = feature_extractor(x)
    return feat.cpu().flatten()

def add_cnn_features(df, img_root: Path):
    """Return a copy of ``df`` with one ``cnn_<i>`` column per CNN feature.

    Each value of ``df["image_path"]`` is resolved against ``img_root`` and passed
    to ``extract_features``. Paths that occur in several rows are only run through
    the network once; the cached vector is reused for the remaining rows.

    Args:
        df: DataFrame with an ``image_path`` column.
        img_root: Directory that the ``image_path`` values are relative to.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by the columns
        ``cnn_0`` ... ``cnn_{k-1}``. For an empty ``df`` no ``cnn_*`` columns
        are added.
    """
    cache = {}  # image_path value -> feature vector (numpy array)
    rows = []
    for rel_path in df["image_path"]:
        if rel_path not in cache:
            cache[rel_path] = extract_features(img_root / rel_path).numpy()
        rows.append(cache[rel_path])

    # Take the feature width from the collected rows rather than from a loop
    # variable, which would be undefined when df has no rows.
    n_feats = len(rows[0]) if rows else 0
    feat_df = pd.DataFrame(rows, columns=[f"cnn_{i}" for i in range(n_feats)])
    # Reset the index so the positional feat_df rows line up with df's rows.
    return pd.concat([df.reset_index(drop=True), feat_df], axis=1)

# Directory that the image_path values of the train and test frames are relative to.
img_root = Path("data")

# Append the CNN feature columns to the train frame first, then to the test frame.
train_with_feat, test_with_feat = (
    add_cnn_features(frame, img_root) for frame in (train_df, test_df)
)

# Model inputs: the tabular features followed by every "cnn_"-prefixed column.
cnn_mask = train_with_feat.columns.str.startswith("cnn_")
cnn_feats = train_with_feat.columns[cnn_mask].tolist()
feats = tabular_feats + cnn_feats

In [None]:
# Display the combined feature list (tabular + CNN column names).
feats

In [None]:
# Refit the same LightGBM model on every training row, this time with the tabular
# and CNN feature columns together (no hold-out split here).
X, y = train_with_feat[feats], train_with_feat[target_col]
model.fit(X, y, categorical_feature=cat_cols)

In [None]:
# List the columns currently present in test_with_feat.
test_with_feat.columns

In [None]:
# Feature names the model expects that are missing from test_with_feat.
set(feats) - set(test_with_feat.columns)

In [None]:
# Add year / month / day columns to test_df as well, mirroring the train preprocessing.
# NOTE(review): this assumes data/test.csv provides Sampling_Date and every column in
# tabular_feats — TODO confirm against the actual test file.
test_df["Sampling_Date"] = pd.to_datetime(test_df["Sampling_Date"], format="%Y/%m/%d")
test_df["year"] = test_df["Sampling_Date"].dt.year
test_df["month"] = test_df["Sampling_Date"].dt.month
test_df["day"] = test_df["Sampling_Date"].dt.day

# Align the categorical columns with train: cast to the dtype of the train column so
# both frames share the same category set. A plain astype("category") would infer the
# categories from the test values alone. Values unseen in train become NaN.
for col in cat_cols:
    test_df[col] = test_df[col].astype(train_df[col].dtype)

# Rebuild the CNN feature frame so it also carries the columns added above, then predict.
test_with_feat = add_cnn_features(test_df, img_root)
test_pred = model.predict(test_with_feat[feats])

In [None]:
# Create submission: one row per test sample_id with its predicted target,
# written to submission.csv without the index column.
sub = test_df[["sample_id"]].assign(target=test_pred)
sub.to_csv("submission.csv", index=False)