# About
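- Features are extracted with a pretrained SigLIP image encoder
- Predictions are made with gradient boosting on those features
- No deep learning model is trained (SigLIP is only used for inference)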

## Update version 8:
- Added a PCA trick based on the idea that the target values are linearly dependent (see [this notebook](https://www.kaggle.com/code/none00000/lb-0-57-infer-model-code))
- Unlike previous approaches, which predict only 3 targets, this trick predicts all 5 targets and projects the predictions onto a 3-component PCA subspace
- It improves the leaderboard (LB) score by about 0.01
- The PCA trick is implemented in `cross_validate`

In [1]:
import argparse
from pathlib import Path
import sys
from tqdm.auto import tqdm
import json
from copy import deepcopy
import polars as pl
import numpy as np
import os

import torch
from PIL import Image
from transformers import AutoProcessor, AutoImageProcessor, AutoModel, Siglip2Model, Siglip2ImageProcessor, SiglipModel, SiglipImageProcessor

2025-12-10 19:31:45.846356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765395106.085982      38 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765395106.160128      38 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.decomposition import PCA

import catboost
import xgboost as xgb

In [27]:
SEED = 1488

# Fix the PyTorch and NumPy generators so repeated runs draw the same numbers.
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    # Seed the current GPU, then every visible GPU.
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Prepare features

In [28]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Local copy of the pretrained SigLIP checkpoint; the image processor and the
# model are both loaded from the same directory.
model_name = "/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1/"
processor = AutoImageProcessor.from_pretrained(model_name)
# The model is only used for inference, so switch it to eval mode right away.
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

cuda


In [29]:
data_path = Path('/kaggle/input/csiro-biomass')

# Targets that are modelled directly by the 3-target cross-validation.
labels_to_predict = ["Dry_Clover_g", "Dry_Dead_g", "Dry_Green_g"]

# All targets, in the column order used by every prediction matrix below.
labels = ["Dry_Clover_g", "Dry_Dead_g", "Dry_Green_g", "Dry_Total_g", "GDM_g"]

train = pl.read_csv(data_path / 'train.csv')

# train.csv is in long format (one row per image and target). Spread `target`
# into one column per label; rows describing another target get null there.
target_columns = [
    pl.when(pl.col('target_name') == label).then(pl.col('target')).alias(label)
    for label in labels
]

# Collapse to one row per image: the mean of each label column, plus a
# "date state" key taken from the image's first row, used to group CV folds.
per_image_targets = [pl.col(label).mean() for label in labels]
group_key = (
    pl.concat_str(["Sampling_Date", "State"], separator=" ")
    .alias("group")
    .first()
)

df = (
    train
    .with_columns(target_columns)
    .group_by('image_path')
    .agg(per_image_targets + [group_key])
    .sort('image_path')
)

df

image_path,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g,group
str,f64,f64,f64,f64,f64,str
"""train/ID1011485656.jpg""",0.0,31.9984,16.2751,48.2735,16.275,"""2015/9/4 Tas"""
"""train/ID1012260530.jpg""",0.0,0.0,7.6,7.6,7.6,"""2015/4/1 NSW"""
"""train/ID1025234388.jpg""",6.05,0.0,0.0,6.05,6.05,"""2015/9/1 WA"""
"""train/ID1028611175.jpg""",0.0,30.9703,24.2376,55.2079,24.2376,"""2015/5/18 Tas"""
"""train/ID1035947949.jpg""",0.4343,23.2239,10.5261,34.1844,10.9605,"""2015/9/11 Tas"""
…,…,…,…,…,…,…
"""train/ID975115267.jpg""",40.03,0.0,0.8,40.83,40.83,"""2015/7/8 WA"""
"""train/ID978026131.jpg""",24.6445,4.1948,12.0601,40.8994,36.7046,"""2015/9/4 Tas"""
"""train/ID980538882.jpg""",0.0,1.1457,91.6543,92.8,91.6543,"""2015/2/24 NSW"""
"""train/ID980878870.jpg""",32.3575,0.0,2.0325,34.39,34.39,"""2015/7/8 WA"""


In [30]:
test = pl.read_csv(data_path / 'test.csv')
# Debug alternative: run the inference path on the training rows instead.
# test = train.select('sample_id', 'image_path', 'target_name')

# One row per test image (with its number of submission rows), sorted by path.
df_test = test.group_by('image_path').len().sort('image_path')

df_test

image_path,len
str,u32
"""test/ID1001187975.jpg""",5


In [31]:
def compute_features(images, save_path, batch_size=20):
    """Embed images with the module-level model and write the vectors as NDJSON.

    Args:
        images: Sequence of image paths relative to ``data_path``. It must
            support ``len()`` and slicing.
        save_path: Output file. One JSON object is written per image, in input
            order, with keys ``x_0``, ``x_1``, ... holding the feature values.
        batch_size: Number of images sent through the model per forward pass.
    """
    with torch.no_grad(), open(save_path, 'w') as f:
        for i in tqdm(range(0, len(images), batch_size)):
            batch = []
            for p in images[i:i + batch_size]:
                # Image.open is lazy and keeps the file open; copy() loads the
                # pixels so the handle can be released immediately.
                with Image.open(data_path / p) as img:
                    batch.append(img.copy())

            inputs = processor(images=batch, return_tensors="pt").to(model.device)
            features = model.get_image_features(**inputs)

            # One device-to-host transfer per batch instead of one .item()
            # call per feature value.
            for row in features.cpu().tolist():
                data = {f'x_{j}': value for j, value in enumerate(row)}
                f.write(json.dumps(data) + '\n')

In [32]:
# Embed every training image, in the row order of df, into features.ndjson.
compute_features(df['image_path'], 'features.ndjson')

  0%|          | 0/18 [00:00<?, ?it/s]

In [33]:
# Embed every test image, in the row order of df_test, into features_test.ndjson.
compute_features(df_test['image_path'], 'features_test.ndjson')

  0%|          | 0/1 [00:00<?, ?it/s]

In [34]:
# Read the training embeddings back from disk (columns x_0, x_1, ...).
responses = pl.read_ndjson('features.ndjson')

In [35]:
# Read the test embeddings back from disk and display them.
responses_test = pl.read_ndjson('features_test.ndjson')
responses_test

x_18,x_428,x_238,x_227,x_326,x_692,x_995,x_978,x_1008,x_138,x_216,x_394,x_281,x_635,x_684,x_547,x_317,x_460,x_57,x_876,x_221,x_1063,x_1118,x_941,x_853,x_204,x_55,x_219,x_840,x_965,x_58,x_94,x_1040,x_752,x_833,x_936,x_592,…,x_257,x_614,x_957,x_718,x_874,x_1048,x_322,x_787,x_408,x_1119,x_425,x_985,x_1106,x_244,x_849,x_1030,x_621,x_159,x_177,x_284,x_670,x_798,x_155,x_193,x_229,x_171,x_491,x_383,x_832,x_885,x_32,x_213,x_512,x_901,x_904,x_403,x_1085
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.021855,-0.11219,-0.224823,-0.523304,-0.004981,-0.509943,0.762589,0.283644,0.143005,0.090222,0.042954,-0.019346,-0.098607,-1.217008,0.033259,0.396746,0.72643,0.26275,-0.84413,-0.090972,0.459834,0.058255,0.342252,0.698421,-0.428012,0.651857,0.093385,-1.425974,-0.281697,-0.499629,-0.030316,-0.696083,-0.023989,0.369038,-0.117839,-0.212519,0.129441,…,0.428458,-1.071204,0.060508,0.052715,-0.707487,-0.042799,-0.413874,0.282822,-0.173101,-0.136621,0.587764,-0.135173,-0.395921,-0.392796,0.081903,0.313277,-0.75536,-0.623839,0.155232,-0.144018,0.157528,-0.303595,-0.186702,0.056899,0.193904,0.08469,-0.15959,-0.185376,0.101637,0.552308,-0.21446,0.363656,0.15399,0.130986,-0.070049,-0.910133,-0.138152


In [36]:
# Put the embedding columns next to the targets. A horizontal concat pairs
# rows by position, so this relies on the features having been written in
# the row order of df.
df_aug = pl.concat([df, responses], how='horizontal')
df_aug

image_path,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g,group,x_18,x_428,x_238,x_227,x_326,x_692,x_995,x_978,x_1008,x_138,x_216,x_394,x_281,x_635,x_684,x_547,x_317,x_460,x_57,x_876,x_221,x_1063,x_1118,x_941,x_853,x_204,x_55,x_219,x_840,x_965,…,x_257,x_614,x_957,x_718,x_874,x_1048,x_322,x_787,x_408,x_1119,x_425,x_985,x_1106,x_244,x_849,x_1030,x_621,x_159,x_177,x_284,x_670,x_798,x_155,x_193,x_229,x_171,x_491,x_383,x_832,x_885,x_32,x_213,x_512,x_901,x_904,x_403,x_1085
str,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""train/ID1011485656.jpg""",0.0,31.9984,16.2751,48.2735,16.275,"""2015/9/4 Tas""",-0.14428,0.022661,-0.016167,-0.252088,-0.373039,-1.007389,0.470893,0.370865,0.493748,0.029173,-0.025503,0.41075,0.402937,-1.011704,0.457613,0.0487,0.434666,0.427441,-0.046315,-0.333254,0.235416,-0.270882,0.135359,0.169699,-0.382907,0.323221,-0.283825,-1.271094,-0.152305,-0.050661,…,0.290558,-0.936497,-0.050425,0.379897,-0.344531,-0.146995,-0.25363,0.051748,-0.064001,0.083368,0.353815,-0.063298,-0.307304,-0.401229,0.095408,0.159433,-0.85181,-0.39263,-0.024952,0.107037,0.108022,-0.364648,-0.283495,0.035661,0.054606,-0.138971,-0.233365,0.490649,0.171849,0.656227,-0.735559,0.312968,0.201796,-0.099082,-0.183408,-0.698468,-0.066766
"""train/ID1012260530.jpg""",0.0,0.0,7.6,7.6,7.6,"""2015/4/1 NSW""",-0.50553,-0.174203,-0.209916,0.791203,-0.388388,-0.404259,-0.080089,0.476183,0.118312,-1.078758,-0.299929,0.323835,-0.299373,-0.52807,0.168382,-0.207139,0.32973,0.042723,-0.270965,0.082398,0.164731,0.4652,0.599991,0.256269,0.219555,1.310713,-0.162887,-1.431642,-0.420189,0.18867,…,-0.994404,-0.882589,-0.152656,0.538201,0.40432,0.511285,-0.307153,-0.224805,-0.525184,-0.35031,0.945852,0.544222,-0.227485,-0.323405,0.225077,1.002002,0.197086,-0.941843,-0.024058,-0.180228,0.097282,0.355393,0.005305,0.02672,-0.438549,-0.73386,-0.207101,0.439738,0.2629,1.461051,-0.306708,-0.683787,-0.014259,-0.086599,-0.050309,1.218574,-0.174019
"""train/ID1025234388.jpg""",6.05,0.0,0.0,6.05,6.05,"""2015/9/1 WA""",0.076407,0.583089,0.135113,-0.01391,-0.479146,-0.85857,0.014412,0.147023,0.235496,-0.054349,0.057677,0.520417,-0.704632,-0.787462,0.341955,0.127134,0.328471,0.495831,0.123691,-0.275305,0.184393,0.418725,0.251698,-0.051683,0.105278,1.236495,0.113227,-0.482178,0.06815,0.463661,…,-0.909211,-0.82552,0.353518,-0.314552,-0.035847,0.805015,0.24582,0.328699,0.175555,0.31297,0.204382,0.104212,-0.365857,-0.341132,0.020763,0.90317,0.158329,-0.679189,-0.26418,-0.030489,0.080104,0.121713,-0.031238,0.07634,-0.077614,-0.2494,0.09256,0.600153,0.165403,1.018795,-0.313015,-0.627181,0.320098,0.482728,-0.124021,-0.413536,0.429877
"""train/ID1028611175.jpg""",0.0,30.9703,24.2376,55.2079,24.2376,"""2015/5/18 Tas""",-0.176732,-0.237737,-0.164139,-0.48794,-0.383988,-0.694647,0.523053,0.491013,0.367791,-0.07502,0.024331,0.729208,0.104373,-1.269079,0.386929,0.195232,0.281811,0.426861,-0.505983,-0.025748,0.293831,-0.243678,0.34247,0.183173,-0.51128,0.504397,-0.121957,-1.412337,-0.322522,0.233878,…,0.005229,-1.24251,-0.17279,0.35949,0.365571,-0.268168,-0.104888,0.245491,0.099797,0.111823,0.062114,-0.198651,-0.147368,-0.443199,0.152185,0.1919,-0.948426,-0.40777,-0.008985,-0.105275,0.171362,-0.405823,-0.484946,0.134108,0.209546,-0.033382,-0.293876,0.357906,0.481876,0.729462,-0.92972,0.223204,0.272503,0.166802,-0.299081,-0.39095,-0.001605
"""train/ID1035947949.jpg""",0.4343,23.2239,10.5261,34.1844,10.9605,"""2015/9/11 Tas""",-0.16011,-0.198612,0.215693,-0.641983,-0.387972,-0.710815,0.459952,0.877919,0.222191,-0.032446,0.13172,0.564582,0.527081,-1.134573,0.178638,0.256365,0.442387,0.489343,-0.10444,-0.155133,0.217771,-0.112873,0.131081,0.258752,-0.273807,-0.087716,-0.2746,-1.334072,-0.304055,-0.114108,…,0.417028,-1.133934,-0.16851,0.357199,-0.375038,-0.058373,-0.290897,-0.028579,0.312092,0.085109,-0.127468,-0.151086,-0.23937,-0.085151,0.039099,0.133287,-0.753938,-0.331101,-0.058651,-0.180264,0.146094,-0.405771,-0.300778,-0.076378,0.062957,-0.357927,-0.476415,0.053564,0.139463,0.9548,-1.123362,0.405813,0.171786,0.204473,-0.180852,-1.01453,0.16884
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""train/ID975115267.jpg""",40.03,0.0,0.8,40.83,40.83,"""2015/7/8 WA""",-0.251324,-0.076377,0.394964,-0.004022,0.04943,-0.970458,0.329312,0.86749,0.038912,-0.339821,0.341495,0.729157,0.10231,-0.123478,-0.20075,-0.182662,0.097771,0.162247,-0.535461,0.101766,0.065497,-0.256814,0.102167,0.295147,0.022359,0.696981,0.006689,-1.481401,-0.011683,0.285525,…,-0.23701,-0.565795,-0.435149,-0.171302,0.029538,-0.010158,0.208061,-0.376769,0.058264,0.582177,0.113137,-0.089064,-0.40028,0.083485,0.32005,0.58166,-0.162976,-0.409445,-0.103491,-0.339782,0.07546,-0.531538,-0.009439,-0.175626,0.071119,-0.382681,-0.548976,0.113167,0.065141,1.256686,-0.567553,-0.547734,-0.197013,0.569921,-0.184817,-0.320904,0.020584
"""train/ID978026131.jpg""",24.6445,4.1948,12.0601,40.8994,36.7046,"""2015/9/4 Tas""",-0.749154,0.033471,0.237877,-0.633689,-0.411214,-0.68942,0.292797,0.940826,-0.138721,-0.372986,0.02258,0.359104,-0.364324,-0.505588,-0.198005,0.207238,0.468996,0.417018,-0.669029,0.104746,0.14601,0.101905,-0.050969,0.486955,-0.648338,0.770642,0.164795,-1.581013,-0.255879,0.384023,…,-0.511893,-0.97628,-0.156637,0.041702,0.083636,-0.005332,-0.289047,-0.32638,0.193974,-0.087619,0.544274,-0.158276,-0.270389,-0.336421,0.425076,0.682788,-0.025419,-0.652876,-0.35392,-0.422138,0.124583,-0.040143,-0.329908,0.123429,-0.529237,0.310421,-0.150769,0.489589,0.291858,1.159555,-1.130683,-0.57779,0.386335,0.71749,-0.137963,0.076135,-0.057092
"""train/ID980538882.jpg""",0.0,1.1457,91.6543,92.8,91.6543,"""2015/2/24 NSW""",-0.655161,-0.45427,-0.012784,-0.485317,-0.563242,-0.446824,0.779508,0.40648,0.569569,-0.29797,0.139651,0.511493,-0.413237,-0.650209,0.182923,0.477249,0.700911,0.200929,-0.269202,-0.00578,0.353442,-0.232776,-0.174187,0.352242,-0.365292,1.090916,-0.026864,-1.434443,-0.901517,0.238826,…,0.147501,-0.858144,-0.00983,0.593094,-0.149189,-0.25411,-0.156369,0.081549,-0.417275,0.033948,0.872927,-0.040444,-0.209113,-0.075345,0.164347,0.631424,-0.769548,-0.64087,-0.163553,-0.208221,0.301756,-0.01093,-0.409392,0.157296,-0.039051,-0.132768,-0.040609,0.218639,0.245671,0.762676,-0.677249,0.343909,0.224381,0.682979,-0.125324,-0.133455,0.084902
"""train/ID980878870.jpg""",32.3575,0.0,2.0325,34.39,34.39,"""2015/7/8 WA""",-0.582185,-0.365328,0.372645,0.156657,0.292938,-0.900554,0.347478,1.020979,0.089215,-0.791327,0.059097,0.79052,-0.299681,-1.173181,-0.293164,0.1653,0.083154,0.439576,-0.048235,-0.268969,-0.496065,0.259831,-0.341085,0.343919,0.233549,0.52087,-0.089676,-0.776876,-0.334907,0.573624,…,-0.404451,-0.48562,-0.211094,-0.158696,0.344861,0.087284,-0.025676,-0.6131,0.18984,0.583204,0.50312,-0.14218,-0.229894,-0.100497,0.094838,0.675678,0.056288,-0.469968,-0.222886,-0.18767,0.011535,-0.109287,-0.323503,0.194792,0.102187,-0.380447,-0.197716,0.586398,0.592757,1.401184,-1.153577,-0.546638,0.301224,0.772208,-0.536389,-0.384087,0.11187


In [37]:
# Same positional pairing for the test images: this relies on the features
# having been written in the row order of df_test.
df_test_aug = pl.concat([df_test, responses_test], how='horizontal')
df_test_aug

image_path,len,x_18,x_428,x_238,x_227,x_326,x_692,x_995,x_978,x_1008,x_138,x_216,x_394,x_281,x_635,x_684,x_547,x_317,x_460,x_57,x_876,x_221,x_1063,x_1118,x_941,x_853,x_204,x_55,x_219,x_840,x_965,x_58,x_94,x_1040,x_752,x_833,…,x_257,x_614,x_957,x_718,x_874,x_1048,x_322,x_787,x_408,x_1119,x_425,x_985,x_1106,x_244,x_849,x_1030,x_621,x_159,x_177,x_284,x_670,x_798,x_155,x_193,x_229,x_171,x_491,x_383,x_832,x_885,x_32,x_213,x_512,x_901,x_904,x_403,x_1085
str,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""test/ID1001187975.jpg""",5,-0.021855,-0.11219,-0.224823,-0.523304,-0.004981,-0.509943,0.762589,0.283644,0.143005,0.090222,0.042954,-0.019346,-0.098607,-1.217008,0.033259,0.396746,0.72643,0.26275,-0.84413,-0.090972,0.459834,0.058255,0.342252,0.698421,-0.428012,0.651857,0.093385,-1.425974,-0.281697,-0.499629,-0.030316,-0.696083,-0.023989,0.369038,-0.117839,…,0.428458,-1.071204,0.060508,0.052715,-0.707487,-0.042799,-0.413874,0.282822,-0.173101,-0.136621,0.587764,-0.135173,-0.395921,-0.392796,0.081903,0.313277,-0.75536,-0.623839,0.155232,-0.144018,0.157528,-0.303595,-0.186702,0.056899,0.193904,0.08469,-0.15959,-0.185376,0.101637,0.552308,-0.21446,0.363656,0.15399,0.130986,-0.070049,-0.910133,-0.138152


# Set up validation

In [38]:
# Weight of each target in the evaluation metric.
weights = {
    'Dry_Green_g': 0.1,
    'Dry_Dead_g': 0.1,
    'Dry_Clover_g': 0.1,
    'GDM_g': 0.2,
    'Dry_Total_g': 0.5,
}


def competition_metric(y_true, y_pred) -> float:
    """Return a weighted R^2 computed jointly over all targets.

    Column ``l`` of both ``y_true`` and ``y_pred`` must hold the values for
    ``labels[l]``. The baseline in the denominator is a single scalar: the
    weighted sum of the per-target means of ``y_true``.
    """
    column_weights = [(col, weights[name]) for col, name in enumerate(labels)]

    # Weighted combination of the per-target means, shared by all targets.
    baseline = sum(y_true[:, col].mean() * w for col, w in column_weights)

    # Weighted mean squared error of the predictions ...
    residual = sum(
        ((y_true[:, col] - y_pred[:, col])**2).mean() * w
        for col, w in column_weights
    )
    # ... and of the constant baseline.
    total = sum(
        ((y_true[:, col] - baseline)**2).mean() * w
        for col, w in column_weights
    )

    return 1 - residual / total

In [39]:
def cross_validate(model, data, data_test, x_columns, random_state=42) -> tuple[np.ndarray, np.ndarray]:
    """Run grouped 5-fold CV over all targets and apply the PCA trick.

    One copy of ``model`` is fitted per fold and per entry of ``labels``.
    Each fold's predictions are then projected onto the 3-component PCA
    subspace fitted on that fold's training targets and clipped at zero.

    Args:
        model: Unfitted estimator with ``fit``/``predict``; deep-copied for
            every fit, so the passed instance is never modified.
        data: Training frame holding ``x_columns``, every column in
            ``labels`` and a ``group`` column.
        data_test: Test frame holding ``x_columns``.
        x_columns: Feature column names; must not contain any target name.
        random_state: Accepted for signature compatibility; not used here.

    Returns:
        ``(y_pred, y_pred_test)``: out-of-fold predictions for ``data`` and
        fold-averaged predictions for ``data_test``, one column per label.
    """
    assert not any((col in labels for col in x_columns))
    X = data.select(x_columns).to_numpy()
    X_test = data_test.select(x_columns).to_numpy()
    y_true = data.select(labels).to_numpy()
    y_pred = np.zeros([len(X), len(labels)])
    y_pred_test = np.zeros([len(X_test), len(labels)])

    n_splits = 5
    kf = GroupKFold(n_splits=n_splits)
    # GroupKFold expects a 1-D array of group ids, not a one-column frame.
    groups = data['group'].to_numpy()

    for i, (train_index, test_index) in enumerate(kf.split(X, groups=groups)):
        pca = PCA(3).fit(y_true[train_index])
        fold_pred_test = np.zeros_like(y_pred_test)
        for l in range(len(labels)):
            m = deepcopy(model)
            m.fit(X[train_index], y_true[train_index, l])
            y_pred[test_index, l] = m.predict(X[test_index]).clip(0)
            fold_pred_test[:, l] = m.predict(X_test).clip(0)

        # Apply the PCA trick
        y_pred[test_index] = pca.inverse_transform(pca.transform(y_pred[test_index])).clip(0)
        # Project this fold's complete test prediction before averaging it
        # in. The projection is affine (it centres on the PCA mean), so it
        # must not be applied to the partially accumulated sum.
        y_pred_test += pca.inverse_transform(pca.transform(fold_pred_test)).clip(0) / n_splits

        print(f'Fold {i}:', competition_metric(y_true[test_index], y_pred[test_index]))

    print('Full CV:', competition_metric(y_true, y_pred))

    return y_pred, y_pred_test

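New version of `cross_validate` that predicts only 3 targets and derives the other two from them. It has the same name as the PCA version above, so it replaces that version for all calls below.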

In [40]:
def cross_validate(model, data, data_test, x_columns, random_state=SEED) -> tuple[np.ndarray, np.ndarray]:
    """Run grouped 5-fold CV on the 3 component targets and derive the rest.

    One copy of ``model`` is fitted per fold and per entry of
    ``labels_to_predict``. ``Dry_Total_g`` and ``GDM_g`` are not modelled;
    they are computed as sums of the three predicted components.

    Args:
        model: Unfitted estimator with ``fit``/``predict``; deep-copied for
            every fit, so the passed instance is never modified.
        data: Training frame holding ``x_columns``, every column in
            ``labels`` and a ``group`` column.
        data_test: Test frame holding ``x_columns``.
        x_columns: Feature column names; must not contain any target name.
        random_state: Base seed. Fold ``i`` uses ``random_state + i`` for
            estimators that expose a ``random_state`` attribute.

    Returns:
        ``(y_pred_all, y_pred_test_full)``: out-of-fold predictions for
        ``data`` and fold-averaged predictions for ``data_test``, one column
        per entry of ``labels``.
    """
    assert not any((col in labels for col in x_columns))
    X = data.select(x_columns).to_numpy()
    X_test = data_test.select(x_columns).to_numpy()
    y_true = data.select(labels).to_numpy()

    # Predict only 3 targets
    y_pred = np.zeros([len(X), len(labels_to_predict)])
    y_pred_test = np.zeros([len(X_test), len(labels_to_predict)])

    n_splits = 5
    kf = GroupKFold(n_splits=n_splits)
    # GroupKFold expects a 1-D array of group ids, not a one-column frame.
    groups = data['group'].to_numpy()

    for i, (train_index, test_index) in enumerate(kf.split(X, groups=groups)):
        # Train models only for 3 targets
        for l, label in enumerate(labels_to_predict):
            m = deepcopy(model)
            # Set random state if model supports it
            if hasattr(m, 'random_state'):
                m.set_params(random_state=random_state + i)

            m.fit(X[train_index], y_true[train_index, labels.index(label)])
            y_pred[test_index, l] = m.predict(X[test_index]).clip(0)
            y_pred_test[:, l] += m.predict(X_test).clip(0) / n_splits

        y_pred_full = _derive_all_targets(y_pred[test_index])
        print(f'Fold {i}:', competition_metric(y_true[test_index], y_pred_full))

    # Every training row was in exactly one validation fold, so y_pred already
    # holds the complete out-of-fold predictions; no second training pass is
    # needed to score the full CV.
    y_pred_all = _derive_all_targets(y_pred)
    y_pred_test_full = _derive_all_targets(y_pred_test)

    print('Full CV:', competition_metric(y_true, y_pred_all))

    return y_pred_all, y_pred_test_full


def _derive_all_targets(component_pred):
    """Expand predictions for ``labels_to_predict`` to one column per label."""
    by_label = {
        label: component_pred[:, j]
        for j, label in enumerate(labels_to_predict)
    }
    full = np.zeros([len(component_pred), len(labels)])
    for label, values in by_label.items():
        full[:, labels.index(label)] = values
    # The two remaining targets are sums of the predicted components.
    full[:, labels.index('Dry_Total_g')] = (
        by_label['Dry_Green_g'] + by_label['Dry_Dead_g'] + by_label['Dry_Clover_g']
    )
    full[:, labels.index('GDM_g')] = by_label['Dry_Clover_g'] + by_label['Dry_Green_g']
    return full

# Choose a model

In [41]:
# Baseline: a DummyRegressor. The trailing ';' hides the returned arrays in the notebook.
cross_validate(DummyRegressor(), df_aug, df_test_aug, sorted(responses.columns));

Fold 0: 0.23631087693306596
Fold 1: 0.20370740350549588
Fold 2: 0.24489722631682453
Fold 3: -0.01108827644918664
Fold 4: 0.10359235771267639
Full CV: 0.20051833009925546


In [42]:
# Linear baseline: Ridge with default settings on the raw embedding columns.
cross_validate(Ridge(), df_aug, df_test_aug, sorted(responses.columns));

Fold 0: 0.19564421593403813
Fold 1: 0.5048714068439633
Fold 2: 0.5077795704233967
Fold 3: 0.42494666458404684
Fold 4: 0.6151084211231383
Full CV: 0.5123161510108214


In [43]:
# Linear baseline: Lasso with default settings on the raw embedding columns.
cross_validate(Lasso(), df_aug, df_test_aug, sorted(responses.columns));

Fold 0: 0.5205685822036861
Fold 1: 0.5251013019932689
Fold 2: 0.3753713292531703
Fold 3: 0.5172220004107317
Fold 4: 0.41358303046929845
Full CV: 0.48931030074022075


In [44]:
# scikit-learn gradient boosting; only the test predictions are kept.
gb_model = GradientBoostingRegressor(random_state=SEED)
_, pred_test_gb = cross_validate(
    gb_model,
    df_aug,
    df_test_aug,
    sorted(responses.columns),
    random_state=SEED,
)

Fold 0: 0.4171869697132212
Fold 1: 0.6141634993630927
Fold 2: 0.5857304168221811
Fold 3: 0.5322259217557476
Fold 4: 0.6080671594698155
Full CV: 0.5909506421843125


In [None]:
# CatBoost with 100 iterations and silenced logging; only the test predictions are kept.
cb_model = catboost.CatBoostRegressor(verbose=False, iterations=100, random_seed=SEED)
_, pred_test_cb = cross_validate(
    cb_model,
    df_aug,
    df_test_aug,
    sorted(responses.columns),
    random_state=SEED,
)

Fold 0: 0.4483536833417333
Fold 1: 0.595307807066345
Fold 2: 0.5246354551096314
Fold 3: 0.5765429546088836
Fold 4: 0.5049897662619771


In [None]:
# XGBoost with 100 trees and squared-error loss; only the test predictions are kept.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=SEED)
_, pred_test_xgb = cross_validate(
    xgb_model,
    df_aug,
    df_test_aug,
    sorted(responses.columns),
    random_state=SEED,
)

I average the test predictions of the GradientBoosting and CatBoost models.

In [None]:
# Equal-weight blend of the gradient boosting and CatBoost test predictions.
pred_test = (pred_test_gb + pred_test_cb) / 2

# Save predictions

In [None]:
# Name the prediction columns after the targets and attach them to the test
# image paths; rows are paired by position with df_test.
pred_frame = pl.DataFrame(pred_test, schema=labels)
pred_with_id = pl.concat([df_test, pred_frame], how='horizontal')
pred_with_id

In [None]:
# For each submission row, keep the prediction column named by its
# target_name; all other candidates are null and dropped by coalesce.
target_candidates = [
    pl.when(pl.col('target_name') == col).then(pl.col(col))
    for col in labels
]

pred_save = (
    test
    .join(pred_with_id, on='image_path')
    .with_columns(pl.coalesce(*target_candidates).alias('target'))
    .select('sample_id', 'target')
)
pred_save

In [None]:
# Write the (sample_id, target) table as the submission file.
pred_save.write_csv('submission.csv')