In [None]:
# ================================================================
#                   CSIRO BIOMASS PREDICTION – NOTEBOOK FINAL
# ================================================================

# =============================
# 1. IMPORTS Y CONFIGURACIÓN
# =============================

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Select matplotlib's "seaborn-v0_8" style sheet for the figures drawn in this notebook.
plt.style.use("seaborn-v0_8")

# ML – tabular
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Deep Learning – CNN
import tensorflow as tf
from tensorflow.keras import layers, models

# Utilidades
from math import sqrt

# =============================
# 2. CARGA DE DATOS
# =============================

# Root folder of the competition files; the CSVs are read from here and
# path_full() later joins image paths onto the same folder.
DATA_DIR = "data/csiro-biomass"

# Read both splits through the same expression so they are parsed identically.
train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

# Report (rows, columns) of each split.
print("TRAIN SHAPE:", train_df.shape)
print("TEST SHAPE:", test_df.shape)

# Last expression of the cell: shown by the notebook as a preview of the first rows.
train_df.head()


In [None]:
# =============================
# 3. EXPLORACIÓN INICIAL
# =============================

# Summary statistics as produced by DataFrame.describe() with its defaults.
print(train_df.describe())

# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() emitted a stray "None".
train_df.info()

# Missing values per column
print("\nMissing values:")
print(train_df.isnull().sum())

# Number of rows for each target_name value
sns.countplot(y=train_df["target_name"])
plt.title("Distribución de componentes de biomasa")
plt.show()



In [None]:
# =============================
# 4. FEATURE ENGINEERING
# =============================

# Build the working frame in one step: assign() returns a new DataFrame, so
# train_df itself is left untouched. The keyword arguments are evaluated in
# order, which lets Year and Month read the already-parsed Sampling_Date.
df = train_df.assign(
    Sampling_Date=lambda frame: pd.to_datetime(frame["Sampling_Date"]),
    Year=lambda frame: frame["Sampling_Date"].dt.year,
    Month=lambda frame: frame["Sampling_Date"].dt.month,
)

# Numeric / categorical feature columns used by the tabular models
num_cols = ["Pre_GSHH_NDVI", "Height_Ave_cm", "Year", "Month"]
cat_cols = ["State", "Species", "target_name"]

# Feature matrix (numeric columns first, then categorical) and regression target.
X = df.loc[:, [*num_cols, *cat_cols]]
y = df["target"]

print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)


In [None]:
# ================================================================
# 5. MODELO BASELINE – REGRESIÓN LINEAL
# ================================================================

from sklearn.linear_model import LinearRegression

# Column-wise preprocessing reused by the tabular models: standardise the
# numeric columns and one-hot encode the categorical ones
# (handle_unknown="ignore" keeps transform from failing on unseen categories).
preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

# Baseline: preprocessing followed by ordinary least squares.
baseline_model = Pipeline(steps=[("prep", preprocess), ("lr", LinearRegression())])

# 5-fold cross-validation. The scorer returns the *negated* RMSE because
# scikit-learn scorers follow a "greater is better" convention.
scores = cross_val_score(
    estimator=baseline_model,
    X=X,
    y=y,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Flip the sign back to obtain one positive RMSE per fold.
rmse_baseline = np.negative(scores)
print("RMSE Baseline:", rmse_baseline.mean(), "+/-", rmse_baseline.std())


In [None]:
# ================================================================
# 6. MODELO XGBOOST – BÚSQUEDA DE HIPERPARÁMETROS
# ================================================================

# Same preprocessing as the baseline, followed by a gradient-boosted regressor.
xgb_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("xgb", XGBRegressor(objective="reg:squarederror", random_state=42)),
])

# Exhaustive grid: 3 * 2 * 2 * 2 * 2 * 3 * 2 = 288 candidates, each fitted on
# 5 folds. The "xgb__" prefix routes a parameter to the "xgb" pipeline step.
param_grid = {
    "xgb__n_estimators": [500, 800, 900],
    "xgb__learning_rate": [0.01, 0.05],
    "xgb__max_depth": [4, 5],
    "xgb__subsample": [0.8, 0.9],
    "xgb__colsample_bytree": [0.8, 0.9],
    "xgb__gamma": [0, 0.3, 0.4],
    "xgb__min_child_weight": [1, 3],
}

grid = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=1,
)
grid.fit(X, y)

print("\nMejores hiperparámetros:", grid.best_params_)
# best_score_ is the negated RMSE, so the sign is flipped for display.
print("Mejor RMSE:", -grid.best_score_)

# Pipeline refitted on all of X, y with the winning parameters
# (GridSearchCV's default refit behaviour).
model_xgb = grid.best_estimator_


In [None]:
# ================================================================
# 7. MODELO CNN POR COMPONENTE DE BIOMASA
# ================================================================

# Image-pipeline constants
IMG_SIZE = 224      # images are resized to IMG_SIZE x IMG_SIZE before entering the CNN
BATCH_SIZE = 32     # NOTE(review): not referenced in the visible cells, which call .batch(16) - confirm intent
AUTOTUNE = tf.data.AUTOTUNE


def path_full(rel_path):
    """Return `rel_path` joined onto DATA_DIR."""
    joined = os.path.join(DATA_DIR, rel_path)
    return joined


# Resolve every image path of the training table once.
train_df["full_path"] = train_df["image_path"].map(path_full)

# One CNN is trained per biomass component; the trained models are kept in
# cnn_models, keyed by target_name.
targets = train_df["target_name"].unique()
cnn_models = dict()

def create_cnn():
    """Build and compile a small convolutional regressor for one biomass target.

    The network expects images of shape (IMG_SIZE, IMG_SIZE, 3) whose pixel
    values were already divided by 255 by the tf.data loaders in this
    notebook (load_img / load_test). It therefore contains no Rescaling
    layer: rescaling a second time squeezed the inputs into [0, 1/255].

    Returns:
        A compiled Sequential model with a single linear output unit,
        using the Adam optimizer, MSE loss and an RMSE metric.
    """
    model = models.Sequential([
        # Explicit Input layer instead of the deprecated `input_shape=`
        # argument on the first layer.
        layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
        layers.Conv2D(32, 3, activation="relu"),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation="relu"),
        layers.MaxPooling2D(),
        layers.Conv2D(128, 3, activation="relu"),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(64, activation="relu"),
        # Single unit without activation: unbounded regression output.
        layers.Dense(1)
    ])
    model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model


In [None]:
def load_img(path, label):
    """Read the JPEG at `path`, resize it to IMG_SIZE x IMG_SIZE and divide by 255.

    Returns the (image, label) pair; `label` is passed through unchanged.
    Defined once, outside the loop, because it does not depend on the target.
    """
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    return img / 255.0, label


# Train one network per target_name
for tname in targets:
    print(f"\n=== Entrenando CNN para {tname} ===")

    # Rows of this biomass component; train_df already carries "full_path",
    # so it is not recomputed here.
    df_t = train_df[train_df["target_name"] == tname]
    paths = df_t["full_path"].values
    labels = df_t["target"].values

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    ds = (
        # Keras does not shuffle tf.data inputs itself, so without this step
        # every epoch would see the samples in CSV order. Shuffling happens
        # before map(), i.e. on the cheap (path, label) pairs, with a buffer
        # covering the whole subset.
        ds.shuffle(len(paths), seed=42, reshuffle_each_iteration=True)
        .map(load_img, num_parallel_calls=AUTOTUNE)
        .batch(16)
        .prefetch(AUTOTUNE)
    )

    model = create_cnn()
    model.fit(ds, epochs=5, verbose=1)

    cnn_models[tname] = model


In [None]:
# ================================================================
# 8. GENERAR SUBMISSION FINAL (CNN)
# ================================================================

test_df["full_path"] = test_df["image_path"].apply(path_full)


def load_test(path):
    """Read the JPEG at `path`, resize it to IMG_SIZE x IMG_SIZE and divide by 255."""
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    return img / 255.0


# Fail early, with a readable message, if the test set asks for a component
# that has no trained network.
unmodelled = set(test_df["target_name"].dropna().unique()) - set(cnn_models)
if unmodelled:
    raise ValueError(
        f"No trained CNN for target_name value(s): {sorted(map(str, unmodelled))}"
    )

# One slot per test row, in test_df order. Predictions are written back through
# a boolean row mask so each value stays attached to its own sample_id; simply
# appending them target by target only lines up with test_df when its rows
# happen to be grouped by target in that same order.
preds = np.full(len(test_df), np.nan, dtype="float32")

for tname in targets:
    print(f"\n>>> Prediciendo para {tname}")
    row_mask = (test_df["target_name"] == tname).to_numpy()
    if not row_mask.any():
        # Nothing to predict for this component in the test set.
        continue

    ds_test = tf.data.Dataset.from_tensor_slices(test_df.loc[row_mask, "full_path"].values)
    ds_test = ds_test.map(load_test).batch(16)

    preds[row_mask] = cnn_models[tname].predict(ds_test).flatten()

submission_cnn = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "target": preds
})

submission_cnn.to_csv("submission_cnn_per_target.csv", index=False)
submission_cnn.head()


In [None]:
# ================================================================
# 9. GUARDAR SUBMISSION XGBOOST Y COMPARAR
# ================================================================

# The fitted pipeline selects its inputs by column name (num_cols + cat_cols),
# including "Year" and "Month", which exist only after the date feature
# engineering. Apply the same derivation to the test table instead of passing
# the raw frame.
test_features = test_df.copy()
if "Sampling_Date" in test_features.columns:
    test_dates = pd.to_datetime(test_features["Sampling_Date"])
    test_features["Year"] = test_dates.dt.year
    test_features["Month"] = test_dates.dt.month

# Stop with an explicit message when the test table cannot supply every
# feature the model was trained on.
feature_cols = num_cols + cat_cols
missing_cols = [col for col in feature_cols if col not in test_features.columns]
if missing_cols:
    raise ValueError(
        f"test_df lacks columns required by the XGBoost pipeline: {missing_cols}"
    )

# Same columns, in the same order, as the training matrix X.
X_test_tab = test_features[feature_cols]

pred_xgb = model_xgb.predict(X_test_tab)

submission_xgb = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "target": pred_xgb
})

submission_xgb.to_csv("submission_model2_xgb.csv", index=False)
submission_xgb.head()
