In [None]:
# 0) Install CatBoost
# The leading "!" hands this line to the notebook's shell instead of Python;
# "-q" asks pip for quiet output. The package must be installed before the
# "from catboost import ..." line in the imports section can succeed.
!pip install -q catboost

# 1) Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool

# 2) Load CSV (upload)
from google.colab import files

# files.upload() shows Colab's upload widget; the result is used as a mapping
# keyed by uploaded file name. Only the first uploaded file is read below.
uploaded = files.upload()
if not uploaded:
    # Nothing uploaded (e.g. the dialog was cancelled): fail with a clear
    # message instead of a bare StopIteration from next().
    raise ValueError("No file was uploaded; please upload a CSV file.")
CSV_PATH = next(iter(uploaded))

# Tolerant read: malformed lines are still dropped, but "warn" reports each
# one so that lost rows are visible instead of disappearing silently.
df = pd.read_csv(CSV_PATH, engine="python", on_bad_lines="warn")
print("Loaded:", df.shape, "rows x cols")

# 3) Column roles: the first four columns are the model inputs; every later
#    column whose dtype is numeric becomes a prediction target.
feature_cols = [*df.columns[:4]]
candidate_targets = df.columns[4:]
target_cols = list(
    filter(lambda c: pd.api.types.is_numeric_dtype(df[c]), candidate_targets)
)

# Without at least one numeric target there is nothing to train on.
if not target_cols:
    raise ValueError("No numeric target columns detected after the first 4 columns.")

print("Features:", feature_cols)
print("Targets :", target_cols)

# 4) Coerce features/targets to numeric and report missing values
# errors="coerce" turns every unparseable entry into NaN. Nothing is imputed
# or dropped here, so NaN values (including NaN targets) remain in df.
df[feature_cols]   = df[feature_cols].apply(pd.to_numeric, errors="coerce")
# Targets were selected by numeric dtype already, so this is only a safeguard.
df[target_cols]    = df[target_cols].apply(pd.to_numeric, errors="coerce")

# Make missing values visible instead of letting them pass silently.
nan_counts = df[feature_cols + target_cols].isna().sum()
nan_counts = nan_counts[nan_counts > 0]
if not nan_counts.empty:
    print("Columns containing NaN after numeric coercion (count per column):")
    print(nan_counts.to_string())

# Feature matrix as a NumPy array plus the row positions used for splitting.
X = df[feature_cols].values
idx_all = np.arange(len(df))

# 5) One shared 80/20 partition of the row positions (fixed seed), so every
#    target is trained and evaluated on exactly the same rows.
idx_train, idx_test = train_test_split(idx_all, random_state=42, test_size=0.2)
X_train = X[idx_train]
X_test = X[idx_test]

# 6) Train CatBoost for each target
results = []        # one {"Target", "MSE", "R2"} row per trained target
preds_store = {}    # target name -> (y_test, y_pred) on the held-out test rows

cat_params = dict(
    loss_function="RMSE",
    depth=8,
    learning_rate=0.05,
    n_estimators=3000,
    random_seed=42,
    eval_metric="RMSE",
    od_type="Iter",          # early stopping
    od_wait=200,             # patience
    verbose=False,
    task_type="CPU"          # change to "GPU" if Colab GPU is enabled
)

# Early stopping and best-iteration selection must not look at the test rows:
# choosing the iteration count on the same data that is scored afterwards
# makes the reported test metrics optimistically biased. Hold out a validation
# subset of the training indices for that purpose; the test rows are used
# only for the final MSE / R2.
idx_fit, idx_val = train_test_split(idx_train, test_size=0.2, random_state=42)

for col in target_cols:
    y = df[col].values.astype(float)

    # Rows with a missing label can be used neither for fitting nor scoring.
    has_label = ~np.isnan(y)
    fit_rows = idx_fit[has_label[idx_fit]]
    val_rows = idx_val[has_label[idx_val]]
    test_rows = idx_test[has_label[idx_test]]
    if min(len(fit_rows), len(val_rows), len(test_rows)) == 0:
        print(f"Skipping target {col}: no labelled rows in the fit, validation or test split.")
        print("-"*50)
        continue

    y_test = y[test_rows]

    train_pool = Pool(X[fit_rows], y[fit_rows])
    valid_pool = Pool(X[val_rows], y[val_rows])

    model = CatBoostRegressor(**cat_params)
    # use_best_model keeps the iteration that scored best on the validation pool.
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    y_pred = model.predict(X[test_rows])

    mse = mean_squared_error(y_test, y_pred)
    r2  = r2_score(y_test, y_pred)

    print(f"Target: {col}")
    print("Mean Squared Error (MSE):", mse)
    print("R² Score:", r2)
    print("-"*50)

    results.append({"Target": col, "MSE": mse, "R2": r2})
    preds_store[col] = (y_test, y_pred)

# Fail early with a clear message rather than later on an empty metrics table.
if not results:
    raise ValueError("No target could be trained: every target lacks labelled rows in at least one split.")

# 7) Metrics overview: best R2 first, shown in the notebook and written to CSV
from IPython.display import display

metrics_df = pd.DataFrame(results)
metrics_df = metrics_df.sort_values("R2", ascending=False)
metrics_df = metrics_df.reset_index(drop=True)

display(metrics_df)
metrics_df.to_csv("/content/catboost_metrics.csv", index=False)
print("Saved: /content/catboost_metrics.csv")

# 8) (Optional) Feature importance for one example target
# Importances belong to a fitted model and therefore to the target it was
# trained on; they are NOT shared across targets even though the feature
# columns are. The table below describes the first target only.
# Retrain one quick model on the first target to show importances
if target_cols:
    col0 = target_cols[0]
    y0 = df[col0].values.astype(float)
    model0 = CatBoostRegressor(**cat_params)
    # The eval set only drives early stopping / best-iteration selection here;
    # no metric is reported from this model.
    model0.fit(Pool(X_train, y0[idx_train]), eval_set=Pool(X_test, y0[idx_test]), use_best_model=True, verbose=False)
    # Label each importance value with its feature name, largest first.
    fi = pd.Series(model0.get_feature_importance(), index=feature_cols, name="Importance").sort_values(ascending=False)
    print("\nFeature importance (example target:", col0, ")")
    display(fi.to_frame())


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

Saving DATE_400_micro_m_info.csv to DATE_400_micro_m_info.csv
Loaded: (14995, 10) rows x cols
Features: ['Gamma', 'NQW', 'kappa', 'I']
Targets : ['Pmax', 'FWHM', 'Etotal', 'Epulse', 'Coeff', 'Delay']
Target: Pmax
Mean Squared Error (MSE): 0.01688950705161953
R² Score: 0.9632977052850867
--------------------------------------------------
Target: FWHM
Mean Squared Error (MSE): 2.834621341343513
R² Score: 0.8304057903922913
--------------------------------------------------
Target: Etotal
Mean Squared Error (MSE): 0.17638645550239346
R² Score: 0.9987480139873881
--------------------------------------------------
Target: Epulse
Mean Squared Error (MSE): 2.3619544522615783
R² Score: 0.9595427323080702
--------------------------------------------------
Target: Coeff
Mean Squared Error (MSE): 0.0037312916237746564
R² Score: 0.9068320158742379
--------------------------------------------------
Target: Delay
Mean Squared Error (MSE): 5.486822059809525e-05
R² Score: 0.9609806046645161
--------------------------------------------------

Unnamed: 0,Target,MSE,R2
0,Etotal,0.176386,0.998748
1,Pmax,0.01689,0.963298
2,Delay,5.5e-05,0.960981
3,Epulse,2.361954,0.959543
4,Coeff,0.003731,0.906832
5,FWHM,2.834621,0.830406


Saved: /content/catboost_metrics.csv

Feature importance (example target: Pmax )


Unnamed: 0,Importance
I,59.033925
kappa,19.463842
Gamma,12.038524
NQW,9.463708
