<a href="https://colab.research.google.com/github/Lee2008-cyber/R2_Score/blob/main/R2_Score(Kaggle).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Logs this session in to Kaggle before any data source is downloaded.
# NOTE(review): presumably this asks for Kaggle credentials interactively --
# confirm against the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data through kagglehub and keep the returned value in a
# module-level variable so later cells can refer to it.
# NOTE(review): presumably the returned value is the local directory holding
# train.csv / test.csv -- verify against the kagglehub documentation.
thapar_summer_school_2025_hack_iii_path = kagglehub.competition_download('thapar-summer-school-2025-hack-iii')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below /kaggle/input,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

# Load data
import os  # imported here so this cell does not depend on the file-listing cell

# On Kaggle the competition files are mounted under /kaggle/input. In other
# environments (e.g. Colab) that mount does not exist, so fall back to the
# location returned by the kagglehub download cell when that cell has been run.
# If neither is available the Kaggle path is kept and read_csv raises a
# FileNotFoundError naming the missing file.
data_dir = "/kaggle/input/thapar-summer-school-2025-hack-iii"
if not os.path.isdir(data_dir):
    data_dir = globals().get("thapar_summer_school_2025_hack_iii_path", data_dir)

train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))

# Column names used below to separate identifiers and the target from features.
train_id_col = "Row#"
test_id_col = "id"
target_col = "output"

# Feature engineering
def engineer_features(df):
    """Return a copy of ``df`` with five derived feature columns added.

    The input frame is not modified. The added columns are:

    * ``UpperTempRange``  -- ``MaxOfUpperTRange - MinOfUpperTRange``
    * ``LowerTempRange``  -- ``MaxOfLowerTRange - MinOfLowerTRange``
    * ``RainInteraction`` -- ``RainingDays * AverageRainingDays``
    * ``Pollination``     -- ``honeybee + bumbles + andrena + osmia``
    * ``SizeMassRatio``   -- ``fruitmass / (clonesize + 1e-5)``

    Raises ``KeyError`` if any of the source columns is missing.
    """
    out = df.copy()

    # The 1e-5 offset keeps the denominator non-zero when clonesize is 0.
    safe_clonesize = out["clonesize"] + 1e-5

    derived = {
        "UpperTempRange": out["MaxOfUpperTRange"].sub(out["MinOfUpperTRange"]),
        "LowerTempRange": out["MaxOfLowerTRange"].sub(out["MinOfLowerTRange"]),
        "RainInteraction": out["RainingDays"].mul(out["AverageRainingDays"]),
        "Pollination": (
            out["honeybee"] + out["bumbles"] + out["andrena"] + out["osmia"]
        ),
        "SizeMassRatio": out["fruitmass"].div(safe_clonesize),
    }
    # Dicts preserve insertion order, so columns are appended in the order above.
    for column, values in derived.items():
        out[column] = values
    return out

# Add the engineered columns to both data sets.
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Training matrix: every column except the row identifier and the target.
X = train_df.drop(columns=[train_id_col, target_col])
y = train_df[target_col]

# Select the test columns by the training column list instead of dropping
# identifier columns by name. This guarantees the test matrix has exactly the
# training features in the same order, and does not fail merely because the
# test file lacks the training identifier column. A KeyError here means the
# test file is missing a feature the models are trained on.
X_test = test_df[X.columns]

# Single source of truth for the fold count: it sizes both the splitter and
# the per-fold test-prediction buffers, so changing it cannot desynchronise them.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store base model oof/test predictions
# Out-of-fold buffers hold one prediction per training row for each base model.
oof_lgb, oof_cat, oof_xgb = (np.zeros(len(X)) for _ in range(3))
# Test buffers hold one column per fold; columns are averaged after the loop.
test_lgb, test_cat, test_xgb = (
    np.zeros((len(X_test), n_splits)) for _ in range(3)
)

# LightGBM parameters are identical for every fold, so define them once.
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.02,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.75,
    "bagging_freq": 5,
    "verbosity": -1,
    "seed": 42,
}

# For each fold: fit the three base models on the training part, store their
# predictions for the held-out rows (out-of-fold) and for the full test set.
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f" Fold {fold + 1}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # LightGBM: up to 10000 rounds, stopped early after 100 rounds without
    # improvement on the validation fold; predict with the best iteration.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_val],
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=0),
        ],
    )
    oof_lgb[val_idx] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
    test_lgb[:, fold] = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

    # CatBoost: same early-stopping patience, evaluated on the validation fold.
    cat_model = CatBoostRegressor(
        iterations=2000,
        learning_rate=0.03,
        depth=6,
        loss_function='RMSE',
        verbose=0,
        random_seed=42,
    )
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    oof_cat[val_idx] = cat_model.predict(X_val)
    test_cat[:, fold] = cat_model.predict(X_test)

    # XGBoost: early_stopping_rounds is configured on the estimator. Passing it
    # to fit() was deprecated in XGBoost 1.6 and removed in 2.0, where it
    # raises a TypeError.
    xgb_model = xgb.XGBRegressor(
        n_estimators=2000,
        learning_rate=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=0,
        early_stopping_rounds=100,
    )
    xgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )
    oof_xgb[val_idx] = xgb_model.predict(X_val)
    test_xgb[:, fold] = xgb_model.predict(X_test)

# Build meta features
# One column per base model. Training rows use the out-of-fold predictions;
# test rows use the mean of the per-fold test predictions.
base_model_names = ("lgb", "cat", "xgb")

oof_meta = pd.DataFrame(
    dict(zip(base_model_names, (oof_lgb, oof_cat, oof_xgb)))
)
test_meta = pd.DataFrame(
    {
        name: fold_preds.mean(axis=1)
        for name, fold_preds in zip(base_model_names, (test_lgb, test_cat, test_xgb))
    }
)

# Train meta-model
# Ridge regression over the three base-model predictions; RidgeCV picks the
# penalty from 100 log-spaced candidates between 1e-5 and 1e3.
meta_model = RidgeCV(alphas=np.logspace(start=-5, stop=3, num=100))
meta_model.fit(oof_meta, y)

# The R² below is computed on the same out-of-fold rows the ridge was fitted
# on, so it is a slightly optimistic estimate of the blend's performance.
oof_blend = meta_model.predict(oof_meta)
cv_r2 = r2_score(y, oof_blend)

# Final blended predictions for the test set.
meta_preds = meta_model.predict(test_meta)

print(f"\n Stacking Ridge CV R² Score: {cv_r2:.5f}")
print(f"Ridge Weights: {meta_model.coef_}")

# Save submission
# Two columns: the test identifier, then the blended prediction under the
# target column name. Written to the working directory without the index.
submission = pd.DataFrame({test_id_col: test_df[test_id_col]})
submission[target_col] = meta_preds

submission.to_csv("submission.csv", index=False)
print("submission.csv saved")
