<a href="https://colab.research.google.com/github/NOTGOD6000/NOTGOD6000/blob/main/untitled19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
# Load data
X = train.drop(columns=['target'])
y = train['target']

# Feature Engineering
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# Train-validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

# Parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

# Training
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)
    ]
)

# Prediction and Evaluation
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
print("LightGBM Validation R2 with feature engineering:", r2_score(y_valid, y_pred))


Training until validation scores don't improve for 50 rounds
[50]	train's rmse: 4.64226	valid's rmse: 5.15941
[100]	train's rmse: 3.93122	valid's rmse: 4.96798
[150]	train's rmse: 3.43076	valid's rmse: 4.90059
[200]	train's rmse: 3.0314	valid's rmse: 4.8734
[250]	train's rmse: 2.69365	valid's rmse: 4.84962
Early stopping, best iteration is:
[239]	train's rmse: 2.76226	valid's rmse: 4.84616
LightGBM Validation R2 with feature engineering: 0.44016539694931933


In [6]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load your train.csv
train = pd.read_csv('train.csv')
X = train.drop(columns=['target'])
y = train['target']

# Feature Engineering
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# Log-transform the target
y_log = np.log1p(y)

# Split
X_train, X_valid, y_train_log, y_valid = train_test_split(X, y_log, test_size=0.2, random_state=42)

# LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train_log)
valid_data = lgb.Dataset(X_valid, label=np.log1p(y_valid))

# Parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

# Train
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)
    ]
)

# Predict and inverse log
y_pred_log = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred = np.expm1(y_pred_log)

# R2 Score
print("Log-transformed LightGBM R2:", r2_score(np.expm1(y_valid), y_pred))


Training until validation scores don't improve for 50 rounds
[50]	train's rmse: 0.0307086	valid's rmse: 3.22594
Early stopping, best iteration is:
[1]	train's rmse: 0.0438801	valid's rmse: 3.22535
Log-transformed LightGBM R2: 0.020977588270903236


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
train = pd.read_csv("train.csv")
X = train.drop(columns=['target'])
y = train['target']

# Feature engineering (same as before)
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# Train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Fast param grid
param_dist = {
    'num_leaves': [20, 40],
    'max_depth': [5, 10],
    'learning_rate': [0.03, 0.05],
    'n_estimators': [100, 200],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

# LightGBM Regressor
lgb_model = LGBMRegressor(random_state=42, n_jobs=-1)

# Faster RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=6,           # small number of combinations
    cv=2,               # faster cross-validation
    scoring='r2',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit model
random_search.fit(X_train, y_train)

# Evaluate
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_valid)

print("Fast Tuned LightGBM R2:", r2_score(y_valid, y_pred))
print("Best Params:", random_search.best_params_)


Fitting 2 folds for each of 6 candidates, totalling 12 fits
Fast Tuned LightGBM R2: 0.4193542365673183
Best Params: {'subsample': 0.8, 'num_leaves': 20, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.8}


In [8]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [9]:
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load data
train = pd.read_csv("train.csv")
X = train.drop(columns=['target'])
y = train['target']

# Feature engineering
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. LightGBM with best params
lgb_model = lgb.LGBMRegressor(
    subsample=0.8,
    num_leaves=20,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.05,
    colsample_bytree=0.8,
    random_state=42
)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_valid)

# 2. CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    early_stopping_rounds=50,
    verbose=0,
    random_state=42
)
cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
cat_pred = cat_model.predict(X_valid)

# 3. Random Forest
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_valid)

# Blend predictions (simple average)
final_pred = (lgb_pred + cat_pred + rf_pred) / 3

# Evaluate
print("Blended R2 score:", r2_score(y_valid, final_pred))


Blended R2 score: 0.4322006457962714


In [10]:
# A: LGBM heavy
final_pred_A = 0.7 * lgb_pred + 0.2 * cat_pred + 0.1 * rf_pred
print("A: 70% LGBM, 20% Cat, 10% RF R2:", r2_score(y_valid, final_pred_A))

# B: LGBM + Cat only
final_pred_B = 0.6 * lgb_pred + 0.4 * cat_pred
print("B: 60% LGBM, 40% Cat R2:", r2_score(y_valid, final_pred_B))

# C: Equal strong models
final_pred_C = 0.5 * lgb_pred + 0.5 * cat_pred
print("C: 50% LGBM, 50% Cat R2:", r2_score(y_valid, final_pred_C))

# D: CatBoost heavy
final_pred_D = 0.4 * lgb_pred + 0.5 * cat_pred + 0.1 * rf_pred
print("D: 40% LGBM, 50% Cat, 10% RF R2:", r2_score(y_valid, final_pred_D))

# E: LGBM only (best single model)
print("E: 100% LGBM R2:", r2_score(y_valid, lgb_pred))


A: 70% LGBM, 20% Cat, 10% RF R2: 0.4318064648933103
B: 60% LGBM, 40% Cat R2: 0.44334896689371717
C: 50% LGBM, 50% Cat R2: 0.447471750530551
D: 40% LGBM, 50% Cat, 10% RF R2: 0.44632283899384273
E: 100% LGBM R2: 0.4193542365673183


In [11]:
# Load test set
test = pd.read_csv("test.csv")
X_test = test.drop(columns=['id']).copy()

# Apply same feature engineering
X_test['row_mean'] = X_test.mean(axis=1)
X_test['row_std'] = X_test.std(axis=1)
X_test['row_max'] = X_test.max(axis=1)
X_test['row_min'] = X_test.min(axis=1)
X_test['row_range'] = X_test['row_max'] - X_test['row_min']

# Predict using LightGBM and CatBoost only
lgb_test_pred = lgb_model.predict(X_test)
cat_test_pred = cat_model.predict(X_test)

# Best blend
final_test_pred = 0.5 * lgb_test_pred + 0.5 * cat_test_pred

# Save to CSV
submission = pd.DataFrame({
    'id': test['id'],
    'target': final_test_pred
})
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv created with R² optimized blend.")


✅ submission.csv created with R² optimized blend.


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import lightgbm as lgb

# Load data
train = pd.read_csv("train.csv")
X = train.drop(columns=['target'])
y = train['target']

# Base features
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# New feature interactions
X['f1_f2'] = X['f1'] * X['f2']
X['f3_div_f4'] = X['f3'] / (X['f4'] + 1e-5)
X['f5_plus_f6'] = X['f5'] + X['f6']
X['f7_minus_f8'] = X['f7'] - X['f8']
X['f9_times_f10'] = X['f9'] * X['f10']

# Train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Best LGBM config
model = lgb.LGBMRegressor(
    subsample=0.8,
    num_leaves=20,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.05,
    colsample_bytree=0.8,
    random_state=42
)

# Train and evaluate
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
print("R2 with interaction features:", r2_score(y_valid, y_pred))


R2 with interaction features: 0.4206081310820652


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Load and split data
train = pd.read_csv("train.csv")
X = train.drop(columns=['target'])
y = train['target']

# Feature engineering
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base models
lgb_model = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05,
                          n_estimators=200, subsample=0.8, colsample_bytree=0.8, random_state=42)

cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                              loss_function='RMSE', early_stopping_rounds=50, verbose=0, random_state=42)

rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)

# Train base models on training set
lgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
rf_model.fit(X_train, y_train)

# Get base model predictions for validation set
lgb_val_pred = lgb_model.predict(X_valid)
cat_val_pred = cat_model.predict(X_valid)
rf_val_pred = rf_model.predict(X_valid)

# Stack base predictions into new feature matrix
stacked_val = np.column_stack([lgb_val_pred, cat_val_pred, rf_val_pred])

# Train meta-model (Ridge regression) on stacked predictions
meta_model = Ridge(alpha=1.0)
meta_model.fit(stacked_val, y_valid)

# Final stacked prediction
meta_pred = meta_model.predict(stacked_val)
print("Stacked Ridge Meta-Model R2:", r2_score(y_valid, meta_pred))


Stacked Ridge Meta-Model R2: 0.4597593502268543


In [16]:
# Load test set
test = pd.read_csv("test.csv")
X_test = test.drop(columns=['id']).copy()

# Same feature engineering
X_test['row_mean'] = X_test.mean(axis=1)
X_test['row_std'] = X_test.std(axis=1)
X_test['row_max'] = X_test.max(axis=1)
X_test['row_min'] = X_test.min(axis=1)
X_test['row_range'] = X_test['row_max'] - X_test['row_min']

# Base model predictions on test set
lgb_test_pred = lgb_model.predict(X_test)
cat_test_pred = cat_model.predict(X_test)
rf_test_pred = rf_model.predict(X_test)

# Stack predictions for meta-model
stacked_test = np.column_stack([lgb_test_pred, cat_test_pred, rf_test_pred])
final_test_pred = meta_model.predict(stacked_test)

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'target': final_test_pred
})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created using stacked model (R² = 0.4598)")


✅ submission.csv created using stacked model (R² = 0.4598)


In [17]:
! pip install xgboost




In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load and prepare data
train = pd.read_csv("train.csv")
X = train.drop(columns=['target'])
y = train['target']

# Feature engineering
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
lgb_model = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05,
                          n_estimators=200, subsample=0.8, colsample_bytree=0.8, random_state=42)

cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                              loss_function='RMSE', early_stopping_rounds=50, verbose=0, random_state=42)

rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)

# Fit base models
lgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
rf_model.fit(X_train, y_train)

# Get base predictions
lgb_val_pred = lgb_model.predict(X_valid)
cat_val_pred = cat_model.predict(X_valid)
rf_val_pred = rf_model.predict(X_valid)

# Stack base predictions
stacked_val = np.column_stack([lgb_val_pred, cat_val_pred, rf_val_pred])

# Train XGBoost as meta-model
xgb_meta = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
xgb_meta.fit(stacked_val, y_valid)

# Final prediction
xgb_stack_pred = xgb_meta.predict(stacked_val)
print("XGBoost Meta-Model R2:", r2_score(y_valid, xgb_stack_pred))


XGBoost Meta-Model R2: 0.5914836756491184


In [20]:
# Load test data
test = pd.read_csv("test.csv")
X_test = test.drop(columns=['id']).copy()

# Apply same feature engineering
X_test['row_mean'] = X_test.mean(axis=1)
X_test['row_std'] = X_test.std(axis=1)
X_test['row_max'] = X_test.max(axis=1)
X_test['row_min'] = X_test.min(axis=1)
X_test['row_range'] = X_test['row_max'] - X_test['row_min']

# Base model predictions on test set
lgb_test_pred = lgb_model.predict(X_test)
cat_test_pred = cat_model.predict(X_test)
rf_test_pred = rf_model.predict(X_test)

# Stack predictions for XGB meta-model
stacked_test = np.column_stack([lgb_test_pred, cat_test_pred, rf_test_pred])
final_test_pred = xgb_meta.predict(stacked_test)

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'target': final_test_pred
})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created with XGBoost stacked model (R² = 0.5915)")


✅ submission.csv created with XGBoost stacked model (R² = 0.5915)


In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Feature engineering
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# K-Fold stacking setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgb = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05,
                        n_estimators=200, subsample=0.8, colsample_bytree=0.8, random_state=42)
    cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                            loss_function='RMSE', early_stopping_rounds=50, verbose=0, random_state=42)
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)

    lgb.fit(X_tr, y_tr)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    rf.fit(X_tr, y_tr)

    oof_preds[val_idx, 0] = lgb.predict(X_val)
    oof_preds[val_idx, 1] = cat.predict(X_val)
    oof_preds[val_idx, 2] = rf.predict(X_val)

    test_preds[:, 0] += lgb.predict(X_test) / 5
    test_preds[:, 1] += cat.predict(X_test) / 5
    test_preds[:, 2] += rf.predict(X_test) / 5

# Train XGBoost meta-model
meta = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
meta.fit(oof_preds, y)
final_test_pred = meta.predict(test_preds)

# Evaluate OOF R2
print("Final OOF R2:", r2_score(y, meta.predict(oof_preds)))

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_test_pred
})
submission.to_csv("submission_kfold_stack.csv", index=False)
print("✅ submission_kfold_stack.csv saved!")


Final OOF R2: 0.47960563001360235
✅ submission_kfold_stack.csv saved!


In [22]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Feature engineering
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# Log-transform the target
y_log = np.log1p(y)

# K-Fold setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    lgb = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05,
                        n_estimators=200, subsample=0.8, colsample_bytree=0.8, random_state=42)
    cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                            loss_function='RMSE', early_stopping_rounds=50, verbose=0, random_state=42)
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)

    lgb.fit(X_tr, y_tr)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    rf.fit(X_tr, y_tr)

    oof_preds[val_idx, 0] = lgb.predict(X_val)
    oof_preds[val_idx, 1] = cat.predict(X_val)
    oof_preds[val_idx, 2] = rf.predict(X_val)

    test_preds[:, 0] += lgb.predict(X_test) / 5
    test_preds[:, 1] += cat.predict(X_test) / 5
    test_preds[:, 2] += rf.predict(X_test) / 5

# Meta-model on log-space
meta = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
meta.fit(oof_preds, y_log)

# Predict and reverse transform
stacked_log_pred = meta.predict(test_preds)
stacked_pred = np.expm1(stacked_log_pred)

# Blend with base model (e.g., LightGBM)
final_pred = 0.6 * stacked_pred + 0.4 * np.expm1(test_preds[:, 0])

# Save to submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_pred
})
submission.to_csv("submission_log_blend.csv", index=False)

# Evaluate local log-space R²
print("OOF R² in log-space:", r2_score(y_log, meta.predict(oof_preds)))


OOF R² in log-space: 0.48435472014489167


In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Feature engineering
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# PCA
pca = PCA(n_components=20)  # or use X.shape[1] - 1 for dynamic sizing
X_pca = pca.fit_transform(X)
X_test_pca = pca.transform(X_test)

# Prepare OOF and test containers
oof_preds = np.zeros((X.shape[0], 5))
test_preds = np.zeros((X_test.shape[0], 5))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_pca)):
    X_tr, X_val = X_pca[train_idx], X_pca[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Models
    lgb = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05, n_estimators=200)
    cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, verbose=0, early_stopping_rounds=50)
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, n_jobs=-1)
    hgb = HistGradientBoostingRegressor(max_iter=200, learning_rate=0.05, max_depth=10)
    xgb = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5)

    # Train
    lgb.fit(X_tr, y_tr)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    rf.fit(X_tr, y_tr)
    hgb.fit(X_tr, y_tr)
    xgb.fit(X_tr, y_tr)

    # OOF
    oof_preds[val_idx, 0] = lgb.predict(X_val)
    oof_preds[val_idx, 1] = cat.predict(X_val)
    oof_preds[val_idx, 2] = rf.predict(X_val)
    oof_preds[val_idx, 3] = hgb.predict(X_val)
    oof_preds[val_idx, 4] = xgb.predict(X_val)

    # Test preds (average)
    test_preds[:, 0] += lgb.predict(X_test_pca) / 5
    test_preds[:, 1] += cat.predict(X_test_pca) / 5
    test_preds[:, 2] += rf.predict(X_test_pca) / 5
    test_preds[:, 3] += hgb.predict(X_test_pca) / 5
    test_preds[:, 4] += xgb.predict(X_test_pca) / 5

# Meta-model
meta = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3)
meta.fit(oof_preds, y)
final_test_pred = meta.predict(test_preds)

# Evaluate
print("Final OOF R²:", r2_score(y, meta.predict(oof_preds)))

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_test_pred
})
submission.to_csv("submission_advanced_stack.csv", index=False)




Final OOF R²: 0.4255236342502451


In [25]:
pip install optuna


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [26]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import pandas as pd

# Objective function
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "random_state": 42
    }
    model = XGBRegressor(**params)
    scores = cross_val_score(model, oof_preds, y, cv=5, scoring="r2", n_jobs=-1)
    return scores.mean()

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best meta-model
best_params = study.best_params
best_params["random_state"] = 42
meta_model = XGBRegressor(**best_params)
meta_model.fit(oof_preds, y)

# Predict on test stacked predictions
final_pred = meta_model.predict(test_preds)

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_pred
})
submission.to_csv("submission_optuna_xgb_meta.csv", index=False)

# Report
print("Best R2:", r2_score(y, meta_model.predict(oof_preds)))
print("Best Params:", best_params)


[I 2025-06-21 17:57:56,799] A new study created in memory with name: no-name-03ec96c0-7f17-4445-860a-94ca5632ddee
[I 2025-06-21 17:58:02,320] Trial 0 finished with value: 0.3547865191593881 and parameters: {'n_estimators': 199, 'max_depth': 4, 'learning_rate': 0.04752400619780795, 'subsample': 0.6436466628884236, 'colsample_bytree': 0.8899552397579427, 'gamma': 2.226781736794661, 'reg_alpha': 2.038016132256986, 'reg_lambda': 4.961239630298745}. Best is trial 0 with value: 0.3547865191593881.
[I 2025-06-21 17:58:20,663] Trial 1 finished with value: 0.31481196604773876 and parameters: {'n_estimators': 441, 'max_depth': 10, 'learning_rate': 0.015777133737404893, 'subsample': 0.8106581079085065, 'colsample_bytree': 0.9846080829876719, 'gamma': 3.8494408466972088, 'reg_alpha': 0.05289859285011189, 'reg_lambda': 0.8489606980213493}. Best is trial 0 with value: 0.3547865191593881.
[I 2025-06-21 17:58:21,181] Trial 2 finished with value: 0.298401209283078 and parameters: {'n_estimators': 211, 

Best R2: 0.3858450235795122
Best Params: {'n_estimators': 162, 'max_depth': 2, 'learning_rate': 0.03346348192058747, 'subsample': 0.7512917273699846, 'colsample_bytree': 0.932634569390918, 'gamma': 1.2002031316467865, 'reg_alpha': 1.633179472235233, 'reg_lambda': 3.3911345511867754, 'random_state': 42}


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Feature engineering
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# Setup for stacking
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0], 3))       # LGBM, CatBoost, RF
test_preds = np.zeros((X_test.shape[0], 3)) # same

# K-Fold stacking
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Base models
    lgb = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05, n_estimators=200, random_state=42)
    cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                            loss_function='RMSE', early_stopping_rounds=50, verbose=0, random_state=42)
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)

    # Fit
    lgb.fit(X_tr, y_tr)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    rf.fit(X_tr, y_tr)

    # OOF preds
    oof_preds[val_idx, 0] = lgb.predict(X_val)
    oof_preds[val_idx, 1] = cat.predict(X_val)
    oof_preds[val_idx, 2] = rf.predict(X_val)

    # Test preds
    test_preds[:, 0] += lgb.predict(X_test) / n_splits
    test_preds[:, 1] += cat.predict(X_test) / n_splits
    test_preds[:, 2] += rf.predict(X_test) / n_splits

# Meta-model
meta = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
meta.fit(oof_preds, y)

# Final prediction from stacked meta-model
stacked_pred = meta.predict(test_preds)

# Optional blend with LGBM base (adjust weights if needed)
lgb_pred = test_preds[:, 0]
final_pred = 0.6 * stacked_pred + 0.4 * lgb_pred

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_pred
})
submission.to_csv("submission_final_stack.csv", index=False)

# Local evaluation
print("Final OOF R²:", r2_score(y, meta.predict(oof_preds)))



Final OOF R²: 0.47734974239954475


In [28]:
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(score_func=f_regression, k=25)
X_new = selector.fit_transform(X, y)


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Feature engineering
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# Feature selection (top 25)
selector = SelectKBest(score_func=f_regression, k=25)
X_sel = selector.fit_transform(X, y)
X_test_sel = selector.transform(X_test)

# Stacking
oof_preds = np.zeros((X.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_sel)):
    X_tr, X_val = X_sel[train_idx], X_sel[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgb = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05, n_estimators=200)
    cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, verbose=0, early_stopping_rounds=50)
    rf = RandomForestRegressor(n_estimators=200, max_depth=10)

    lgb.fit(X_tr, y_tr)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    rf.fit(X_tr, y_tr)

    oof_preds[val_idx, 0] = lgb.predict(X_val)
    oof_preds[val_idx, 1] = cat.predict(X_val)
    oof_preds[val_idx, 2] = rf.predict(X_val)

    test_preds[:, 0] += lgb.predict(X_test_sel) / 5
    test_preds[:, 1] += cat.predict(X_test_sel) / 5
    test_preds[:, 2] += rf.predict(X_test_sel) / 5

# Meta-model
meta = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3)
meta.fit(oof_preds, y)
stacked_pred = meta.predict(test_preds)

# Optional blend
final_pred = 0.6 * stacked_pred + 0.4 * test_preds[:, 0]

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_pred
})
submission.to_csv("submission_selected_stack.csv", index=False)

# Local R2
print("OOF R²:", r2_score(y, meta.predict(oof_preds)))





OOF R²: 0.47328428216636065


In [31]:
import pandas as pd
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

# Train meta-models on OOF predictions
ridge = Ridge(alpha=1.0)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
lgbm = LGBMRegressor(num_leaves=20, max_depth=6, learning_rate=0.05, n_estimators=200, random_state=42)

ridge.fit(oof_preds, y)
xgb.fit(oof_preds, y)
lgbm.fit(oof_preds, y)

# OOF predictions from meta-models
ridge_oof = ridge.predict(oof_preds)
xgb_oof = xgb.predict(oof_preds)
lgbm_oof = lgbm.predict(oof_preds)

# R² scores
print("Ridge OOF R²:", r2_score(y, ridge_oof))
print("XGB OOF R²:", r2_score(y, xgb_oof))
print("LGBM OOF R²:", r2_score(y, lgbm_oof))

# ✅ Ensemble OOF predictions (equal weight)
ensemble_oof_eq = (ridge_oof + xgb_oof + lgbm_oof) / 3
print("Equal Weight Ensemble OOF R²:", r2_score(y, ensemble_oof_eq))

# ✅ Weighted Ensemble (based on performance)
ensemble_oof_weighted = 0.1 * ridge_oof + 0.4 * xgb_oof + 0.5 * lgbm_oof
print("Weighted Ensemble OOF R²:", r2_score(y, ensemble_oof_weighted))

# ====== Final Test Predictions ======
ridge_test = ridge.predict(test_preds)
xgb_test = xgb.predict(test_preds)
lgbm_test = lgbm.predict(test_preds)

# Final weighted test prediction
final_pred = 0.1 * ridge_test + 0.4 * xgb_test + 0.5 * lgbm_test

# Optional blend with base LGBM
final_pred = 0.6 * final_pred + 0.4 * test_preds[:, 0]  # blend with LGBM base

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_pred
})
submission.to_csv("submission_meta_ensemble_weighted.csv", index=False)
print("✅ Saved: submission_meta_ensemble_weighted.csv")




Ridge OOF R²: 0.43693417302493875
XGB OOF R²: 0.47328428216636065
LGBM OOF R²: 0.5133638761807828
Equal Weight Ensemble OOF R²: 0.4793623615196567
Weighted Ensemble OOF R²: 0.4932606935104151
✅ Saved: submission_meta_ensemble_weighted.csv




In [32]:
# Fit meta-models on ALL OOF + y
ridge = Ridge(alpha=1.0)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
lgbm = LGBMRegressor(num_leaves=20, max_depth=6, learning_rate=0.05, n_estimators=200, random_state=42)

ridge.fit(oof_preds, y)
xgb.fit(oof_preds, y)
lgbm.fit(oof_preds, y)

# Predict on test
ridge_test = ridge.predict(test_preds)
xgb_test = xgb.predict(test_preds)
lgbm_test = lgbm.predict(test_preds)

# Final blend
final_pred = 0.1 * ridge_test + 0.4 * xgb_test + 0.5 * lgbm_test
final_pred = 0.6 * final_pred + 0.4 * test_preds[:, 0]  # Optional blend with LGBM base

# Save
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_pred
})
submission.to_csv("submission_refit_full_meta.csv", index=False)
print("✅ Saved: submission_refit_full_meta.csv")




✅ Saved: submission_refit_full_meta.csv


In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import KBinsDiscretizer

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Optional: feature engineering
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# Bin target
n_bins = 10
kbin = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
y_bins = kbin.fit_transform(y.values.reshape(-1, 1)).astype(int).ravel()

# Split for validation
X_train, X_valid, y_train, y_valid, y_bins_train, y_bins_valid = train_test_split(
    X, y, y_bins, test_size=0.2, random_state=42
)

# Train classifier
clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
clf.fit(X_train, y_bins_train)

# Train one regressor per bin
bin_regressors = {}
for b in range(n_bins):
    idx = y_bins_train == b
    reg = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    reg.fit(X_train[idx], y_train[idx])
    bin_regressors[b] = reg

# Predict on validation
bin_preds = clf.predict(X_valid)
final_preds = np.zeros_like(y_valid)

for b in range(n_bins):
    idx = bin_preds == b
    if np.any(idx):
        final_preds[idx] = bin_regressors[b].predict(X_valid[idx])

# Evaluate
print("Hybrid Model Validation R2:", r2_score(y_valid, final_preds))

# ==== Predict on test set ====
bin_test_preds = clf.predict(X_test)
test_final = np.zeros(X_test.shape[0])

for b in range(n_bins):
    idx = bin_test_preds == b
    if np.any(idx):
        test_final[idx] = bin_regressors[b].predict(X_test[idx])

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": test_final
})
submission.to_csv("submission_hybrid_quantile.csv", index=False)
print("✅ Saved: submission_hybrid_quantile.csv")


Hybrid Model Validation R2: -0.034081607010225445
✅ Saved: submission_hybrid_quantile.csv


In [34]:
# Predict probabilities for all bins
proba_valid = clf.predict_proba(X_valid)
proba_test = clf.predict_proba(X_test)

# Predict using all regressors, weighted by classifier probability
final_preds = np.zeros(len(X_valid))
test_final = np.zeros(len(X_test))

for b in range(n_bins):
    reg = bin_regressors.get(b)
    if reg:
        final_preds += proba_valid[:, b] * reg.predict(X_valid)
        test_final += proba_test[:, b] * reg.predict(X_test)

# Evaluate
from sklearn.metrics import r2_score
print("Soft Hybrid Model Validation R2:", r2_score(y_valid, final_preds))

# Save improved submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": test_final
})
submission.to_csv("submission_hybrid_soft.csv", index=False)
print("✅ Saved: submission_hybrid_soft.csv")


Soft Hybrid Model Validation R2: 0.3145977432955087
✅ Saved: submission_hybrid_soft.csv


In [35]:
sub_stack = pd.read_csv("submission_meta_ensemble_weighted.csv")
sub_soft = pd.read_csv("submission_hybrid_soft.csv")

# 70% stack + 30% soft hybrid (or try 80/20 if hybrid is noisy)
blended = sub_stack.copy()
blended["target"] = 0.7 * sub_stack["target"] + 0.3 * sub_soft["target"]

blended.to_csv("submission_final_blend.csv", index=False)
print("✅ Saved: submission_final_blend.csv")


✅ Saved: submission_final_blend.csv


In [37]:
sub1 = pd.read_csv("submission_kfold_stack.csv")         # 0.5
sub2 = pd.read_csv("submission_log_blend.csv")           # 0.3
sub3 = pd.read_csv("submission_meta_ensemble_weighted.csv")  # 0.2

blend = sub1.copy()
blend["target"] = (
    0.5 * sub1["target"] +
    0.3 * sub2["target"] +
    0.2 * sub3["target"]
)

blend.to_csv("submission_final_top3_blend.csv", index=False)
print("✅ Saved: submission_final_top3_blend.csv")


✅ Saved: submission_final_top3_blend.csv


In [39]:
import pandas as pd
import numpy as np
from itertools import product

sub1 = pd.read_csv("submission_kfold_stack.csv")
sub2 = pd.read_csv("submission_log_blend.csv")
sub3 = pd.read_csv("submission_meta_ensemble_weighted.csv")

# Try weights like (0.5, 0.3, 0.2), etc.
best_score = -np.inf
best_weights = None
best_submission = None

# Simulate weights in 0.1 steps (they must sum to 1)
for w1, w2, w3 in product(np.arange(0.1, 1.0, 0.1), repeat=3):
    if abs((w1 + w2 + w3) - 1.0) > 1e-6:
        continue
    blend = w1 * sub1["target"] + w2 * sub2["target"] + w3 * sub3["target"]

    # For Kaggle: we can't evaluate without y_true, so skip scoring here.
    # Instead, just save all blends and submit top few manually.
    fname = f"blend_{int(w1*100)}_{int(w2*100)}_{int(w3*100)}.csv"
    pd.DataFrame({
        "id": sub1["id"],
        "target": blend
    }).to_csv(fname, index=False)
