<a href="https://colab.research.google.com/github/NOTGOD6000/NOTGOD6000/blob/main/Untitled19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Read both competition tables; only `train` is consumed further down in this cell.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Separate the regression target from the predictors.
y = train['target']
X = train.drop(columns=['target'])

# Feature engineering: append row-wise summary statistics one after another.
# Each statistic is taken over X as it stands at that moment, so row_std also
# sees row_mean, row_max also sees row_mean/row_std, and so on.
# NOTE(review): if the statistics were meant to cover only the raw columns,
# compute them from a fixed column list instead -- confirm the intent.
for stat_name in ('mean', 'std', 'max', 'min'):
    X[f'row_{stat_name}'] = getattr(X, stat_name)(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap both parts in LightGBM's native dataset container.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

# Booster configuration.
params = dict(
    objective='regression',
    metric='rmse',
    learning_rate=0.05,
    num_leaves=31,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
)

# Fit up to 1000 rounds, with a 50-round early-stopping callback, logging
# the tracked metric every 50 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

# Score the hold-out rows using the best iteration found by early stopping.
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
valid_r2 = r2_score(y_valid, y_pred)
print("LightGBM Validation R2 with feature engineering:", valid_r2)


Training until validation scores don't improve for 50 rounds
[50]	train's rmse: 4.64226	valid's rmse: 5.15941
[100]	train's rmse: 3.93122	valid's rmse: 4.96798
[150]	train's rmse: 3.43076	valid's rmse: 4.90059
[200]	train's rmse: 3.0314	valid's rmse: 4.8734
[250]	train's rmse: 2.69365	valid's rmse: 4.84962
Early stopping, best iteration is:
[239]	train's rmse: 2.76226	valid's rmse: 4.84616
LightGBM Validation R2 with feature engineering: 0.44016539694931933


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load the training table and separate predictors from the target.
train = pd.read_csv('train.csv')
X = train.drop(columns=['target'])
y = train['target']

# Feature engineering: row-wise summary statistics, appended sequentially.
# Each statistic is computed over X as it stands at that point, so later
# statistics also see the summary columns added before them.
X['row_mean'] = X.mean(axis=1)
X['row_std'] = X.std(axis=1)
X['row_max'] = X.max(axis=1)
X['row_min'] = X.min(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# Split on the ORIGINAL target scale, then log-transform each part exactly once.
# The previous version split an already log-transformed target and then applied
# log1p to the validation labels a second time, so the validation metric was
# measured against double-logged labels and early stopping picked iteration 1.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)
y_log = np.log1p(y)  # kept so the full log-scale target remains available

# LightGBM datasets: both label sets are on the same log1p scale.
train_data = lgb.Dataset(X_train, label=y_train_log)
valid_data = lgb.Dataset(X_valid, label=y_valid_log)

# Parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

# Train with a 50-round early-stopping callback; the logged rmse is in log1p units.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)
    ]
)

# Predict on the log scale, then invert the transform back to target units.
y_pred_log = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred = np.expm1(y_pred_log)

# R2 is reported on the original target scale against the untouched labels.
print("Log-transformed LightGBM R2:", r2_score(y_valid, y_pred))


Training until validation scores don't improve for 50 rounds
[50]	train's rmse: 0.0307086	valid's rmse: 3.22594
Early stopping, best iteration is:
[1]	train's rmse: 0.0438801	valid's rmse: 3.22535
Log-transformed LightGBM R2: 0.020977588270903236


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training table and split it into target and predictors.
train = pd.read_csv("train.csv")
y = train['target']
X = train.drop(columns=['target'])

# Row-wise summary features, appended one at a time in the same order as the
# other cells (each statistic sees the summary columns added before it).
for stat_name in ('mean', 'std', 'max', 'min'):
    X[f'row_{stat_name}'] = getattr(X, stat_name)(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Deliberately small search space so the search finishes quickly.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is > 0, and it is not set here -- confirm whether row
# subsampling was actually intended.
param_dist = dict(
    num_leaves=[20, 40],
    max_depth=[5, 10],
    learning_rate=[0.03, 0.05],
    n_estimators=[100, 200],
    subsample=[0.8],
    colsample_bytree=[0.8],
)

# Base estimator whose hyperparameters are searched.
lgb_model = LGBMRegressor(random_state=42, n_jobs=-1)

# Six sampled candidates, each scored by 2-fold cross-validated R2.
random_search = RandomizedSearchCV(
    lgb_model,
    param_dist,
    n_iter=6,
    cv=2,
    scoring='r2',
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training part only.
random_search.fit(X_train, y_train)

# Score the best estimator returned by the search on the held-out rows.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_valid)
tuned_r2 = r2_score(y_valid, y_pred)

print("Fast Tuned LightGBM R2:", tuned_r2)
print("Best Params:", random_search.best_params_)


Fitting 2 folds for each of 6 candidates, totalling 12 fits
Fast Tuned LightGBM R2: 0.4193542365673183
Best Params: {'subsample': 0.8, 'num_leaves': 20, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.8}


In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the training table and split it into target and predictors.
train = pd.read_csv("train.csv")
y = train['target']
X = train.drop(columns=['target'])

# Row-wise summary features, appended sequentially (each statistic sees the
# summary columns added before it). The test-set cell repeats exactly this
# sequence, so the two must be kept in step.
for stat_name in ('mean', 'std', 'max', 'min'):
    X[f'row_{stat_name}'] = getattr(X, stat_name)(axis=1)
X['row_range'] = X['row_max'] - X['row_min']

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. LightGBM with the tuned hyperparameters.
lgb_settings = dict(
    subsample=0.8,
    num_leaves=20,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.05,
    colsample_bytree=0.8,
    random_state=42,
)
lgb_model = lgb.LGBMRegressor(**lgb_settings)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_valid)

# 2. CatBoost, configured with early_stopping_rounds=50; the validation split
#    is passed as its eval_set.
cat_settings = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    early_stopping_rounds=50,
    verbose=0,
    random_state=42,
)
cat_model = CatBoostRegressor(**cat_settings)
cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
cat_pred = cat_model.predict(X_valid)

# 3. Random forest.
rf_settings = dict(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_valid)

# Equal-weight average of the three validation predictions.
final_pred = (lgb_pred + cat_pred + rf_pred) / 3

# Report the blend's R2 on the validation split.
blended_r2 = r2_score(y_valid, final_pred)
print("Blended R2 score:", blended_r2)


Blended R2 score: 0.4322006457962715


In [None]:
def _weighted_blend(*weighted_preds):
    """Return the weighted sum of (weight, prediction) pairs, added left to right."""
    (first_weight, first_pred), *remaining = weighted_preds
    blended = first_weight * first_pred
    for weight, pred in remaining:
        blended = blended + weight * pred
    return blended


# Candidate blends of the validation predictions produced by the previous cell.
# A: weighted mostly towards LightGBM
final_pred_A = _weighted_blend((0.7, lgb_pred), (0.2, cat_pred), (0.1, rf_pred))
# B: LightGBM and CatBoost only, LightGBM-leaning
final_pred_B = _weighted_blend((0.6, lgb_pred), (0.4, cat_pred))
# C: LightGBM and CatBoost in equal parts
final_pred_C = _weighted_blend((0.5, lgb_pred), (0.5, cat_pred))
# D: weighted mostly towards CatBoost
final_pred_D = _weighted_blend((0.4, lgb_pred), (0.5, cat_pred), (0.1, rf_pred))

# Score every candidate on the validation split.
# E is LightGBM alone, as a single-model baseline.
blend_report = (
    ("A: 70% LGBM, 20% Cat, 10% RF R2:", final_pred_A),
    ("B: 60% LGBM, 40% Cat R2:", final_pred_B),
    ("C: 50% LGBM, 50% Cat R2:", final_pred_C),
    ("D: 40% LGBM, 50% Cat, 10% RF R2:", final_pred_D),
    ("E: 100% LGBM R2:", lgb_pred),
)
for blend_label, blend_pred in blend_report:
    print(blend_label, r2_score(y_valid, blend_pred))


A: 70% LGBM, 20% Cat, 10% RF R2: 0.4318064648933103
B: 60% LGBM, 40% Cat R2: 0.44334896689371717
C: 50% LGBM, 50% Cat R2: 0.447471750530551
D: 40% LGBM, 50% Cat, 10% RF R2: 0.44632283899384273
E: 100% LGBM R2: 0.4193542365673183


In [None]:
# Read the test table; everything except the `id` column is a model input.
test = pd.read_csv("test.csv")
X_test = test.drop(columns=['id']).copy()

# Rebuild the row-wise summary features in the same sequential order as the
# cell that fitted lgb_model and cat_model, so the columns line up.
for stat_name in ('mean', 'std', 'max', 'min'):
    X_test[f'row_{stat_name}'] = getattr(X_test, stat_name)(axis=1)
X_test['row_range'] = X_test['row_max'] - X_test['row_min']

# Score the test rows with the already-fitted LightGBM and CatBoost models
# (the random forest is not part of this blend).
lgb_test_pred = lgb_model.predict(X_test)
cat_test_pred = cat_model.predict(X_test)

# 50/50 blend of the two models.
final_test_pred = 0.5 * lgb_test_pred + 0.5 * cat_test_pred

# Assemble the id/target table and write it out without the index column.
submission_columns = {
    'id': test['id'],
    'target': final_test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv created with R² optimized blend.")


✅ submission.csv created with R² optimized blend.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["target"])
y = train["target"]
X_test = test.drop(columns=["id"])

# Feature engineering: the same sequential row statistics are applied to both
# frames, so train and test columns are built identically.
# NOTE(review): each statistic is computed over the frame as it stands, so
# row_std also sees row_mean, row_max also sees row_mean/row_std, etc. This is
# left unchanged to stay comparable with the earlier cells -- confirm intent.
for df in [X, X_test]:
    df["row_mean"] = df.mean(axis=1)
    df["row_std"] = df.std(axis=1)
    df["row_max"] = df.max(axis=1)
    df["row_min"] = df.min(axis=1)
    df["row_range"] = df["row_max"] - df["row_min"]

# K-Fold stacking setup: one column per base model (LightGBM, CatBoost, RF).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kf.get_n_splits()  # used for test averaging instead of a hard-coded 5
oof_preds = np.zeros((X.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

for train_idx, val_idx in kf.split(X):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fold-local model names: the previous `lgb` rebound the name that earlier
    # cells use for the lightgbm module.
    # NOTE(review): per the LightGBM docs, `subsample` has no effect unless
    # `subsample_freq` > 0, which is not set here -- confirm intent.
    lgb_fold = LGBMRegressor(num_leaves=20, max_depth=10, learning_rate=0.05,
                             n_estimators=200, subsample=0.8, colsample_bytree=0.8, random_state=42)
    cat_fold = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                                 loss_function='RMSE', early_stopping_rounds=50, verbose=0, random_state=42)
    rf_fold = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)

    lgb_fold.fit(X_tr, y_tr)
    # CatBoost is given the held-out fold as its eval_set (early_stopping_rounds=50).
    cat_fold.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    rf_fold.fit(X_tr, y_tr)

    # Out-of-fold predictions become the meta-model's training features.
    oof_preds[val_idx, 0] = lgb_fold.predict(X_val)
    oof_preds[val_idx, 1] = cat_fold.predict(X_val)
    oof_preds[val_idx, 2] = rf_fold.predict(X_val)

    # Test predictions are averaged over the folds.
    test_preds[:, 0] += lgb_fold.predict(X_test) / n_folds
    test_preds[:, 1] += cat_fold.predict(X_test) / n_folds
    test_preds[:, 2] += rf_fold.predict(X_test) / n_folds

# Train XGBoost meta-model on all out-of-fold base predictions.
meta_params = dict(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42)
meta = XGBRegressor(**meta_params)
meta.fit(oof_preds, y)
final_test_pred = meta.predict(test_preds)

# Evaluate OOF R2 honestly: scoring `meta` on the rows it was fitted on is an
# in-sample number, so the meta-model is cross-validated with the same splitter
# and only its held-out predictions are scored.
meta_oof_pred = np.zeros(X.shape[0])
for meta_train_idx, meta_val_idx in kf.split(oof_preds):
    fold_meta = XGBRegressor(**meta_params)
    fold_meta.fit(oof_preds[meta_train_idx], y.iloc[meta_train_idx])
    meta_oof_pred[meta_val_idx] = fold_meta.predict(oof_preds[meta_val_idx])
print("Final OOF R2:", r2_score(y, meta_oof_pred))

# Save submission
submission = pd.DataFrame({
    "id": test["id"],
    "target": final_test_pred
})
submission.to_csv("submission_kfold_stack.csv", index=False)
print("✅ submission_kfold_stack.csv saved!")
