In [None]:
# Install necessary libraries in Colab
!pip install lightgbm catboost xgboost optuna

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error
from google.colab import files
import optuna

# Load data from Google Drive links
def _drive_download_url(share_url):
    """Build a direct-download URL from a Drive share link.

    The file id is taken as the second-to-last '/'-separated segment of
    ``share_url`` and appended to the ``uc?export=download`` endpoint.
    """
    file_id = share_url.split('/')[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id


train_url = "https://drive.google.com/file/d/1c3XaTRGyN9Cy2Ffnb26DlfZowi3zxE2T/view?usp=sharing"
test_url = "https://drive.google.com/file/d/1-fP60UWTyCb45r7vBrPOkU_LrE0TTn-Q/view?usp=sharing"

# Resolve both share links first, then read each CSV straight from its URL.
train_path = _drive_download_url(train_url)
test_path = _drive_download_url(test_url)
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Separate target and features
# The target column is removed from train_data in place and modelled as log1p(price).
sale_price = train_data.pop("SalePrice")
y_train = np.log1p(sale_price)

# "Id" is excluded from the training features; for the test set it is popped
# (removed in place) and kept separately.
X_train = train_data.drop(columns="Id")
id_column = test_data.pop('Id')

# Feature Engineering
class FeatureEngineering:
    """Stateless transformer that appends derived house features to a DataFrame.

    Provides the ``fit``/``transform`` pair so it can be used as a pipeline
    step. ``transform`` works on a copy and adds five columns:

    * ``TotalSF``: ``1stFlrSF + 2ndFlrSF + TotalBsmtSF``
    * ``TotalBathrooms``: full baths plus half baths weighted 0.5, above
      ground and basement
    * ``HouseAge``: ``YrSold - YearBuilt``
    * ``GrLivArea_OverallQual``: ``GrLivArea * OverallQual``
    * ``GrLivArea_log``: ``log1p(GrLivArea)``

    The two aggregate sums skip missing components, so a single NaN does not
    discard the components that are known; an aggregate is NaN only when
    every one of its components is missing.
    """

    # Columns added together to form TotalSF.
    _AREA_COLUMNS = ('1stFlrSF', '2ndFlrSF', 'TotalBsmtSF')

    # Bathroom count columns and the weight each contributes to TotalBathrooms.
    _BATH_WEIGHTS = {
        'FullBath': 1.0,
        'HalfBath': 0.5,
        'BsmtFullBath': 1.0,
        'BsmtHalfBath': 0.5,
    }

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every source column named in the class docstring.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` plus the five derived columns; ``X`` itself is
            not modified.

        Raises
        ------
        KeyError
            If a required source column is absent from ``X``.
        """
        X = X.copy()

        # Aggregate features. A plain `a + b + c` turns the whole result into
        # NaN as soon as one term is NaN; summing with skipna keeps the known
        # terms, and min_count=1 keeps NaN when nothing at all is known.
        X['TotalSF'] = X[list(self._AREA_COLUMNS)].sum(axis=1, min_count=1)

        bath_weights = pd.Series(self._BATH_WEIGHTS)
        weighted_baths = X[list(bath_weights.index)].mul(bath_weights, axis='columns')
        X['TotalBathrooms'] = weighted_baths.sum(axis=1, min_count=1)

        X['HouseAge'] = X['YrSold'] - X['YearBuilt']

        # Interaction feature: living area scaled by overall quality.
        X['GrLivArea_OverallQual'] = X['GrLivArea'] * X['OverallQual']

        # Log transform of living area (log1p, so a zero area maps to 0).
        X['GrLivArea_log'] = np.log1p(X['GrLivArea'])

        return X

# Define pipelines for numeric and categorical features

# Numeric branch: mean imputation, min-max scaling, then degree-2 polynomial
# expansion without the constant (bias) column.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categoric_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categoric_pipe = Pipeline(steps=categoric_steps)

# Columns are routed to a branch by dtype: numeric dtypes vs. object dtype.
numeric_columns = make_column_selector(dtype_include=np.number)
object_columns = make_column_selector(dtype_include=object)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, numeric_columns),
        ('cat', categoric_pipe, object_columns),
    ]
)

# Define base models with recommended settings
# NOTE: LightGBM applies row subsampling (`subsample`, alias bagging_fraction)
# only when `subsample_freq` (alias bagging_freq) is positive. Its default is
# 0, which leaves `subsample=0.85` silently ignored, so resample every
# iteration to make the setting take effect.
lgb_model = LGBMRegressor(
    objective='regression',
    random_state=123,
    colsample_bytree=0.75,
    learning_rate=0.02,
    max_depth=4,
    n_estimators=1200,
    subsample=0.85,
    subsample_freq=1,
)

# CatBoost base learner; hyperparameters are gathered in one mapping and
# passed through unchanged.
catboost_params = {
    'iterations': 1200,
    'learning_rate': 0.02,
    'depth': 7,
    'random_seed': 123,
    'verbose': 0,
}
cat_model = CatBoostRegressor(**catboost_params)

# XGBoost base learner with a squared-error objective; hyperparameters are
# gathered in one mapping and passed through unchanged.
xgboost_params = {
    'objective': 'reg:squarederror',
    'random_state': 123,
    'colsample_bytree': 0.75,
    'learning_rate': 0.02,
    'max_depth': 4,
    'n_estimators': 1200,
    'subsample': 0.85,
}
xgb_model = XGBRegressor(**xgboost_params)

# Define stacking ensemble: the three boosted models are the base learners
# and a Ridge regressor (alpha=0.5) combines their predictions.
base_learners = [
    ('lgb', lgb_model),
    ('cat', cat_model),
    ('xgb', xgb_model),
]
meta_learner = Ridge(alpha=0.5)

stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
)

# Full pipeline: feature engineering first, then preprocessing, then the
# stacking model as the final estimator.
pipeline_steps = [
    ('feature_eng', FeatureEngineering()),
    ('preprocessor', preprocessor),
    ('model', stacking_model),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit pipeline on training data
print("Fitting model with stacking ensemble of LightGBM, CatBoost, and XGBoost...")
pipeline.fit(X_train, y_train)

# Evaluate on training set (the same rows the model was fitted on, so this is
# an in-sample figure, not a validation score).
train_pred = pipeline.predict(X_train)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises TypeError. The square root form works on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print("Training RMSE (log scale):", train_rmse)

# Predictions on test data and prepare submission
submission_file = 'submission_stacking_ensemble.csv'

# expm1 is applied to the pipeline's predictions before they are written out
# as SalePrice.
raw_test_pred = pipeline.predict(test_data)
test_pred = np.expm1(raw_test_pred)
submission = id_column.to_frame(name='Id').assign(SalePrice=test_pred)

# Save submission file, then hand it to the Colab download helper.
submission.to_csv(submission_file, index=False)
files.download(submission_file)

print(f"Submission file saved as {submission_file} and downloaded.")
