In [1]:
# Set up code checking
import os

# Link each course CSV into ../input on its own check. Guarding both links
# with the existence of train.csv alone would skip a missing test.csv link
# (when train.csv is present) or raise FileExistsError on test.csv (when only
# train.csv is missing).
for csv_name in ("train.csv", "test.csv"):
    link_path = f"../input/{csv_name}"
    if not os.path.exists(link_path):
        os.symlink(f"../input/home-data-for-ml-course/{csv_name}", link_path)
from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex4 import *
print("Setup Complete")

Setup Complete


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# 1. LOAD DATA
print("Loading data...")
train = pd.read_csv('../input/train.csv', index_col='Id')
test = pd.read_csv('../input/test.csv', index_col='Id')

# 2. REMOVE OUTLIERS
# Discard training rows that combine a very large living area
# (GrLivArea > 4000) with a low sale price (SalePrice < 300000).
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.loc[~is_outlier]

# 3. PREPARE TARGET
# The target is log1p(SalePrice); predictions are mapped back with expm1 later.
y = np.log1p(train['SalePrice'])
train_features = train.drop(columns='SalePrice')
test_features = test.copy()

# Combine for uniform processing: training rows first, then test rows,
# renumbered 0..n-1 so the two parts can be split again by position.
all_data = pd.concat([train_features, test_features], ignore_index=True)

# 4. CLEANING & FEATURE ENGINEERING
# A. Impute LotFrontage with the median LotFrontage of the same Neighborhood
lot_frontage_by_hood = all_data.groupby('Neighborhood')['LotFrontage']
all_data['LotFrontage'] = lot_frontage_by_hood.transform(
    lambda group: group.fillna(group.median()))

# B. Fix specific integer columns that are actually categorical or have special NA meaning
# MSSubClass is a category code, so store it as text rather than a number
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
# GarageYrBlt: a missing value is treated as "no garage" and stored as 0
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(0)

# C. Fill every remaining missing value so no NaN is left in the frame
# Object (text) columns -> "None", all other columns -> 0
objects = [name for name, dtype in all_data.dtypes.items() if dtype == 'object']
numerics = [name for name, dtype in all_data.dtypes.items() if not dtype == 'object']

all_data[objects] = all_data[objects].fillna("None")
all_data[numerics] = all_data[numerics].fillna(0)

# D. Feature Engineering
# Total floor area: basement + first floor + second floor
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
# Bathroom count where each half bath (above ground or basement) counts as 0.5
all_data['TotalBath'] = (all_data['FullBath'] + (0.5 * all_data['HalfBath']) +
                         all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath']))
# 0/1 presence flags. A vectorised comparison replaces the per-row
# .apply(lambda ...) and yields the same values: 1 where the area is > 0,
# otherwise 0.
all_data['HasPool'] = (all_data['PoolArea'] > 0).astype('int64')
all_data['Has2ndFloor'] = (all_data['2ndFlrSF'] > 0).astype('int64')
all_data['HasGarage'] = (all_data['GarageArea'] > 0).astype('int64')

# E. Skew Correction
# Compute the skewness of every non-object column and apply log1p to the
# columns whose absolute skewness is above 0.75.
column_dtypes = all_data.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(skew).sort_values(ascending=False)
skewness = skewed_feats[skewed_feats.abs() > 0.75]
skewed_features = skewness.index
log_transformed = np.log1p(all_data[skewed_features])
all_data[skewed_features] = log_transformed

# F. Manual Ordinal Encoding
# Quality grades become integers 5 (Ex) down to 1 (Po); 'None' becomes 0.
ordinal_map = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'None': 0,
}
ordinal_cols = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond',
    'PoolQC',
]

for quality_col in ordinal_cols:
    graded = all_data[quality_col].map(ordinal_map)
    # Any value that is not a key of ordinal_map maps to NaN and is stored as 0
    all_data[quality_col] = graded.fillna(0)

# G. One-Hot Encoding
all_data = pd.get_dummies(all_data)

# 5. PREPARE FOR MODELING
# Training rows were concatenated first, so the first len(y) rows are the
# training set and the remaining rows are the test set.
n_train = len(y)
X = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]

# Scale data: the scaler is fitted on the training rows only and then
# applied to both the training and the test rows.
scaler = RobustScaler()
X_scaled = scaler.fit(X).transform(X)
X_test_scaled = scaler.transform(X_test)

# 6. DEFINE MODELS
# XGBoost (Tree-based): hyperparameters gathered in a mapping, then unpacked
xgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 3460,
    'max_depth': 3,
    'min_child_weight': 0,
    'gamma': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'nthread': -1,
    'scale_pos_weight': 1,
    'seed': 27,
    'reg_alpha': 0.00006,
}
xgboost = XGBRegressor(**xgb_params)

# Gradient Boosting (Tree-based)
gbr_params = {
    'n_estimators': 3000,
    'learning_rate': 0.05,
    'max_depth': 4,
    'max_features': 'sqrt',
    'min_samples_leaf': 15,
    'min_samples_split': 10,
    'loss': 'huber',
    'random_state': 5,
}
gbr = GradientBoostingRegressor(**gbr_params)

# Ridge/Lasso (Linear): each estimator picks its alpha from the listed candidates
ridge_alphas = [14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
lasso_alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]
ridge = RidgeCV(alphas=ridge_alphas)
lasso = LassoCV(alphas=lasso_alphas, max_iter=50000, cv=10)

# 7. STACKING REGRESSOR
# The four base models are passed to the stack directly; none of them is
# wrapped in a Pipeline. The linear models (ridge, lasso) therefore depend on
# the feature matrix having been scaled before fit/predict are called.
# A RidgeCV with default settings combines the base-model predictions, and
# cv is not passed, so StackingRegressor uses its library default.
base_learners = [
    ('xgb', xgboost),
    ('gbr', gbr),
    ('ridge', ridge),
    ('lasso', lasso),
]
stack_gen = StackingRegressor(
    estimators=base_learners,
    final_estimator=RidgeCV(),
    n_jobs=-1
)

# 8. TRAIN
print("Training Stacking Ensemble...")
# Fit on the scaled training matrix against the log1p-transformed target
stack_gen.fit(X_scaled, y)

# 9. PREDICT
print("Generating predictions...")
preds_log = stack_gen.predict(X_test_scaled)
# Invert the log1p applied to SalePrice so predictions are in price units
preds = np.expm1(preds_log)

# 10. SAVE
# One row per test Id, written without the DataFrame's positional index
submission_columns = {'Id': test.index, 'SalePrice': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Submission saved successfully!")

Loading data...
Training Stacking Ensemble...
Generating predictions...
Submission saved successfully!
