# Setup

In [1]:
# Print Data directories
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Import helpful libraries
import numpy as np
import pandas as pd
from cuml.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
# from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
import optuna
import warnings
warnings.filterwarnings('ignore')
from cuml.linear_model import Lasso, Ridge, LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt


In [3]:
# Save Data Paths
train_path = '../input/house-prices-advanced-regression-techniques/train.csv'
test_path = '../input/house-prices-advanced-regression-techniques/test.csv'

# Read data
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

target = 'SalePrice'

# Look at the shape of the data
print('Train shape:', train.shape)
print('Train shape:', test.shape)

Train shape: (1460, 81)
Train shape: (1459, 80)


# Preprocessing

In [4]:
# Concat all data
house = pd.concat([train, test])

## Deal With Missing Values

In [5]:
# Separate numerical and categorical columns
num_cols = house.select_dtypes(include=['number']).columns
cat_cols = house.select_dtypes(exclude=['number']).columns

# Apply imputers separately
house[num_cols] = SimpleImputer(strategy='mean').fit_transform(house[num_cols])
house[cat_cols] = SimpleImputer(strategy='constant', fill_value='MISSING').fit_transform(house[cat_cols])

# preserve Id dtype
house['Id'] = house['Id'].astype('int')

## Feature Engineering

In [6]:
# Create new features
house['TotalArea'] = house['TotalBsmtSF'] + house['1stFlrSF'] + house['2ndFlrSF']


# Age-Based Features
house['HouseAge'] = house['YrSold'] - house['YearBuilt']
house['YearsSinceRemodel'] = house['YrSold'] - house['YearRemodAdd']
house['GarageAge'] = house['YrSold'] - house['GarageYrBlt']

# Size-Based Features
house['TotalSF'] = house['TotalBsmtSF'] + house['1stFlrSF'] + house['2ndFlrSF']
house['AboveGradeSF'] = house['GrLivArea']
house['BasementSF'] = house['TotalBsmtSF']
house['FirstFlrSF'] = house['1stFlrSF']
house['SecondFlrSF'] = house['2ndFlrSF']

# Bathroom Features
house['TotalBath'] = house['FullBath'] + (house['HalfBath'] * 0.5) + house['BsmtFullBath'] + (house['BsmtHalfBath'] * 0.5)
house['FullBathAboveGrade'] = house['FullBath']
house['HalfBathAboveGrade'] = house['HalfBath']
house['TotalBsmtBath'] = house['BsmtFullBath'] + house['BsmtHalfBath']

# Fireplaces & Features
house['HasFireplace'] = house['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
house['FireplaceQuality'] = house['FireplaceQu'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'MISSING': 0})
house['HasPool'] = house['PoolArea'].apply(lambda x: 1 if x > 0 else 0)

# Exterior & Structural Features
house['HasGarage'] = house['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
house['GarageAge'] = house['YrSold'] - house['GarageYrBlt']
house['GarageQuality'] = house['GarageQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'MISSING': 0})
house['GarageCondition'] = house['GarageCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'MISSING': 0})

# Porch & Outdoor Features
house['TotalPorchSF'] = house['OpenPorchSF'] + house['EnclosedPorch'] + house['3SsnPorch'] + house['ScreenPorch']
house['HasDeck'] = house['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)

# Lot Features
house['LotSize'] = house['LotArea']
house['LotFrontageCategory'] = pd.cut(house['LotFrontage'], bins=[0, 50, 100, 150, 200, np.inf], labels=['Small', 'Medium', 'Large', 'Very Large', 'Huge'])
house['LotShapeReg'] = house['LotShape'].apply(lambda x: 1 if x == 'Reg' else 0)
house['LotShapeIR'] = house['LotShape'].apply(lambda x: 1 if x != 'Reg' else 0)

# Quality and Condition Features
house['QualCond'] = house['OverallQual'] * house['OverallCond']
house['HasExteriorQual'] = house['ExterQual'].apply(lambda x: 1 if x == 'Ex' else 0)
house['HasBasementQual'] = house['BsmtQual'].apply(lambda x: 1 if x == 'Ex' or x == 'Gd' else 0)

# Zoning and Neighborhood
neighborhood_quality = house.groupby('Neighborhood')['OverallQual'].mean()
house['NeighborhoodQuality'] = house['Neighborhood'].map(neighborhood_quality)

zoning_quality = house.groupby('MSZoning')['OverallQual'].mean()
house['ZoningQuality'] = house['MSZoning'].map(zoning_quality)

# Functional Features
house['FunctionalQuality'] = house['Functional'].apply(lambda x: 0 if x == 'Typ' else (1 if x in ['Min1', 'Min2'] else (2 if x in ['Mod', 'Maj1', 'Maj2'] else 3)))

# Interaction Features
house['FireplaceGarage'] = house['HasFireplace'] * house['HasGarage']
house['StyleQuality'] = house['HouseStyle'].map(lambda x: {'1Story': 1, '2Story': 2, 'SFoyer': 3}.get(x, 0)) * house['OverallQual']

# Transformation of Skewed Features
house['LogGrLivArea'] = np.log1p(house['GrLivArea'])
house['LogTotalSF'] = np.log1p(house['TotalSF'])
house['LogLotArea'] = np.log1p(house['LotArea'])
house['LogLotSize'] = np.log1p(house['LotSize'])

# Ratio Features
house['LivingToLotRatio'] = house['GrLivArea'] / house['LotArea']
house['GarageToHouseArea'] = house['GarageArea'] / house['GrLivArea']
house['BasementToTotalSF'] = house['TotalBsmtSF'] / house['TotalSF']

# # Categorical Feature Encodings
# house['ZoningLotConfig'] = house['MSZoning'] + "_" + house['LotConfig']
# house['GarageTypeEncoded'] = house['GarageType'].map({'Attchd': 1, 'Detchd': 2, 'BuiltIn': 3, 'Basment': 4, 'CarPort': 5, 'MISSING': 0})
# house['NeighborhoodGarageFinish'] = house['Neighborhood'] + "_" + house['GarageFinish'].fillna('MISSING')

# Interaction of Structural and Exterior Features
# house['TotalRoomsPerStyle'] = house['TotRmsAbvGrd'] / house['HouseStyle'].map({'1Story': 1, '2Story': 2, 'SFoyer': 3, 'SLvl': 4, '2.5Fin': 5})
house['PorchAndPool'] = house['HasDeck'] * house['HasPool']
house['QualPoolArea'] = house['OverallQual'] * house['PoolArea']

# More Features from Quality and Condition
house['QualityBasement'] = house['OverallQual'] * house['BsmtQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'MISSING': 0})
house['ConditionGarage'] = house['OverallCond'] * house['GarageQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'MISSING': 0})

# Additional Date Features
house['SaleSeason'] = house['MoSold'].apply(lambda x: 'Spring' if x in [3, 4, 5] else ('Summer' if x in [6, 7, 8] else ('Fall' if x in [9, 10, 11] else 'Winter')))
house['BuiltDecade'] = (house['YearBuilt'] // 10) * 10
house['SoldDecade'] = (house['YrSold'] // 10) * 10

# Multi-Level Features
house['FloorAreaToTotal'] = house['1stFlrSF'] / house['TotalSF']
# house['FirstToSecondFloor'] = house['1stFlrSF'] / house['2ndFlrSF'] # has 0 division
house['ConditionLotConfig'] = house['OverallCond'].astype(str) + '_' + house['LotConfig']


In [7]:
# Split in train and test data again
train = house[house['Id'].isin(train['Id'])]
test = house[house['Id'].isin(test['Id'])].drop(target, axis=1)
# Separate numerical and categorical columns
num_cols = house.select_dtypes(include=['number']).columns
cat_cols = house.select_dtypes(exclude=['number']).columns

## Target Encoding

In [8]:
# Initialize Target Encoder
encoder = ce.TargetEncoder(cols=cat_cols)

# Fit on training data and transform (store in a new DataFrame to avoid warnings)
train_encoded = encoder.fit_transform(train[cat_cols], train[target])
test_encoded = encoder.transform(test[cat_cols])

# Assign back safely using `.loc`
train.loc[:, cat_cols] = train_encoded
test.loc[:, cat_cols] = test_encoded

# Modeling

In [9]:
#Create features
features = test.columns
# Select columns corresponding to features, and preview the data
X = train[features]
y = train[target]


## Scaling

In [10]:
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

## Hyperparameter tuning

In [11]:
# # Define the objective function for optimization
# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "max_depth": trial.suggest_int("max_depth", 3, 15),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
#         "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
#         "tree_method": "gpu_hist",  # Use GPU acceleration
#         "device": "cuda",
#         "n_gpus": -1
#     }

#     model = XGBRegressor(**params, random_state=42)
#     cv_scores = cross_val_score(model, X.values, y.values, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    
#     # Calculate RMSE
#     rmse = np.sqrt(-cv_scores)
#     return np.mean(rmse) + np.std(rmse)

# # Run optimization
# study = optuna.create_study(direction="minimize")  # Minimize RMSE
# study.optimize(objective, n_trials=50)  # Adjust n_trials for more tuning

# # Print the best parameters
# print("Best hyperparameters:", study.best_params)

In [12]:
# Best hyperparameters XGBoost: {'n_estimators': 478, 'learning_rate': 0.1663328863259368, 'max_depth': 3, 'subsample': 0.7252234539866299, 'colsample_bytree': 0.7862385259565916, 'reg_alpha': 9.778581214799006, 'reg_lambda': 5.87816523829437}
best_params = {'n_estimators': 478, 'learning_rate': 0.1663328863259368, 'max_depth': 3, 'subsample': 0.7252234539866299, 'colsample_bytree': 0.7862385259565916, 'reg_alpha': 9.778581214799006, 'reg_lambda': 5.87816523829437}
# Create a Random Forest object


## Feature Selection

In [13]:
# Train XGBoost model
model = XGBRegressor(**best_params, tree_method='gpu_hist', device='cuda', random_state=42)
model.fit(X.values, y.values)

# Get feature importance
feature_importance = model.feature_importances_
features = X.columns

# Sort features by importance
sorted_idx = feature_importance.argsort()


In [14]:
# Select top N important features
N = 120  # Change this based on importance cutoff
important_features = features[sorted_idx][-N:]
X_selected = X[important_features]

print("Selected Features:", important_features.tolist())

Selected Features: ['Street', 'QualPoolArea', 'PoolQC', 'HasPool', 'HasGarage', 'LowQualFinSF', 'HasFireplace', 'Condition2', 'LogLotArea', 'GarageCond', 'TotalBsmtBath', 'SoldDecade', 'HasDeck', 'BsmtQual', 'Exterior2nd', 'BsmtFinType2', 'HasBasementQual', 'FireplaceGarage', 'MiscFeature', 'Fence', 'ExterQual', 'BldgType', 'LotShapeReg', 'RoofStyle', 'HalfBathAboveGrade', 'ExterCond', 'Foundation', 'MasVnrType', 'FireplaceQuality', 'HouseStyle', 'LotShape', 'LogGrLivArea', 'MSSubClass', 'BsmtFullBath', 'GarageToHouseArea', 'BasementToTotalSF', 'BsmtFinType1', 'Id', 'FloorAreaToTotal', 'EnclosedPorch', '3SsnPorch', 'LandSlope', 'LivingToLotRatio', 'MoSold', 'YearBuilt', 'YearsSinceRemodel', 'BsmtHalfBath', 'StyleQuality', 'Exterior1st', 'BsmtCond', 'GarageAge', 'AboveGradeSF', 'Electrical', 'YrSold', 'LotFrontageCategory', 'BsmtFinSF2', 'KitchenAbvGr', 'MiscVal', 'ConditionLotConfig', 'SaleSeason', 'WoodDeckSF', 'Alley', 'ZoningQuality', 'TotalPorchSF', 'BasementSF', 'MasVnrArea', 'Ope

### PCA

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
X_pca = pca.fit_transform(X)

## Ensembling

In [16]:
base_models = [
    ('xgb', XGBRegressor(**best_params, tree_method='gpu_hist', device='cuda', random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=200, random_state=42)),
    ('nn', MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42)),
]

## Validation

In [17]:
# Create the stacking model
model = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())

In [18]:
# Create a KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=5)

# Compute 5-fold cross-validation scores
cv_scores = cross_val_score(model, X.values, y.values, cv=kf, scoring='neg_mean_squared_error')

# Calculate RMSE
rmse = np.sqrt(-cv_scores)

# print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

print('Overall validation RMSE: {:,.0f}'.format(np.mean(rmse) + np.std(rmse)))

Overall validation RMSE: 28,618


## Training

In [19]:
# Train a model
model.fit(X.values, y.values)

## Submission

In [20]:
# Get predictions for the test set
test[target] = model.predict(test.values)

# Write test predictions using the sample_submission format
test[['Id', target]].to_csv('kaggle_submission.csv', index=False)