In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer

In [None]:
from sklearn.cluster import KMeans

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor

In [None]:
# Load the raw labelled (train) and unlabelled (test) CSV files.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# The 20 training columns with the most missing values.
train_null_counts = train.isnull().sum()
print(train_null_counts.sort_values(ascending=False).head(20))

In [None]:
# The 35 test columns with the most missing values.
test_null_counts = test.isnull().sum()
print(test_null_counts.sort_values(ascending=False).head(35))

In [None]:
# Data cleaning: missing values.
# Each group of columns below gets one fill rule, applied to train and test alike.

# Mostly-missing columns: NaN is replaced with the literal category 'None'.
high_missing_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
# Moderately-missing columns filled the same way (LotFrontage is handled by KNN later).
middle_missing_cols = ['MasVnrType', 'FireplaceQu']
# Garage descriptors: NaN is replaced with 'None'.
garage_cols = ['GarageQual', 'GarageFinish', 'GarageType', 'GarageCond']
# Rarely-missing numeric columns: NaN is replaced with 0 (MSZoning is handled by KNN later).
zero_fill_cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
# Categoricals plus GarageCars / GarageArea: NaN is replaced with the column's most
# frequent value, computed separately for each frame.
cat_cols = ['Electrical', 'Utilities', 'Functional', 'Exterior1st', 'Exterior2nd', 'SaleType', 'KitchenQual',
            'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2','GarageCars', 'GarageArea']

for frame in (train, test):
    for col in high_missing_cols + middle_missing_cols:
        frame[col] = frame[col].fillna('None')
    for col in zero_fill_cols:
        frame[col] = frame[col].fillna(0)
    for col in cat_cols:
        most_frequent = frame[col].mode()[0]
        frame[col] = frame[col].fillna(most_frequent)
    frame['GarageYrBlt'] = frame['GarageYrBlt'].fillna(0)
    for col in garage_cols:
        frame[col] = frame[col].fillna('None')


# KNN imputation of LotFrontage and MSZoning.
# Train and test rows are stacked, imputed together from a few related columns,
# and then written back to their own frames by position.
knn_cols = ['LotFrontage', 'MSZoning']
related_cols = ['GrLivArea', 'LotArea', 'OverallQual', 'Neighborhood']
n_train = train.shape[0]
# ignore_index gives the stacked frame a single positional index: rows [0, n_train)
# come from train and rows [n_train, end) come from test.
combined = pd.concat([train[knn_cols + related_cols], test[knn_cols + related_cols]],
                     axis=0, ignore_index=True)

# Encode the categorical columns as numeric codes. Missing entries must stay NaN:
# replacing them with a placeholder category would hide them from KNNImputer and
# the placeholder would survive as a real category after decoding.
label_encoders = {}
for col in ['MSZoning', 'Neighborhood']:
    le = LabelEncoder()
    observed = combined[col].notna()
    le.fit(combined.loc[observed, col])
    codes = pd.Series(np.nan, index=combined.index, dtype=float)
    codes.loc[observed] = le.transform(combined.loc[observed, col])
    combined[col] = codes
    label_encoders[col] = le

# Standardize the numeric columns before the neighbour search; the scaling is
# reversed after imputation.
scaler = StandardScaler()
num_cols = ['LotFrontage', 'GrLivArea', 'LotArea', 'OverallQual']
combined[num_cols] = scaler.fit_transform(combined[num_cols])

imputer = KNNImputer(n_neighbors=5, weights='distance')
combined_imputed = pd.DataFrame(imputer.fit_transform(combined), columns=combined.columns)
combined_imputed[num_cols] = scaler.inverse_transform(combined_imputed[num_cols])

# Decode the categorical codes. An imputed code is a weighted average of neighbour
# codes, so it is rounded and clipped to a valid class index before decoding.
# NOTE(review): averaging nominal codes is only a rough proxy for a neighbour vote.
for col in ['MSZoning', 'Neighborhood']:
    le = label_encoders[col]
    class_idx = combined_imputed[col].round().clip(0, len(le.classes_) - 1).astype(int)
    combined_imputed[col] = le.inverse_transform(class_idx)

# Write the imputed columns back by position. Test must take the rows AFTER the
# train rows; plain arrays are assigned so index labels cannot cause misalignment.
for col in knn_cols:
    train[col] = combined_imputed[col].iloc[:n_train].to_numpy()
    test[col] = combined_imputed[col].iloc[n_train:].to_numpy()

# validate missing values
print("Train missing values:\n", train.isnull().sum().sort_values(ascending=False).head(5))
print("Test missing values:\n", test.isnull().sum().sort_values(ascending=False).head(5))

In [None]:
# Outlier removal: plot living area against price, then drop the extreme rows.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='GrLivArea', y='SalePrice', hue='PoolQC', data=train, ax=ax)
ax.set_title('GrLivArea vs SalePrice (Colored by PoolQC)')
plt.show()
# Keep only rows with GrLivArea below 4000 and SalePrice below 700000.
keep_rows = (train['GrLivArea'] < 4000) & (train['SalePrice'] < 700000)
train = train[keep_rows]
print("Train shape after outlier removal:", train.shape)



In [None]:
# Cast the categorical columns (including numerically coded ones) to strings.
cat_cols = ['MSSubClass', 'OverallQual', 'OverallCond', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MSZoning']
str_dtypes = dict.fromkeys(cat_cols, str)
train = train.astype(str_dtypes)
test = test.astype(str_dtypes)

In [None]:
# Write the cleaned frames to disk (without the pandas index), then report their shapes.
cleaned_outputs = (
    ("Train shape after cleaning:", train, 'train_cleaned_knn.csv'),
    ("Test shape after cleaning:", test, 'test_cleaned_knn.csv'),
)
for _, frame, csv_path in cleaned_outputs:
    frame.to_csv(csv_path, index=False)
for message, frame, _ in cleaned_outputs:
    print(message, frame.shape)



In [None]:
# Visual check of the imputed columns on the training set.
# LotFrontage: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train['LotFrontage'], kde=True, label='Train (KNN Imputed)', ax=ax)
ax.set_title('LotFrontage Distribution After KNN Imputation')
ax.legend()
plt.show()

# MSZoning: sale price distribution per zoning class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='MSZoning', y='SalePrice', data=train, ax=ax)
ax.set_title('MSZoning vs SalePrice After KNN Imputation')
plt.show()

In [None]:

# Feature engineering: reload the cleaned data.
def _read_cleaned(csv_path):
    """Read a cleaned CSV and index its rows by the 'Id' column."""
    # keep_default_na=False avoids NaN issues when reading the file back (important).
    frame = pd.read_csv(csv_path, keep_default_na=False)
    # Moving 'Id' into the index keeps it out of the feature columns (important).
    return frame.set_index('Id')


train1 = _read_cleaned('train_cleaned_knn.csv')
test1 = _read_cleaned('test_cleaned_knn.csv')

# Flag each row's origin so the stacked frame can be split apart again later.
train1['is_train'] = 1
test1['is_train'] = 0
# Stack train (without the target) on top of test so both get identical feature engineering.
combined = pd.concat([train1.drop(columns='SalePrice'), test1], axis=0)


In [None]:
# 1. Create new features.
# Total area: basement plus first and second floor.
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']

# House age at sale (year sold - year built).
combined['HouseAge'] = combined['YrSold'] - combined['YearBuilt']

# Years since remodelling at sale (year sold - year remodeled).
combined['RemodAge'] = combined['YrSold'] - combined['YearRemodAdd']

# Total bathroom count (above ground + basement); half baths count as 0.5.
combined['TotalBath'] = combined['FullBath'] + 0.5 * combined['HalfBath'] + \
                       combined['BsmtFullBath'] + 0.5 * combined['BsmtHalfBath']

# Room count feature.
# NOTE(review): TotRmsAbvGrd may already include bedrooms, in which case this
# double-counts them — confirm against the data dictionary.
combined['TotalRooms'] = combined['TotRmsAbvGrd'] + combined['BedroomAbvGr']

# Garage age at sale. Rows without a garage carry the placeholder GarageYrBlt == 0
# from the cleaning step; subtracting that placeholder would give an "age" equal to
# the sale year, so those rows are set to 0 explicitly.
has_garage = combined['GarageYrBlt'] > 0
combined['GarageAge'] = (combined['YrSold'] - combined['GarageYrBlt']).where(has_garage, 0)

# Binary presence flags for pool, fence and miscellaneous feature.
combined['HasPool'] = (combined['PoolArea'] > 0).astype(int)
combined['HasFence'] = (combined['Fence'] != 'None').astype(int)
combined['HasMisc'] = (combined['MiscFeature'] != 'None').astype(int)

# Interaction features.
combined['SF_Qual'] = combined['TotalSF'] * combined['OverallQual']
combined['LivArea_Bedroom'] = combined['GrLivArea'] * combined['BedroomAbvGr']
# season feature
def get_season(month):
    """Return the season name for a month number.

    3-5 -> 'Spring', 6-8 -> 'Summer', 9-11 -> 'Fall'; any other value -> 'Winter'.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Winter'
# Derive the sale season from the month-sold column, one value per row.
combined['Season'] = combined['MoSold'].map(get_season)


In [None]:
# Show both the number of distinct FireplaceQu levels and the levels themselves.
# Only a cell's last expression is displayed, so the two results are returned
# together as one tuple instead of silently discarding the first.
combined['FireplaceQu'].nunique(), combined['FireplaceQu'].unique()

In [None]:
# Distinct ExterQual levels, in order of first appearance.
pd.unique(combined['ExterQual'])

In [None]:
# 2. Categorical encoding.
# Ordinal variables are replaced by integer scores via a lookup table.
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 
                'BsmtExposure', 'Functional','PoolQC']
# Shared quality scale used by every ordinal column without a dedicated table.
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0, 'NA': 0 }
bsmt_exposure_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1,  'NA': 0}
functional_map = {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 
                  'Maj2': 3, 'Sev': 2, 'Sal': 1}
PoolQC_map = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0}
# Columns with their own scale; values absent from the chosen table become NaN.
dedicated_maps = {
    'BsmtExposure': bsmt_exposure_map,
    'Functional': functional_map,
    'PoolQC': PoolQC_map,
}
for col in ordinal_cols:
    score_table = dedicated_maps.get(col, quality_map)
    combined[col] = combined[col].map(score_table)

In [None]:
# Label encoding: each nominal column is replaced by integer codes, and the fitted
# encoder is kept in label_encoders so codes can be decoded later.
nominal_cols = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 
                'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 
                'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
                'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1', 
                'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'GarageType', 
                'GarageFinish', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 
                'SaleCondition', 'Season']
label_encoders = {}
for col in nominal_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
    label_encoders[col] = le

# Target encoding of Neighborhood: mean SalePrice per neighborhood, from the training rows.
# combined['Neighborhood'] now holds integer codes, while neighborhood_mean is keyed
# by the original names. Mapping the codes would miss every key and leave a constant
# column, so the lookup uses the untouched names in train1/test1. Assignment aligns
# on the shared Id index.
neighborhood_mean = train1.groupby('Neighborhood')['SalePrice'].mean()
raw_neighborhood = pd.concat([train1['Neighborhood'], test1['Neighborhood']])
combined['Neighborhood_MeanPrice'] = raw_neighborhood.map(neighborhood_mean)
# Neighborhoods not seen in training fall back to the mean of the per-neighborhood means.
combined['Neighborhood_MeanPrice'] = combined['Neighborhood_MeanPrice'].fillna(neighborhood_mean.mean())

# 3. Clustering: assign every row to one of 5 KMeans clusters built from size and quality.
cluster_features = ['TotalSF', 'OverallQual', 'GrLivArea']
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_input = combined[cluster_features]
combined['Cluster'] = kmeans.fit_predict(cluster_input)

# 4. delete low information columns
# low_info_cols = ['LowQualFinSF', 'MiscVal', 'PoolArea']
# combined = combined.drop(low_info_cols, axis=1)

# 5. Deal with skewness.
# The target is moved to the log1p scale.
train1['SalePrice'] = np.log1p(train1['SalePrice'])

# Every numeric feature (the is_train flag excluded) whose absolute skewness exceeds
# 0.75 is log1p-transformed.
numeric_cols = combined.select_dtypes(include=[np.number]).columns.drop(['is_train'])
skewness = combined[numeric_cols].apply(pd.Series.skew).sort_values(ascending=False)
skewed_cols = skewness[skewness.abs() > 0.75].index
for col in skewed_cols:
    # Negative values are clipped to 0 before taking the logarithm.
    non_negative = combined[col].clip(lower=0)
    combined[col] = np.log1p(non_negative)

# 6. Standardize numeric features (fit on the stacked train + test rows).
scaler = StandardScaler()
scaler.fit(combined[numeric_cols])
combined[numeric_cols] = scaler.transform(combined[numeric_cols])

# 7. Separate train and test sets using the is_train flag, then drop the flag.
is_train_row = combined['is_train'] == 1
is_test_row = combined['is_train'] == 0
train_processed = combined[is_train_row].drop(columns='is_train')
test_processed = combined[is_test_row].drop(columns='is_train')
# Re-attach the log-transformed target; assignment aligns on the row index.
train_processed['SalePrice'] = train1['SalePrice']

# 8. Validate feature engineering: list the new features, report shapes and the
# largest per-column count of missing values.
new_features = ['TotalSF', 'HouseAge', 'RemodAge', 'TotalBath', 
                'TotalRooms', 'GarageAge', 'HasPool', 'HasFence', 
                'HasMisc', 'SF_Qual', 'LivArea_Bedroom', 
                'Neighborhood_MeanPrice', 'Season', 'Cluster']
print("New features created:", new_features)
print("Train processed shape:", train_processed.shape)
print("Test processed shape:", test_processed.shape)
train_max_nulls = train_processed.isnull().sum().max()
test_max_nulls = test_processed.isnull().sum().max()
print("Missing values in train:", train_max_nulls)
print("Missing values in test:", test_max_nulls)

# Save the processed frames; the index is not written to the files.
for frame, csv_path in ((train_processed, 'train_processed.csv'),
                        (test_processed, 'test_processed.csv')):
    frame.to_csv(csv_path, index=False)

In [None]:
# The 5 processed training columns with the most missing values.
train_processed_nulls = train_processed.isnull().sum()
print(train_processed_nulls.sort_values(ascending=False).head(5))

In [None]:
# The 5 processed test columns with the most missing values.
test_processed_nulls = test_processed.isnull().sum()
print(test_processed_nulls.sort_values(ascending=False).head(5))

In [None]:
# Model training: reload the processed data written by the feature-engineering step.
train_processed, test_processed = (
    pd.read_csv(csv_path) for csv_path in ('train_processed.csv', 'test_processed.csv')
)

# Features are every column except the target; the target is the log-transformed SalePrice.
y = train_processed['SalePrice']
X = train_processed.drop(columns=['SalePrice'])
X_test = test_processed.copy()

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Baseline models with default hyperparameters, scored on the validation split.
def _fit_and_score(model):
    """Fit ``model`` on the training split; return (validation predictions, validation RMSE)."""
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    return val_pred, np.sqrt(mean_squared_error(y_val, val_pred))


# Random forest
rf = RandomForestRegressor(random_state=42)
rf_pred, rf_rmse = _fit_and_score(rf)
print(f"RandomForest RMSE: {rf_rmse:.5f}")

# XGBoost
xgb = XGBRegressor(random_state=42)
xgb_pred, xgb_rmse = _fit_and_score(xgb)
print(f"XGBoost RMSE: {xgb_rmse:.5f}")

# LightGBM
lgb = LGBMRegressor(random_state=42)
lgb_pred, lgb_rmse = _fit_and_score(lgb)
print(f"LightGBM RMSE: {lgb_rmse:.5f}")



In [None]:
# XGBoost hyperparameter tuning: exhaustive grid search, 5-fold CV on the training split.
xgb_param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
    min_child_weight=[1, 3, 5],
)

xgb = XGBRegressor(random_state=42)
xgb_grid = GridSearchCV(
    xgb,
    xgb_param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

# The scorer is the negated RMSE, so the sign is flipped for reporting.
print("best XGBoost params:", xgb_grid.best_params_)
print("best XGBoost RMSE:", -xgb_grid.best_score_)

# Score the best estimator found by the search on the held-out validation split.
xgb_best = xgb_grid.best_estimator_
xgb_pred = xgb_best.predict(X_val)
xgb_val_mse = mean_squared_error(y_val, xgb_pred)
xgb_rmse = np.sqrt(xgb_val_mse)
print(f"Hyperparameter tuned XGBoost RMSE: {xgb_rmse:.5f}")

In [None]:
# LightGBM hyperparameter tuning: exhaustive grid search, 5-fold CV on the training split.
lgb_param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    num_leaves=[15, 31, 63],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

lgb = LGBMRegressor(random_state=42)
lgb_grid = GridSearchCV(
    lgb,
    lgb_param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
lgb_grid.fit(X_train, y_train)

# The scorer is the negated RMSE, so the sign is flipped for reporting.
print("Best LightGBM params:", lgb_grid.best_params_)
print("Best LightGBM RMSE:", -lgb_grid.best_score_)

# Score the best LightGBM estimator found by the search on the held-out validation split.
lgb_best = lgb_grid.best_estimator_
lgb_pred = lgb_best.predict(X_val)
lgb_val_mse = mean_squared_error(y_val, lgb_pred)
lgb_rmse = np.sqrt(lgb_val_mse)
print(f"Hyperparameter tuned LightGBM RMSE: {lgb_rmse:.5f}")

In [None]:
# Feature importance visualization for the two tuned models.
def _plot_top_importances(model, title):
    """Bar-plot the 10 largest feature importances of ``model``; return all of them, sorted descending."""
    importance = pd.Series(model.feature_importances_, index=X.columns)
    importance = importance.sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(10, 6))
    importance.iloc[:10].plot(kind='bar', ax=ax)
    ax.set_title(title)
    plt.show()
    return importance


xgb_importance = _plot_top_importances(xgb_best, 'XGBoost top 10 feature importance')
lgb_importance = _plot_top_importances(lgb_best, 'LightGBM top 10 feature importance')

In [None]:
# 5-fold cross-validation of both tuned models on the full training data.
# The scorer is the negated RMSE; the reported spread is two standard deviations.
cv_settings = dict(cv=5, scoring='neg_root_mean_squared_error')

xgb_cv_scores = cross_val_score(xgb_best, X, y, **cv_settings)
xgb_cv_rmse, xgb_cv_spread = -xgb_cv_scores.mean(), xgb_cv_scores.std() * 2
print(f"XGBoost cross-validation RMSE: {xgb_cv_rmse:.5f} (+/- {xgb_cv_spread:.5f})")

lgb_cv_scores = cross_val_score(lgb_best, X, y, **cv_settings)
lgb_cv_rmse, lgb_cv_spread = -lgb_cv_scores.mean(), lgb_cv_scores.std() * 2
print(f"LightGBM cross-validation RMSE: {lgb_cv_rmse:.5f} (+/- {lgb_cv_spread:.5f})")

In [None]:
# Test-set predictions from both tuned models (still on the log1p scale).
xgb_test_pred = xgb_best.predict(X_test)
lgb_test_pred = lgb_best.predict(X_test)

# Simple ensemble: average the two predictions, then undo the log1p transform with expm1.
final_pred = np.expm1((xgb_test_pred + lgb_test_pred) / 2)

# Build the submission from the raw test frame's Id column and write it without the index.
submission = test[['Id']].assign(SalePrice=final_pred)
submission.to_csv('submission_baseline.csv', index=False)

In [None]:
# Stacking ensemble: a default random forest plus XGBoost and LightGBM rebuilt with
# the best parameters from their grid searches, combined by a Ridge meta learner.
tuned_xgb = XGBRegressor(**xgb_grid.best_params_, random_state=42)
tuned_lgb = LGBMRegressor(**lgb_grid.best_params_, random_state=42)
base_models = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('xgb', tuned_xgb),
    ('lgb', tuned_lgb),
]
meta_learner = Ridge()

# Initialize and train the stacking model on the training split.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
)
stacking_model.fit(X_train, y_train)

# Validation score on the held-out split.
stacking_pred = stacking_model.predict(X_val)
stacking_val_mse = mean_squared_error(y_val, stacking_pred)
stacking_rmse = np.sqrt(stacking_val_mse)
print(f"Stacking RMSE: {stacking_rmse:.5f}")

# 5-fold cross-validation on the full training data (scorer is the negated RMSE).
stacking_cv_scores = cross_val_score(
    stacking_model, X, y, cv=5, scoring='neg_root_mean_squared_error'
)
stacking_cv_rmse = -stacking_cv_scores.mean()
stacking_cv_spread = stacking_cv_scores.std() * 2
print(f"Stacking cross-validation RMSE: {stacking_cv_rmse:.5f} (+/- {stacking_cv_spread:.5f})")

# Predict on the test set and undo the log1p transform.
stacking_test_pred = np.expm1(stacking_model.predict(X_test))

# Save the submission file without the index.
submission_stacking = pd.DataFrame({'Id': test['Id'], 'SalePrice': stacking_test_pred})
submission_stacking.to_csv('submission_stacking.csv', index=False)