In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
y = np.log1p(train_data.SalePrice)


In [4]:

numerical_features = [
    'LotArea', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'OverallQual', 'OverallCond', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'GrLivArea', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt'
]

categorical_features = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition'
]


all_features = numerical_features + categorical_features


In [5]:
X = train_data[all_features].copy()


for feature in numerical_features:
    if feature in X.columns:
        X[feature] = X[feature].fillna(X[feature].median())


for feature in categorical_features:
    if feature in X.columns:
        X[feature] = X[feature].fillna('Unknown')


label_encoders = {}
for feature in categorical_features:
    if feature in X.columns:
        le = LabelEncoder()
        X[feature] = le.fit_transform(X[feature].astype(str))
        label_encoders[feature] = le


In [7]:
X['TotalSF'] = X['1stFlrSF'] + X['2ndFlrSF'] + X['TotalBsmtSF']
X['TotalBath'] = X['FullBath'] + 0.5 * X['HalfBath']
X['Age'] = 2024 - X['YearBuilt']
X['RemodAge'] = 2024 - X['YearRemodAdd']
X['GarageAge'] = 2024 - X['GarageYrBlt']
X['HasPool'] = (X['PoolArea'] > 0).astype(int)
X['HasFireplace'] = (X['Fireplaces'] > 0).astype(int)
X['HasGarage'] = (X['GarageArea'] > 0).astype(int)
X['HasBasement'] = (X['TotalBsmtSF'] > 0).astype(int)


X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

print("Training data shape:", X.shape)
print("Features used:", len(X.columns))


Training data shape: (1460, 83)
Features used: 83


In [8]:


train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)


models = {
    'Random Forest': RandomForestRegressor(n_estimators=400, max_depth=20, random_state=1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=400, max_depth=6, learning_rate=0.05, random_state=1),
    'HistGradientBoosting': HistGradientBoostingRegressor(max_depth=8, learning_rate=0.05, max_bins=255, random_state=1),
    'Linear Regression': LinearRegression()
}

best_model = None
best_score = float('inf')
best_name = ""

print("\nModel Performance:")
for name, model in models.items():

    model.fit(train_X, train_y)


    predictions = model.predict(val_X)


    mae = mean_absolute_error(np.expm1(predictions), np.expm1(val_y))
    print(f"{name}: MAE = {mae:,.0f}")

    if mae < best_score:
        best_score = mae
        best_model = model
        best_name = name

print(f"\nBest model: {best_name} with MAE: {best_score:,.0f}")


best_model.fit(X, y)


Model Performance:
Random Forest: MAE = 16,689
Gradient Boosting: MAE = 15,408
HistGradientBoosting: MAE = 15,800
Linear Regression: MAE = 15,762

Best model: Gradient Boosting with MAE: 15,408


In [9]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)
cv_scores = []
for tr_idx, va_idx in kf.split(X):
    _trX, _vaX = X.iloc[tr_idx], X.iloc[va_idx]
    _trY, _vaY = y.iloc[tr_idx], y.iloc[va_idx]
    mdl = models[best_name]
    mdl.fit(_trX, _trY)
    preds = mdl.predict(_vaX)
    cv_mae = mean_absolute_error(np.expm1(preds), np.expm1(_vaY))
    cv_scores.append(cv_mae)
print(f"5-Fold CV MAE: {np.mean(cv_scores):,.0f} ± {np.std(cv_scores):,.0f}")


test_X = test_data[all_features].copy()


for feature in numerical_features:
    if feature in test_X.columns:
        test_X[feature] = test_X[feature].fillna(test_X[feature].median())


for feature in categorical_features:
    if feature in test_X.columns:
        test_X[feature] = test_X[feature].fillna('Unknown')


for feature in categorical_features:
    if feature in test_X.columns and feature in label_encoders:
        le = label_encoders[feature]

        test_X[feature] = test_X[feature].astype(str)
        most_common = le.classes_[0]
        test_X[feature] = test_X[feature].apply(lambda x: x if x in le.classes_ else most_common)
        test_X[feature] = le.transform(test_X[feature])


5-Fold CV MAE: 16,103 ± 1,301


In [10]:

test_X['TotalSF'] = test_X['1stFlrSF'] + test_X['2ndFlrSF'] + test_X['TotalBsmtSF']
test_X['TotalBath'] = test_X['FullBath'] + 0.5 * test_X['HalfBath']
test_X['Age'] = 2024 - test_X['YearBuilt']
test_X['RemodAge'] = 2024 - test_X['YearRemodAdd']
test_X['GarageAge'] = 2024 - test_X['GarageYrBlt']
test_X['HasPool'] = (test_X['PoolArea'] > 0).astype(int)
test_X['HasFireplace'] = (test_X['Fireplaces'] > 0).astype(int)
test_X['HasGarage'] = (test_X['GarageArea'] > 0).astype(int)
test_X['HasBasement'] = (test_X['TotalBsmtSF'] > 0).astype(int)


test_X = test_X.replace([np.inf, -np.inf], np.nan)
test_X = test_X.fillna(test_X.median())


test_preds = np.expm1(best_model.predict(test_X))


output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_preds})
output.to_csv('improved_submission.csv', index=False)

print(f"\nImproved predictions saved to improved_submission.csv")
print(f"Number of predictions: {len(test_preds)}")
print(f"Sample predictions:")
print(output.head())


if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 most important features:")
    print(feature_importance.head(10))



Improved predictions saved to improved_submission.csv
Number of predictions: 1459
Sample predictions:
     Id      SalePrice
0  1461  127189.188616
1  1462  161493.155032
2  1463  185783.926247
3  1464  197725.134589
4  1465  189267.218154

Top 10 most important features:
         feature  importance
20   OverallQual    0.452230
74       TotalSF    0.293549
59    CentralAir    0.021836
26     GrLivArea    0.021341
12    GarageArea    0.017017
22    BsmtFinSF1    0.014627
0        LotArea    0.014044
39  Neighborhood    0.010030
11    GarageCars    0.009803
27   LotFrontage    0.009778
