In [8]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 7.2 MB/s eta 0:00:21
   ---------------------------------------- 1.8/150.0 MB 4.8 MB/s eta 0:00:31
    --------------------------------------- 2.6/150.0 MB 4.3 MB/s eta 0:00:35
    --------------------------------------- 3.4/150.0 MB 4.4 MB/s eta 0:00:34
   - -------------------------------------- 4.2/150.0 MB 4.3 MB/s eta 0:00:35
   - -------------------------------------- 5.0/150.0 MB 4.1 MB/s eta 0:00:36
   - -------------------------------------- 5.8/150.0 MB 4.1 MB/s eta 0:00:35
   - -------------------------------------- 6.8/150.0 MB 4.1 MB/s eta 0:00:35
   -- ------------------------------------- 7.6/150.0 MB 4.1 MB/s eta 0:00:35
   -- ------------------------------------- 8.4/150.0 MB 4.1 MB/s eta 0:00:35
 

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
full_df = pd.concat([train_df.drop('SalePrice', axis=1), test_df], axis=0).reset_index(drop=True)
y_train = np.log1p(train_df['SalePrice']) 
test_ids = test_df['Id']

In [14]:
none_cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']
for col in none_cols:
    full_df[col] = full_df[col].fillna('None')

zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
             'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
for col in zero_cols:
    full_df[col] = full_df[col].fillna(0)

mode_cols = ['MSZoning', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType',
             'Electrical', 'Functional', 'Utilities']
for col in mode_cols:
    full_df[col] = full_df[col].fillna(full_df[col].mode()[0])

num_cols = full_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    if full_df[col].isnull().sum() > 0:
        full_df[col] = full_df[col].fillna(full_df[col].median())

In [20]:
full_df['TotalSF'] = full_df['TotalBsmtSF'] + full_df['1stFlrSF'] + full_df['2ndFlrSF']
full_df['TotalBath'] = (full_df['BsmtFullBath'] + 0.5 * full_df['BsmtHalfBath'] +
                        full_df['FullBath'] + 0.5 * full_df['HalfBath'])
full_df['HouseAge'] = full_df['YrSold'] - full_df['YearBuilt']
full_df['RemodAge'] = full_df['YrSold'] - full_df['YearRemodAdd']
full_df['GarageAge'] = full_df['YrSold'] - full_df['GarageYrBlt']
full_df['GarageAge'] = full_df['GarageAge'].replace(2025, 0)
full_df['TotalPorchSF'] = full_df['OpenPorchSF'] + full_df['EnclosedPorch'] + \
                          full_df['3SsnPorch'] + full_df['ScreenPorch']

full_df['OverallQualCat'] = pd.cut(full_df['OverallQual'], bins=[0, 4, 6, 10],
                                    labels=["Low", "Medium", "High"])
full_df['OverallCondCat'] = pd.cut(full_df['OverallCond'], bins=[0, 4, 6, 10],
                                    labels=["Bad", "Average", "Good"])

In [22]:
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
                'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}
for col in ordinal_cols:
    full_df[col] = full_df[col].map(quality_map).astype(int)

categorical_cols = full_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    full_df[col] = full_df[col].astype(str)
    lbl = LabelEncoder()
    full_df[col] = lbl.fit_transform(full_df[col])

In [24]:
full_df['OverallQualCat'] = LabelEncoder().fit_transform(full_df['OverallQualCat'].astype(str))
full_df['OverallCondCat'] = LabelEncoder().fit_transform(full_df['OverallCondCat'].astype(str))

In [26]:
numerical_feats = full_df.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
full_df[numerical_feats] = scaler.fit_transform(full_df[numerical_feats])

In [28]:
X_train = full_df.iloc[:train_df.shape[0], :]
X_test = full_df.iloc[train_df.shape[0]:, :]

In [30]:
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cv_score = cross_val_score(xgb_model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5)
print(f"Cross-validation RMSE: {-cv_score.mean():.4f}")

xgb_model.fit(X_train, y_train)

Cross-validation RMSE: 0.1226


In [32]:
preds = xgb_model.predict(X_test)
final_preds = np.expm1(preds)

submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': final_preds
})
submission.to_csv('submission.csv', index=False)
print("✅ Finished! submission.csv created.")

✅ Finished! submission.csv created.
