In [3]:
%pip install pandas scikit-learn xgboost

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/aa/ce/c0b912f2f31aeb1b756a6ba56bcd84dd1f8a148470526a48515a3f4d48cd/scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/e2/7b/8c1b410cd0604cee9a167a19f7e1746f5b92ae7d02ad574ab560b73c5a48/xgboost-2.1.1-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.6.0 from https://files.pythonhosted.org/packages/aa/7d/43ab67228ef98c6b5dd42ab386eae2d7877036970a0d7e3dd3eb47a0d530/scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

# Read the training and test tables
df_train = pd.read_csv('./home-data/train.csv')
df_test = pd.read_csv('./home-data/test.csv')

# Keep only the rows that pass all three outlier checks: SalePrice at or below its
# 99.5th percentile, GrLivArea of at most 4000 and LotArea of at most 100000
price_cap = df_train['SalePrice'].quantile(0.995)
is_typical = (
    (df_train['SalePrice'] <= price_cap)
    & (df_train['GrLivArea'] <= 4000)
    & (df_train['LotArea'] <= 100000)
)
df_train = df_train[is_typical]

# Separate the target from the predictors; 'Id' is left out of the feature set
X_train = df_train.drop(columns=['Id', 'SalePrice'])
y_train = df_train['SalePrice']
X_test = df_test.drop(columns=['Id'])

# Categorical features: every object-dtype column. No cardinality filter is applied,
# so all of these columns are passed on to the categorical preprocessing.
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

# Numerical features: every numeric column regardless of width (int32, float32, ...).
# Matching only 'int64'/'float64' would leave other numeric columns out of both lists,
# and a ColumnTransformer drops unlisted columns by default.
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()

# Numeric columns: fill missing values with the column median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
column_pipelines = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

# XGBoost regressor configured with the best-found parameters
xgb_model = XGBRegressor(
    subsample=0.8,
    reg_lambda=0.1,
    reg_alpha=0,
    n_estimators=1700,
    max_depth=3,
    learning_rate=0.05,
    gamma=0.3,
    colsample_bytree=0.7,
    random_state=42
)

# Each pipeline gets its own clone of the preprocessor. A Pipeline fits its steps in
# place, so sharing one ColumnTransformer instance would let fitting one pipeline
# silently refit the transformer the other pipeline predicts with.
xgb_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', xgb_model)])

# The original (baseline) model: a GradientBoostingRegressor with squared-error loss
gbr_model = GradientBoostingRegressor(n_estimators=1100, loss='squared_error', subsample=0.35, learning_rate=0.05, random_state=1)

# Pipeline for the original model, again with an independent preprocessor clone
gbr_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', gbr_model)])

# Hold out 10% of the rows so every model is scored on the same unseen data
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

# Original model: train on the remaining rows, then score its hold-out predictions
y_val_preds_gbr = gbr_pipeline.fit(X_tr, y_tr).predict(X_val)
val_mae_gbr = mean_absolute_error(y_val, y_val_preds_gbr)
print(f'MAE on validation set with original model (GBR): {val_mae_gbr}')

# XGBoost model: same split, same metric
y_val_preds_xgb = xgb_pipeline.fit(X_tr, y_tr).predict(X_val)
val_mae_xgb = mean_absolute_error(y_val, y_val_preds_xgb)
print(f'MAE on validation set with improved model (XGB): {val_mae_xgb}')

# Equal-weight blend of the two models' hold-out predictions
blended_val_preds = (y_val_preds_gbr + y_val_preds_xgb) / 2
val_mae_blended = mean_absolute_error(y_val, blended_val_preds)
print(f'MAE on validation set with blended model: {val_mae_blended}')

# Refit both pipelines on every training row before predicting the test set
for final_pipeline in (gbr_pipeline, xgb_pipeline):
    final_pipeline.fit(X_train, y_train)

# Test-set predictions from each model
test_preds_gbr = gbr_pipeline.predict(X_test)
test_preds_xgb = xgb_pipeline.predict(X_test)

# Average the two models' predictions with equal weight
blended_test_preds = (test_preds_gbr + test_preds_xgb) / 2

# Pair each test Id with its blended prediction and write the result to CSV
submission = df_test[['Id']].assign(SalePrice=blended_test_preds)
submission.to_csv('submission_blended.csv', index=False)

print("Blended submission file created successfully!")


MAE on validation set with original model (GBR): 12713.600756862701
MAE on validation set with improved model (XGB): 12486.506034482758
MAE on validation set with blended model: 12337.054198653108
Blended submission file created successfully!
