# XGBoost
XGBoost stands for "Extreme Gradient Boosting". It is an ensemble method, like random forests, but instead of training many independent trees it builds its trees sequentially. The ensemble starts out with just one (bad and inaccurate) tree. That ensemble's predictions on the training data are then evaluated by a loss function (like MAE or squared error); a separate validation set can be used to monitor progress and decide when to stop adding trees. The next tree is then built to correct the errors of the current ensemble, using the gradient of the loss to determine what needs correcting (hence "gradient boosting"). This process is repeated, building more and more trees, each one trying to correct the errors of the previous trees. The final prediction is then made by summing the predictions of all the trees, each scaled by the learning rate, rather than by averaging them as a random forest does.

In [22]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Read the training CSV, then separate the prediction target from the features.
training_data_file_path = './data/4_housing_competition/train.csv'
home_data = pd.read_csv(training_data_file_path)

# Prediction target.
y = home_data['SalePrice']

# Columns fed to the model. The order is kept as-is because it determines the
# column order of X and of every frame derived from it.
feature_names = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
    'MoSold', 'YrSold',
]

X = home_data[feature_names]

# Hold out a validation split. random_state pins the shuffle so that repeated
# runs produce the same split (train_test_split's default proportions are used).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# --- Custom thoughtful imputation for LotFrontage by MSSubClass ---
# Missing LotFrontage values are filled with the median LotFrontage of the rows
# that share the same MSSubClass. Every median is computed on the training
# split only and then reused for the validation split, so no information from
# the validation rows leaks into the fill values.
# NOTE(review): 'LotFrontage' is not listed in feature_names, so with the
# current feature list this branch is skipped. Add the column there if this
# imputation is meant to run.
if 'LotFrontage' in X_train.columns:
    # Work on copies: the splits are derived from X, and assigning into them
    # directly can trigger pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_valid = X_valid.copy()

    # Fallback for subclasses that never occur in training, or whose training
    # rows are all missing (their per-group median is NaN).
    overall_frontage_median = X_train['LotFrontage'].median()
    frontage_by_subclass = (
        X_train.groupby('MSSubClass')['LotFrontage']
        .median()
        .fillna(overall_frontage_median)
    )

    # Same rule for both splits: look up the training median for each row's
    # subclass and use it only where LotFrontage is missing.
    for split in (X_train, X_valid):
        fill_values = (
            split['MSSubClass']
            .map(frontage_by_subclass)
            .fillna(overall_frontage_median)
        )
        split['LotFrontage'] = split['LotFrontage'].fillna(fill_values)

# Sort the training columns into two groups in a single pass:
#   * categorical: object-dtype columns with fewer than 10 distinct values
#   * numerical:   columns stored as int64 or float64
categorical_cols = []
numerical_cols = []
for column in X_train.columns:
    column_dtype = X_train[column].dtype
    if column_dtype == "object" and X_train[column].nunique() < 10:
        categorical_cols.append(column)
    if column_dtype in ['int64', 'float64']:
        numerical_cols.append(column)

# Numeric gaps are filled with a constant. No fill_value is given, so
# scikit-learn's default for the 'constant' strategy applies.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical gaps become the literal category 'missing', then every category
# is mapped to an integer code. Categories unseen at fit time are encoded as -1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Keep only the categorical columns whose validation values all appear in the
# training split.
good_label_cols = [
    col for col in categorical_cols
    if set(X_valid[col]) <= set(X_train[col])
]

columns_to_keep = numerical_cols + good_label_cols

# Independent copies restricted to the columns the preprocessor will consume.
X_train_filtered = X_train[columns_to_keep].copy()
X_valid_filtered = X_valid[columns_to_keep].copy()

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, good_label_cols),
    ]
)

# Define the model as XGBoost: many boosting rounds paired with a small
# learning rate.
xgb_model = XGBRegressor(
    n_estimators=1000,   # Number of boosting rounds (trees)
    learning_rate=0.01,  # Step-size shrinkage applied to each update to help prevent overfitting
    n_jobs=4,            # Parallelisation - number of threads to use (4 in this case)
)

# Preprocessing and model bundled together, so fit/predict apply both steps.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])

my_pipeline.fit(X_train_filtered, y_train)

# Score the fitted pipeline on the held-out validation split.
preds = my_pipeline.predict(X_valid_filtered)
validation_mae = mean_absolute_error(y_valid, preds)
print(f"Mean Absolute Error: {validation_mae}")

# Retrain with all data: refit the same pipeline on every labelled row (the
# training and validation splits combined) before predicting the test set.
my_pipeline.fit(X, y)

# Load the competition test set and select the same feature columns, in the
# same order, as were used for training.
test_data_file_path = './data/4_housing_competition/test.csv'
test_data = pd.read_csv(test_data_file_path)
X_test = test_data[feature_names]

# Predictions for the test rows, consumed when the submission file is written.
test_predictions = my_pipeline.predict(X_test)

Mean Absolute Error: 16674.673828125


In [23]:
def generate_csv_from_predictions(
    ids=None,
    predictions=None,
    output_path='./data/4_housing_competition/submission.csv',
):
    """Write a two-column submission CSV (``Id``, ``SalePrice``).

    Called with no arguments, it uses the module-level ``test_data`` and
    ``test_predictions``, as before. The default output path is now relative
    to the working directory, matching the ``./data/...`` paths used to read
    the train and test CSVs. It previously pointed at an absolute path on one
    specific machine.

    Args:
        ids: Row identifiers for the ``Id`` column. Defaults to
            ``test_data.Id``.
        predictions: Predicted sale prices, aligned with ``ids``. Defaults to
            ``test_predictions``.
        output_path: Where to write the CSV. The parent directory must
            already exist.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so the zero-argument call keeps working.
    if ids is None:
        ids = test_data.Id
    if predictions is None:
        predictions = test_predictions

    output = pd.DataFrame({'Id': ids,
                           'SalePrice': predictions})

    # index=False: the submission must contain only the two named columns.
    output.to_csv(output_path, index=False)
    print("Your submission was successfully saved!")

# Write the submission CSV from the test-set predictions computed in the previous cell.
generate_csv_from_predictions()

Your submission was successfully saved!
