In [None]:
import pandas as pd

from sklearn import set_config
from sklearn.base import clone
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import StandardScaler
# Have every transformer return pandas DataFrames so column names survive
# between steps; the nested ColumnTransformer (cat_encoder) selects its
# columns by name after the categorical imputer has run.
set_config(transform_output='pandas')

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
houses_df = pd.read_csv('../data/train.csv')

In [None]:
# Explore the data: list every column label of the raw frame.
houses_df.columns

In [None]:
# Preview the first rows to get a feel for the values in each column.
houses_df.head()


In [None]:
# Print pandas' summary report for the frame (useful for spotting dtypes and
# columns with missing values before building the preprocessing steps).
houses_df.info()

In [None]:
# The row identifier is stored as a regular column ('Id'), so move it into the
# index to keep it out of the data we train on.
# The guard keeps this cell idempotent: once 'Id' has become the index it is
# no longer a column, and calling set_index('Id') again would raise a KeyError.
if 'Id' in houses_df.columns:
    houses_df = houses_df.set_index('Id')


In [None]:
# Separate the target column from the predictors; houses_df itself is left untouched.
y = houses_df["SalePrice"].copy()
X = houses_df.drop(columns="SalePrice")

# Hold out 20% of the rows for the final evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column labels for the numeric and the categorical (non-numeric) branch of
# the preprocessing, derived from the dtypes of X.
numeric_part = X.select_dtypes(include="number")
categoric_part = X.select_dtypes(exclude="number")

X_num_columns = numeric_part.columns
X_cat_columns = categoric_part.columns

In [None]:
# WITH ORDINAL ENCODING

# Shared quality scale; the labels are kept in the order Ex, Gd, TA, Fa, Po
# (presumably best to worst - confirm against the data dictionary).
base_quality = ['Ex', 'Gd', 'TA', 'Fa', 'Po']

# One independent list per ordinal feature; two of them carry an extra
# trailing 'NA' level.
kitchen_qual = list(base_quality)
fireplace_qu = base_quality + ['NA']
HeatingQC = list(base_quality)
GarageQual = base_quality + ['NA']

# `ord_features` and `categories` are parallel: entry i of `categories`
# is the label order for column i of `ord_features`.
ord_features = ['KitchenQual', 'FireplaceQu', 'HeatingQC', 'GarageQual']
categories = [kitchen_qual, fireplace_qu, HeatingQC, GarageQual]


# Ordinal encoder: each label is replaced by its position in the matching
# list of `categories` (position 0 = first label listed).
# NOTE(review): pandas.read_csv parses the literal string 'NA' as NaN by
# default, and categoric_pipe runs cat_imputer (most_frequent) before this
# encoder, so the 'NA' level in `categories` is probably never reached -
# confirm whether missing values should be mapped to 'NA' instead.
ord_encoder = OrdinalEncoder(categories=categories)

# Every categorical column that is not ordinal gets one-hot encoded.
# Filter X_cat_columns in order rather than taking a set difference: the
# iteration order of a set of strings depends on Python's per-process hash
# randomisation, so the one-hot column order (and, through feature
# subsampling such as max_features, the fitted model) could change between
# kernel restarts even with a fixed random_state.
oh_features = [col for col in X_cat_columns if col not in ord_features]

# handle_unknown='ignore': categories unseen during fit do not raise at
# transform time. sparse_output=False: dense output, as required for the
# pandas output configured via set_config. min_frequency=0.03: categories
# seen in fewer than 3% of the samples are treated as infrequent.
oh_encoder = OneHotEncoder(handle_unknown='ignore',
                           sparse_output=False,
                           min_frequency=0.03)

# Fills missing categorical values with the most frequent label of the column.
cat_imputer = SimpleImputer(strategy='most_frequent')

# Route each categorical column to its encoder, selecting columns by name.
cat_encoder = ColumnTransformer(transformers=[
    ('oh_encoder', oh_encoder, oh_features),
    ('ord_encoder', ord_encoder, ord_features)
])

In [None]:
# Numeric branch: a single imputation step (median by default; the step is
# exposed to the grid search under the name "simpleimputer").
num_imputer = SimpleImputer(strategy="median")
numeric_pipe = make_pipeline(num_imputer)

# Categorical branch: impute missing labels first, then encode.
categoric_pipe = make_pipeline(
    cat_imputer,
    cat_encoder,
)

In [None]:
# Join both branches: numeric columns go through numeric_pipe, categorical
# columns through categoric_pipe. The numeric branch comes first, which is
# why the grid search addresses it as "pipeline-1".
branches = [
    (numeric_pipe, X_num_columns),
    (categoric_pipe, X_cat_columns),
]
preprocessor = make_column_transformer(*branches)
preprocessor

In [None]:
# End-to-end model: preprocessing -> scaling -> gradient boosting regressor.
gb_pipeline = make_pipeline(
    preprocessor,
    StandardScaler(),
    GradientBoostingRegressor(random_state=42)
)

# 5-fold cross-validation with shuffling; the seed keeps the folds reproducible.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

gb_param_grid = {
    # Imputation strategy for the numeric branch ("pipeline-1" of the column transformer).
    "columntransformer__pipeline-1__simpleimputer__strategy": ["mean"],
    # Number of boosting stages (more trees may reduce variance but increase computation time).
    "gradientboostingregressor__n_estimators": [700],
    # Max depth of each tree (higher depth increases model complexity, risking overfitting).
    "gradientboostingregressor__max_depth": [4],
    # Step size at each iteration (lower values may require more trees).
    "gradientboostingregressor__learning_rate": [0.05, 0.1],
    # Minimum samples required to split an internal node. Only one value is
    # searched: a node can only be split if it holds at least
    # 2 * min_samples_leaf samples, so with min_samples_leaf >= 3 the values
    # 2 and 3 produce identical trees and searching both just doubles the
    # number of fits. Widen this only with values above 2 * min_samples_leaf.
    "gradientboostingregressor__min_samples_split": [2],
    # Minimum samples required to be at a leaf node.
    "gradientboostingregressor__min_samples_leaf": [3, 4],
    # Fraction of samples used for fitting each tree (1.0 = no subsampling).
    "gradientboostingregressor__subsample": [0.8, 1.0],
    # Number of features considered at each split.
    "gradientboostingregressor__max_features": ['sqrt', 0.5],
    # Early stopping: stop adding trees once the validation score has not
    # improved for this many consecutive iterations.
    "gradientboostingregressor__n_iter_no_change": [5, 10],
}

# Exhaustive grid search, scored by negative MAE (higher, i.e. closer to 0, is better).
gb_search = GridSearchCV(gb_pipeline, gb_param_grid, cv=kf, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1)
gb_search.fit(X_train, y_train)

print(f"Best Parameters: {gb_search.best_params_}")
print(f"Best Cross-Validation Score: {gb_search.best_score_}")

In [None]:
# Score the tuned model on the data it was fitted on.
y_pred_train = gb_search.predict(X_train)

# Error metrics on the training split.
train_mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred_train)
train_mape = mean_absolute_percentage_error(y_true=y_train, y_pred=y_pred_train)
train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)

# Report every metric rounded to three decimals.
for metric_name, metric_value in (
    ("MAE", train_mae),
    ("RMSE", train_rmse),
    ("MAPE", train_mape),
    ("R2", train_r2),
):
    print(f"Train {metric_name}: {round(metric_value, 3)}")

In [None]:
# Score the tuned model on the held-out split.
y_pred_test = gb_search.predict(X_test)

# Error metrics on the test split.
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
test_rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred_test)
test_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

# Report every metric rounded to three decimals.
for metric_name, metric_value in (
    ("MAE", test_mae),
    ("RMSE", test_rmse),
    ("MAPE", test_mape),
    ("R2", test_r2),
):
    print(f"Test {metric_name}: {round(metric_value, 3)}")

In [None]:
# Final model: the winning configuration refitted on ALL labelled data.
# Refit a clone rather than gb_search.best_estimator_ itself: that object is
# what gb_search.predict() uses, so fitting it on X (which contains X_test)
# would silently turn any later or re-run evaluation on X_test into an
# evaluation on training data.
best_model = clone(gb_search.best_estimator_)
best_model.fit(X, y)