In [41]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd
import project_libs as libs
from sklearn.ensemble import GradientBoostingRegressor

In [42]:
# Categorical columns handed to the encoder below.
feature = ['city', 'statezip', 'country', 'street']

# Read the CSV, discard the 'date' column, then run the project's
# missing-value step. What that step does is defined in project_libs.
df = libs.preprocess_missing_values(
    libs.read_csv_to_dataframe("data/data.csv").drop(columns=['date'])
)

# 'price' is passed as the target column name, with label encoding requested
# for the columns in `feature`.
# NOTE(review): the four return values are presumably a train/test split,
# judging only by the names they are unpacked into; confirm in project_libs.
X_train, X_test, y_train, y_test = libs.Feature_Encoding(
    df,
    'price',
    feature,
    encoding_method='label',
)

shape (4600, 18)
----------------------------------------------------------------------------------------------------
List of columns
['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country']
----------------------------------------------------------------------------------------------------
Data info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   i

In [43]:
# def hyperparameter_tuning(model, param_grid, X_train, y_train, scoring):
#     grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=1)
#     grid_search.fit(X_train, y_train)
#     return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

def hyperparameter_tuning(model, param_grid, X_train, y_train, scoring,
                          n_iter=100, cv=5, random_state=None, n_jobs=None):
    """Tune ``model`` with a cross-validated randomized search.

    Builds a ``RandomizedSearchCV`` around ``model``, fits it on the training
    data and returns the winning configuration.

    Parameters
    ----------
    model
        Estimator forwarded as ``estimator``.
    param_grid
        Search space forwarded as ``param_distributions``.
    X_train, y_train
        Training data forwarded to ``fit``.
    scoring
        Metric forwarded as ``scoring`` (e.g. ``'r2'``).
    n_iter : int, default 100
        Number of parameter settings to sample. Lower it for a quick smoke
        run.
    cv : default 5
        Cross-validation setting forwarded as ``cv``.
    random_state : default None
        Seed forwarded to the search; pass an int to make the sampled
        candidates repeatable. ``None`` keeps the previous unseeded behaviour.
    n_jobs : default None
        Parallelism setting forwarded as ``n_jobs``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` read from the fitted
        search object.
    """
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        scoring=scoring,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    random_search.fit(X_train, y_train)
    return (
        random_search.best_estimator_,
        random_search.best_params_,
        random_search.best_score_,
    )

In [44]:
# Fixed seed for the estimator: with subsample < 1 or a max_features subset the
# boosting fit is stochastic, so an unseeded model gives different scores on
# every run of the notebook.
RANDOM_SEED = 42

model = GradientBoostingRegressor(random_state=RANDOM_SEED)

# Candidate values handed to hyperparameter_tuning as the search space.
param_grid = {
    'n_estimators': [50, 100, 150],        # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.1, 0.2],     # Shrinkage applied to each tree's contribution
    'max_depth': [3, 5, 7],                # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
    'subsample': [0.8, 1.0],               # Fraction of training rows used to fit each tree
    'max_features': [10, 'sqrt', 'log2', None],  # Features considered per split (an int means exactly that many)
    # NOTE(review): 'quantile' fits the quantile given by the estimator's
    # `alpha` (left at its default here), not the mean, so candidates using it
    # are likely to score poorly on 'r2'. Confirm it belongs in this search.
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],  # Loss function to be optimized
}

# Run the search, scoring candidates with R^2.
best_model, best_params, best_score = hyperparameter_tuning(model, param_grid, X_train, y_train, 'r2')

In [None]:
# Report the winning cross-validated score, then the parameters that produced it,
# one per line.
print(best_score, best_params, sep="\n")