In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
import pandas as pd
import project_libs as libs
from sklearn.ensemble import GradientBoostingRegressor

In [15]:
data = libs.read_csv_to_dataframe("data/data.csv")
df = data.copy()

df.drop(columns=['date','country'], axis=1, inplace=True)
df.info()

features = ['street','city','statezip']
target_column = 'price'

models = [
    ('Gradient Boosting', GradientBoostingRegressor, {'n_estimators': 100})
    
]

X_train, X_test, y_train, y_test,result = libs.run_ml_pipeline(df, target_column, features, models,encoding_method='label',  use_cross_validation=False)

shape (4600, 18)
----------------------------------------------------------------------------------------------------
List of columns
['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country']
----------------------------------------------------------------------------------------------------
Data info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   i

In [21]:
def hyperparameter_tuning(model, param_grid, X_train, X_test, y_train, y_test, scoring):
    # Perform Randomized Search CV
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, scoring=scoring, n_iter=100, cv=5, random_state=42)
    random_search.fit(X_train, y_train)

    # Get the best parameters and model
    best_params = random_search.best_params_
    best_model = random_search.best_estimator_
    best_score = random_search.best_score_

    # Train the tuned model
    best_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = best_model.predict(X_test)

    # Calculate accuracy score
    R2Score = r2_score(y_test, y_pred)

    print(y_pred)

    return best_model, best_params, best_score, R2Score

In [24]:
model = GradientBoostingRegressor()
param_grid = {
    'n_estimators': [10, 50, 100], # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.1, 0.2], # Step size shrinkage used in update to prevent overfitting
    'max_depth': [3, 5, 7], # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],# Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'subsample': [0.8, 1.0],# Subsample ratio of the training instance
    'max_features': [10, 'sqrt', 'log2', None],  # Number of features to consider when looking for the best split
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']  # Loss function to be optimized
}


best_model, best_params, best_score, R2Score = hyperparameter_tuning(model, param_grid, X_train, X_test, y_train, y_test, 'r2')

[ 422650.54563525  326140.85100648  960663.06535009  461171.36076301
  266047.34626919  602814.97774858  485822.48904377  300903.14199751
  474446.91711188  742496.45603347  545697.2599253   510327.04988346
  746365.23071043  334641.62881054  194339.6594697   975985.70435065
  545321.01226314  625010.34468156  942683.03205724  885459.2482177
  771514.48783967  561918.9144771   532351.02239717  455248.60339223
  261433.58761967  173520.68033081  785417.13777847 1075390.23550758
  448294.53466572  821396.66469776 2001225.08844705  324446.63113212
 1187938.67762149  431041.02521127  268909.08069729  429797.89049909
  637597.6448582   624909.19092264  225743.6527048   549917.3470812
  369050.47269898  403711.31619887  382323.5591868   511987.80121876
  255979.85681744  357037.23112158  273780.2905691   680913.25689571
 1129877.03963399  631407.31898573 1389084.05412627  473559.42206964
  481567.39665101  437749.79772771  329436.31092208  437397.53008225
  823870.4033368   606545.01758951  

In [25]:
print(best_score)
print(best_model)
print(R2Score)

0.6388782294362405
GradientBoostingRegressor(learning_rate=0.2, loss='huber', max_depth=5,
                          max_features='sqrt', min_samples_leaf=4,
                          min_samples_split=10, subsample=0.8)
0.04970611353277443
