In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
import pandas as pd
import project_libs as libs
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Load the raw CSV through the project helper; the result is used as a pandas
# DataFrame below.
data = libs.read_csv_to_dataframe("data/data.csv")

# Derive the modelling frame without mutating `data`: drop() returns a new
# DataFrame, so `data` keeps every original column and this cell can be re-run
# safely. `columns=` already selects the axis, so no `axis=` argument is needed.
df = data.drop(columns=['date', 'country'])
df.info()

# Columns passed to the pipeline as `features`.
# NOTE(review): presumably these are the categorical columns that
# encoding_method='label' is applied to — confirm against libs.run_ml_pipeline.
features = ['street', 'city', 'statezip']
target_column = 'price'

# One (display name, estimator class, constructor kwargs) entry per model.
# NOTE(review): the tuple layout is inferred from this call site only — confirm
# how libs.run_ml_pipeline consumes it.
models = [
    ('Gradient Boosting', GradientBoostingRegressor, {'n_estimators': 100}),
]

# The pipeline hands back the train/test split (reused by the tuning and
# evaluation cells) together with its own result object.
X_train, X_test, y_train, y_test, result = libs.run_ml_pipeline(
    df,
    target_column,
    features,
    models,
    encoding_method='label',
    use_cross_validation=False,
)

shape (4600, 18)
----------------------------------------------------------------------------------------------------
List of columns
['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country']
----------------------------------------------------------------------------------------------------
Data info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   i

In [3]:
# Seed the estimator so the stochastic parts of gradient boosting (row
# subsampling when subsample < 1.0, feature subsampling when max_features is
# restricted) produce the same models on every Restart & Run All.
# NOTE(review): this only helps if libs.hyperparameter_tuning clones `model`
# and uses deterministic CV folds — confirm.
RANDOM_STATE = 42
model = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Candidate values for each hyperparameter explored by the search.
param_grid = {
    'n_estimators': [10, 50, 100],       # number of boosting stages (trees)
    'learning_rate': [0.01, 0.1, 0.2],   # shrinkage applied to each tree's contribution
    'max_depth': [3, 5, 7],              # maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],     # minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],       # minimum samples required at a leaf node
    'subsample': [0.8, 1.0],             # fraction of training rows used to fit each tree
    # Number of features considered at each split.
    # NOTE(review): the integer 10 assumes X_train has at least 10 columns — confirm.
    'max_features': [10, 'sqrt', 'log2', None],
    # Loss function to optimise.
    # NOTE(review): `alpha` is not part of this grid, so 'quantile' runs with the
    # estimator's default alpha (0.9 in scikit-learn) and targets the 90th
    # percentile rather than the mean; such candidates will score poorly on R².
    # Consider dropping it or adding 'alpha' to the grid.
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
}

# 'r2' is passed as the last positional argument (presumably the scoring
# metric — verify against libs.hyperparameter_tuning).
best_model, best_params, best_score = libs.hyperparameter_tuning(
    model, param_grid, X_train, y_train, 'r2'
)

In [4]:
# Fit the selected estimator on the training split.
# NOTE(review): if libs.hyperparameter_tuning already returns a fitted
# estimator, this refit is redundant — confirm.
best_model.fit(X_train, y_train)

# Score held-out predictions with R² (coefficient of determination). This is a
# regression metric, not a classification accuracy.
y_pred = best_model.predict(X_test)
R2Score = r2_score(y_true=y_test, y_pred=y_pred)

print(R2Score)

0.056501052566035326
