In [3]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [4]:
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score

In [23]:
ds = fetch_california_housing()

In [25]:
df = pd.DataFrame(ds.data , columns = ds.feature_names)
y = pd.DataFrame(ds.target ,  columns = ds.target_names)

In [27]:
Q1 = np.percentile(y, 25)  # 25th percentile
Q3 = np.percentile(y, 75)  # 75th percentile
IQR = Q3 - Q1  # Interquartile Range

# Define outlier limits
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Get boolean mask for outliers
outlier_mask = (y < lower_bound) | (y > upper_bound)

outlier_indices = np.where((y < lower_bound) | (y > upper_bound))[0]

In [29]:
len(outlier_indices)

1071

In [31]:
df_clean = df.drop(index=outlier_indices).reset_index(drop=True)
y_clean = y.drop(index=outlier_indices).reset_index(drop=True)

In [33]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(df_clean , y_clean , test_size=0.3, random_state=42)

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_trian = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [21]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

xgb_model = XGBRegressor(n_estimators=300, learning_rate=0.05, random_state = 42)
xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(x_test)
print("XGBoost R² Score:", r2_score(y_test, y_pred))
print(type(r2_score))  # Should print: <class 'function'>


XGBoost R² Score: -0.5807241201400757
<class 'function'>


In [39]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1],
    'colsample_bytree': [0.8, 0.9, 1],
    'random_state' : [42]
}

xgb_model = XGBRegressor()

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)

# Best parameters
print("Best Parameters: ", grid_search.best_params_)
print("Best R² Score: ", grid_search.best_score_)


Fitting 3 folds for each of 243 candidates, totalling 729 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'random_state': 42, 'subsample': 1}
Best R² Score:  0.808232843875885
0.9486081370449679


In [95]:
len(x_test)

5871

In [40]:
from xgboost import XGBRegressor

xgb_model2 = XGBRegressor(colsample_bytree=0.8, learning_rate= 0.1, max_depth= 7, n_estimators= 300, random_state= 42, subsample= 1)
xgb_model2.fit(x_train, y_train)
xgb_model2_pred = xgb_model2.predict(x_test)

from sklearn.metrics import r2_score
import numpy as np

y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
r2 = r2_score(y_true, y_pred)
print(r2)  # Output should be approximately 0.948

0.9486081370449679


In [51]:
r2_score(y_test, grid_search.predict(x_test))

-0.6453078985214233

In [41]:
from sklearn.tree import DecisionTreeRegressor
tree_model = DecisionTreeRegressor(max_depth=5)  # Tune depth
tree_model.fit(x_train, y_train)
y_pred = tree_model.predict(x_test)
print(r2_score(y_test , y_pred))

-0.016631739294136638


