In [19]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

In [8]:
# Load the housing dataset; the CSV column named 'Unnamed: 0' becomes the
# DataFrame index instead of being treated as a feature.
df = pd.read_csv(
    filepath_or_buffer="../ETL/dataset/data_casa_depto.csv",
    index_col='Unnamed: 0',
)

In [9]:
# Quick visual check of the loaded frame; the notebook renders a truncated
# preview (first and last rows) rather than every record.
df

Unnamed: 0,price,area,bath,room,parking,year,property_type,near_cc,near_school,near_parks,near_avenue,security,elevator,rest_area,pool,ranking
0,120375,45,2,3,1,2023,1,1,0,1,0,0,0,1,0,5.00
1,119277,45,2,3,1,2023,1,1,0,1,0,0,0,1,0,5.00
2,162297,60,2,2,1,2023,1,1,0,1,0,0,0,1,0,5.00
3,157815,60,2,2,1,2023,1,1,0,1,0,0,0,1,0,5.00
4,239998,110,3,3,0,2023,1,1,0,0,0,0,0,0,0,3.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10279,105000,110,2,2,0,2020,2,0,0,0,0,0,0,0,0,1.51
10280,270000,150,3,6,1,1970,2,1,1,1,0,0,0,1,0,1.51
10281,290000,266,4,4,1,2016,2,1,0,0,0,0,0,1,0,3.27
10282,139000,83,2,2,1,2005,2,0,0,1,1,0,0,1,0,2.71


In [10]:
# Separate the regression target from the predictors: `price` is the value to
# predict, and every remaining column is used as an input feature.
y = df['price']
X = df.drop('price', axis=1)

In [5]:
# Hold out 10% of the rows for the final evaluation; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=32,
)

In [15]:
# Fixed hyperparameters for the baseline model: squared-error regression with
# 100 shallow (depth-3) trees and a 0.1 learning rate.
params = dict(
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)

# Build the baseline regressor from the fixed hyperparameters in `params`.
model = xgb.XGBRegressor(**params)

In [16]:
# Train the baseline on the training split and immediately predict prices for
# the held-out rows (fit returns the fitted estimator, so the calls chain).
y_pred = model.fit(X_train, y_train).predict(X_test)

In [17]:
# Baseline error on the hold-out set. RMSE is reported because it is expressed
# in the same units as the target, unlike the raw MSE.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 29175.665470005475


In [18]:
# Coefficient of determination of the baseline predictions on the hold-out set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R^2:", r2)

R^2: 0.7280270479141819


Improving the model: hyperparameter tuning with grid search

In [29]:
# Search space for the tuning step: every combination of these values is a
# candidate (6 depths x 6 learning rates x 4 ensemble sizes = 144 settings).
params = {
    'max_depth': list(range(3, 9)),
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.4, 0.6],
    'n_estimators': list(range(50, 201, 50)),
    'objective': ['reg:squarederror'],
}

In [30]:
# Base estimator left at the library defaults; it is passed to GridSearchCV
# below together with the hyperparameter grid.
xgb_model = xgb.XGBRegressor()

In [31]:
# Exhaustive search over the `params` grid. Each candidate is scored with
# 4-fold cross-validation on negated MSE (higher is better).
grid_search = GridSearchCV(
    xgb_model,
    params,
    scoring='neg_mean_squared_error',
    cv=4,
)

In [32]:
# Run the cross-validated search using the training split only, so the
# hold-out rows stay untouched for the final evaluation.
grid_search.fit(X_train, y_train)

# Keep the winning estimator and its hyperparameters for the cells that follow.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [34]:
# Show the hyperparameter combination selected by the grid search.
best_params

{'learning_rate': 0.1,
 'max_depth': 7,
 'n_estimators': 150,
 'objective': 'reg:squarederror'}

In [35]:
# No second fit is needed here: `grid_search` was created with GridSearchCV's
# default `refit=True`, so `best_estimator_` has already been retrained on all
# of X_train / y_train using the best hyperparameters. Fitting it again would
# only repeat that training. Displaying the estimator yields the same cell
# output that the redundant `fit` call produced.
best_model

In [36]:
# Predict hold-out prices with the tuned model. This overwrites the baseline
# `y_pred`, so the metrics below describe the tuned model only.
y_pred = best_model.predict(X_test)

In [37]:
# Hold-out error of the tuned model, reported as RMSE so it is in the same
# units as the target and comparable with the baseline figure.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 27478.910252811133


In [38]:
# Coefficient of determination of the tuned model on the hold-out set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R^2:", r2)

R^2: 0.7587411878535828
