In [2]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error



Prepare data

In [3]:
#Load Data
df = pd.read_csv('data/data_macro.csv')

#Prepare Features and Target
X = df.drop('resale_price', axis=1)
y = df['resale_price']

#Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [4]:
print(df.shape)

(64573, 109)


In [7]:
# Convert the 'date' column to numeric format (e.g., timestamp)
X_train['date'] = pd.to_datetime(X_train['date']).astype(int) / 10**9
X_test['date'] = pd.to_datetime(X_test['date']).astype(int) / 10**9

### Random Search for decision tree (Hyperparameter Tuning)

# xgboost

In [5]:
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import mean_absolute_error

In [8]:
# Define the parameter distribution for XGBoost
xgb_param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 3, 5]
}

# Initialize RandomizedSearchCV for XGBoost
xgb_search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_distributions=xgb_param_dist,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit the random search on the training data
xgb_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_xgb_params = xgb_search.best_params_
best_xgb_model = xgb_search.best_estimator_

print("Best XGBoost Parameters:", best_xgb_params)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best XGBoost Parameters: {'subsample': 0.8, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}


In [9]:
# Initialize the XGBoost model with the best parameters
xgb_best_model = XGBRegressor(
    subsample=1.0,
    n_estimators=200,
    min_child_weight=5,
    max_depth=15,
    learning_rate=0.1,
    gamma=0.1,
    colsample_bytree=0.6,
    random_state=42
)

# Train the model on the training data
xgb_best_model.fit(X_train, y_train)

# Predict on the test data
y_pred_xgb_best = xgb_best_model.predict(X_test)

# Evaluate the model
print('\nXGBoost (Best Parameters) Performance:')
print(f'MSE: {mean_squared_error(y_test, y_pred_xgb_best):.2f}')
print(f'MAE: {mean_absolute_error(y_test, y_pred_xgb_best):.2f}')
print(f'R²: {r2_score(y_test, y_pred_xgb_best):.2f}')


XGBoost (Best Parameters) Performance:
MSE: 1467324425.22
MAE: 26656.84
R²: 0.95


In [10]:
import joblib

# Save the trained model
joblib.dump(xgb_best_model, 'xgb_best_model.pkl')

['xgb_best_model.pkl']

In [None]:
# XGBoost Model (without hyperparameter tuning)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print('\nXGBoost Performance:')

print(f'MSE: {mean_squared_error(y_test, y_pred_xgb):.2f}')
print(f'MAE: {mean_absolute_error(y_test, y_pred_xgb):.2f}')
print(f'R²: {r2_score(y_test, y_pred_xgb):.2f}')


XGBoost Performance:
MSE: 5945716388.22
MAE: 52330.37
R²: 0.81
