In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

plt.rc('figure', figsize=(10, 6))

In [3]:
# Load the master dataset
df = pd.read_csv('../data/processed/master_india_properties.csv', low_memory=False)

# Define target and features based on EDA
TARGET = 'Price'

# Features to use for the baseline model
NUMERICAL_FEATURES = ['Area_SqFt', 'Bedrooms', 'Longitude', 'Latitude', 'Total_Floors']
CATEGORICAL_FEATURES = ['City', 'Property_Type', 'Furnishing_Status', 'Area_Type']

# Drop rows where the target or essential features are missing
df.dropna(subset=[TARGET] + NUMERICAL_FEATURES, inplace=True)

# Select the relevant columns
X = df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
y = df[TARGET]

print("Shape of X (features):", X.shape)
print("Shape of y (target):", y.shape)

Shape of X (features): (0, 9)
Shape of y (target): (0,)


In [4]:
# 1. Impute Missing Values
# Impute Total_Floors (the only one with NaNs in our selection) with the median
num_imputer = SimpleImputer(strategy='median')
X[NUMERICAL_FEATURES] = num_imputer.fit_transform(X[NUMERICAL_FEATURES])

# Impute categorical features with 'Missing'
cat_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
X[CATEGORICAL_FEATURES] = cat_imputer.fit_transform(X[CATEGORICAL_FEATURES])


# 2. Log-transform the target variable
y_log = np.log1p(y)


# 3. One-Hot Encode Categorical Features
# This converts columns like 'City' into multiple columns like 'City_Mumbai', 'City_Delhi', etc.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded_cats = ohe.fit_transform(X[CATEGORICAL_FEATURES])

# Create a DataFrame with the new encoded column names
X_encoded_df = pd.DataFrame(X_encoded_cats, columns=ohe.get_feature_names_out(CATEGORICAL_FEATURES))

# Combine numerical features and encoded categorical features
X_final = pd.concat([X[NUMERICAL_FEATURES].reset_index(drop=True), X_encoded_df], axis=1)

print("Final shape of processed features:", X_final.shape)
X_final.head()

ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required by SimpleImputer.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_log, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Initialize the XGBoost model
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,          # We set a high number, but early stopping will find the best one
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1                   # Use all available CPU cores
)

# Train the model
print("Training XGBoost model...")
xgbr.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    early_stopping_rounds=50,   # Stop if the validation score doesn't improve for 50 rounds
    verbose=100                 # Print progress every 100 rounds
)

In [None]:
# Make predictions on the test set
log_predictions = xgbr.predict(X_test)

# IMPORTANT: Transform predictions back to the original scale
predictions = np.expm1(log_predictions)
y_test_original = np.expm1(y_test)

# Calculate metrics
r2 = r2_score(y_test_original, predictions)
mae = mean_absolute_error(y_test_original, predictions)

print(f"\n--- Model Performance ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Absolute Error (MAE): ₹{mae:,.0f}")

# Plot Actual vs. Predicted values
plt.figure(figsize=(10, 10))
plt.scatter(y_test_original, predictions, alpha=0.3)
plt.plot([y_test_original.min(), y_test_original.max()], [y_test_original.min(), y_test_original.max()], '--r', linewidth=2)
plt.title('Actual vs. Predicted Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.xscale('log')
plt.yscale('log')
plt.show()

In [None]:
# Get feature importances
feature_importances = pd.DataFrame({
    'feature': X_final.columns,
    'importance': xgbr.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 20 features
plt.figure(figsize=(12, 8))
sns.barplot(
    x='importance',
    y='feature',
    data=feature_importances.head(20),
    palette='rocket'
)
plt.title('Top 20 Most Important Features')
plt.show()