In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.base import RegressorMixin
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,mean_absolute_error, r2_score

In [2]:
# Load the CSV into a DataFrame; the relative path is resolved against the kernel's working directory
data = pd.read_csv('Preprocessed_data.csv')

In [3]:
# Preview the first ten rows as a quick sanity check of the loaded columns and values
data.head(10)

Unnamed: 0,AREA,INT_SQFT,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE,Age_of_building
0,4,1004,131,1.0,1.0,3,0,1,0,0,2,0,4.0,3.9,4.9,4.33,380000,144400,7600000,58
1,1,1986,26,2.0,1.0,5,0,0,0,0,0,3,4.9,4.2,2.5,3.765,760122,304049,21717770,30
2,0,909,70,1.0,1.0,3,0,1,0,1,0,4,4.1,3.8,2.2,3.09,421094,92114,13159200,33
3,6,1855,14,3.0,2.0,5,2,0,2,2,2,2,4.7,3.9,3.6,4.01,356321,77042,9630290,37
4,4,1226,84,1.0,1.0,3,0,1,2,0,0,1,3.0,2.5,4.1,3.29,237000,74063,7406250,46
5,2,1220,36,2.0,1.0,4,4,0,0,2,1,3,4.5,2.6,3.1,3.32,409027,198316,12394750,16
6,2,1167,137,1.0,1.0,3,4,0,2,0,1,4,3.6,2.1,2.5,2.67,263152,33955,8488790,46
7,6,1847,176,3.0,2.0,5,2,0,0,0,0,5,2.4,4.5,2.1,3.26,604809,235204,16800250,29
8,2,771,175,1.0,1.0,2,1,0,2,2,2,5,2.9,3.7,4.0,3.55,257578,33236,8308970,48
9,6,1635,74,2.0,1.0,4,0,0,2,1,1,2,3.1,3.1,3.3,3.16,323346,121255,8083650,34


In [4]:
# Column the regressors are trained to predict
target = 'SALES_PRICE'

# Labels: the target column on its own
y = data[target]

# Predictors: every remaining column of the frame.
# NOTE(review): REG_FEE and COMMIS stay in the feature set; if they are only
# known once a sale has happened they would leak the target — confirm they are
# available at prediction time.
X = data.drop(columns=target)

# Model Training

In [5]:
def _score_regressor(name, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and return one record of its train / held-out metrics.

    Parameters
    ----------
    name : str
        Label stored under the 'Model' key of the returned record.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_fit, y_fit
        Data the model is trained on; also used for the ``*_Train`` metrics.
    X_eval, y_eval
        Held-out data used for the unsuffixed metrics.

    Returns
    -------
    dict
        Keys: 'Model', 'MAE', 'R² Score', 'MAE_Train', 'R² Score_Train'.
    """
    model.fit(X_fit, y_fit)

    fit_pred = model.predict(X_fit)
    eval_pred = model.predict(X_eval)

    return {
        'Model': name,
        'MAE': mean_absolute_error(y_eval, eval_pred),
        'R² Score': r2_score(y_eval, eval_pred),
        'MAE_Train': mean_absolute_error(y_fit, fit_pred),
        'R² Score_Train': r2_score(y_fit, fit_pred),
    }


# Hold out 20% of the rows for evaluation; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, each with library-default hyperparameters.
# NOTE(review): every model is fitted on the raw, unscaled features and target.
# SVR (kernel-based) and KNN (distance-based) are sensitive to feature scale, so
# their scores here may say more about the missing scaling than about the
# algorithms — consider a StandardScaler pipeline before ruling them out.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Support Vector Regression (SVR)': SVR(),
    'K-Nearest Neighbors (KNN)': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# One metrics record per model, in the dict's insertion order
evaluation_results = [
    _score_regressor(name, model, X_train, y_train, X_test, y_test)
    for name, model in models.items()
]

# Tabulate the records so train vs. held-out scores can be compared side by side
evaluation_df = pd.DataFrame(evaluation_results)

In [6]:
# Display the comparison table: held-out metrics next to the training-set metrics for each model
evaluation_df

Unnamed: 0,Model,MAE,R² Score,MAE_Train,R² Score_Train
0,Linear Regression,1067166.0,0.879424,1041187.0,0.880342
1,Ridge Regression,1067070.0,0.879421,1041075.0,0.880341
2,Lasso Regression,1067165.0,0.879424,1041186.0,0.880342
3,ElasticNet,1168969.0,0.843325,1129509.0,0.844288
4,Support Vector Regression (SVR),2952493.0,-0.019741,2902474.0,-0.022304
5,K-Nearest Neighbors (KNN),1433286.0,0.764481,1168703.0,0.838643
6,Random Forest,517754.0,0.970758,188782.9,0.995878
7,Gradient Boosting,561154.7,0.96433,506242.4,0.970415
8,XGBoost,317394.2,0.987434,78748.0,0.99921


# Hyperparameter Tuning

In [7]:
# Candidate values for each XGBoost hyperparameter; the randomized search
# below samples combinations from these lists.
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 2, 5]
}

# Base estimator handed to the search: squared-error objective, random_state=42
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Evaluate 50 sampled combinations with 5-fold cross-validation on the training
# split, ranking them by negated MAE (scikit-learn maximises scores, so a higher
# value means a smaller error). fit() returns the fitted search object, which
# lets construction and fitting be chained into one statement.
# NOTE(review): n_jobs=-1 parallelises the CV fits; if XGBoost also runs
# multi-threaded per fit this may oversubscribe the CPU — confirm on the target machine.
xgb_random_search = RandomizedSearchCV(
    xgb_model,
    xgb_param_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Report the winning combination and keep the corresponding fitted estimator
print("Best Parameters for XGBoost:", xgb_random_search.best_params_)
xgb_best_model = xgb_random_search.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters for XGBoost: {'subsample': 1.0, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.2, 'gamma': 5, 'colsample_bytree': 0.8}


In [8]:
# Evaluate the tuned XGBoost model on the held-out split.
# NOTE(review): the same X_test/y_test split was already used to compare the
# candidate models, so these figures may be slightly optimistic as a final estimate.
xgb_y_pred = xgb_best_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
# Score the predictions computed above instead of calling .score(), which would
# run a second prediction pass; this also matches how R² is computed for the
# other models in the comparison cell.
xgb_r2 = r2_score(y_test, xgb_y_pred)
print(f"XGBoosting - MAE: {xgb_mae:.2f}, R² Score: {xgb_r2}")

XGBoosting - MAE: 219982.95, R² Score: 0.9939322471618652


# Saving the Best Model and the Encoding Code to .pkl Files

In [9]:
# Persist the tuned XGBoost model to disk. The path is named once so the file
# that is written and the confirmation message cannot drift apart.
# NOTE(review): the .pkl is a pickle — only load it back (joblib.load) from a
# trusted source, and presumably with matching library versions; verify.
MODEL_PATH = 'xgboost_best_model.pkl'
joblib.dump(xgb_best_model, MODEL_PATH)
print(f"Model saved successfully as '{MODEL_PATH}'")

Model saved successfully as 'xgboost_best_model.pkl'
