In [3]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [4]:
# === 1. File paths ===
# Display label -> CSV location for every dataset in the comparison.
# Paths are relative, so they resolve against the notebook's working directory.
datasets = dict([
    # Imputed variants of the integrated dataset
    ("KNN Imputation", "../integration/imputed_datasets/imputed_knn.csv"),
    ("Mean Imputation", "../integration/imputed_datasets/imputed_mean.csv"),
    ("Median Imputation", "../integration/imputed_datasets/imputed_median.csv"),
    ("Mode Imputation", "../integration/imputed_datasets/imputed_mode.csv"),
    # Individual source datasets
    ("Air Quality", "../dummy_price/air_with_price.csv"),
    ("Noise Quality", "../dummy_price/noise_with_price.csv"),
    ("Real Estate", "../preprocessed_data/bangalore/real_estate_data_bangalore.csv"),
    ("Indian Housing Prices", "../preprocessed_data/bangalore/india_housing_prices_bangalore_cleaned.csv"),
])

In [5]:
# === 2. Models to compare ===
def _xgb_gpu_params():
    """Return the XGBoost keyword arguments that select GPU training.

    XGBoost 2.0 introduced the ``device`` parameter and deprecated the
    ``gpu_hist`` tree method and the ``predictor`` argument. Choosing the
    spelling from the installed major version keeps older installs working
    exactly as before while avoiding the deprecated names on newer ones.
    """
    major_version = int(xgb.__version__.split(".")[0])
    if major_version >= 2:
        return {"tree_method": "hist", "device": "cuda"}
    return {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}


# Display label -> unfitted estimator. All estimators share random_state=42
# so repeated runs are comparable.
models = {
    "Hist Gradient Boosting": HistGradientBoostingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42),

    # NOTE(review): this entry requires a CUDA-capable GPU and a GPU-enabled
    # XGBoost build — confirm for the target environment.
    "XGBoost (GPU)": xgb.XGBRegressor(
        n_estimators=500,
        random_state=42,
        verbosity=0,
        **_xgb_gpu_params(),
    ),
}

In [6]:
# === 3. Function to encode categorical columns ===
def encode_categoricals(df):
    """Label-encode every object-dtype column of ``df``.

    Each object column is cast to ``str`` and replaced by the integer codes
    produced by its own ``LabelEncoder``. Because the cast happens first,
    missing values are stringified with everything else and receive their own
    integer code.

    The input frame is not modified; encoding is applied to a copy so that
    re-running a cell never sees a half-processed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The encoded copy of ``df`` and a ``{column name: fitted LabelEncoder}``
        mapping for the encoded columns.
    """
    encoded = df.copy()
    label_encoders = {}
    for col in encoded.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col].astype(str))
        label_encoders[col] = le
    return encoded, label_encoders

In [7]:
def fill_missing(df):
    """Return a copy of ``df`` with missing values filled column by column.

    Numeric columns are filled with their median; every other column is
    filled with the string ``"missing"``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df``.
    """
    filled = df.copy()
    for col in filled.columns:
        if pd.api.types.is_numeric_dtype(filled[col]):
            values = pd.to_numeric(filled[col], errors="coerce")
            median = values.median()
            # An all-NaN column has a NaN median, which would leave every gap
            # unfilled; fall back to 0 so no NaN survives.
            if pd.isna(median):
                median = 0
            filled[col] = values.fillna(median)
        else:
            filled[col] = filled[col].fillna("missing")
    return filled


In [8]:
# === 4. Function to train & evaluate ===
def calculate_regression_accuracy(y_true, y_pred, tolerance=0.1):
    """Percentage of predictions whose relative error is within ``tolerance``.

    A prediction counts as accurate when ``|y_true - y_pred| / |y_true|`` is
    at most ``tolerance``. Where the target is exactly zero the relative error
    is undefined, so such a prediction counts as accurate only if it is also
    exactly zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, arrays or Series). Both are
        flattened and compared by position; any pandas index is ignored.
    tolerance : float, default 0.1
        Allowed relative error as a fraction (0.1 = 10%).

    Returns
    -------
    float
        Accuracy in percent (0-100), or NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        return float("nan")

    abs_error = np.abs(actual - predicted)
    scale = np.abs(actual)
    # Division by a zero target yields inf (counted as inaccurate) or NaN for
    # 0/0; silence the warnings and resolve the 0/0 case explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        relative_error = abs_error / scale
    exact_on_zero_target = (scale == 0) & (abs_error == 0)
    relative_error = np.where(exact_on_zero_target, 0.0, relative_error)

    return float(np.mean(relative_error <= tolerance) * 100)

def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for the two splits.
    y_train, y_test : matching target values.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2, accuracy, elapsed)`` where ``accuracy`` is the
        percentage of test predictions within 10% of the true value and
        ``elapsed`` is the time in seconds spent in ``model.fit`` only.
    """
    # Time the fit alone so the figure really is a training time, and use
    # perf_counter because it is monotonic (unaffected by system clock changes).
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - fit_start

    preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    accuracy = calculate_regression_accuracy(y_test, preds, tolerance=0.1)  # 10% tolerance
    return rmse, mae, r2, accuracy, elapsed

In [9]:
# === 6. Loop through datasets ===
# One record (dict) per dataset/model pair; turned into a DataFrame afterwards.
results = []

for name, path in datasets.items():
    print(f"\n=== Training on {name} ===")
    df = pd.read_csv(path)

    # Some files name the target "Price_in_Lakhs"; normalise it to "Price".
    if "Price" not in df.columns and "Price_in_Lakhs" in df.columns:
        df = df.rename(columns={"Price_in_Lakhs": "Price"})

    if "Price" not in df.columns:
        raise ValueError(f"'Price' column not found in {path}. Please check the dataset.")

    # Split off the target BEFORE preprocessing so it is never label-encoded
    # or median-imputed: a row without a usable price carries no supervision
    # signal and is dropped instead of being given a made-up target.
    y = pd.to_numeric(df["Price"], errors="coerce")
    has_target = y.notna()
    if not has_target.any():
        raise ValueError(f"No usable numeric 'Price' values found in {path}.")
    y = y[has_target]
    X = df.loc[has_target].drop(columns=["Price"])

    # Encode categoricals (features only)
    X, _ = encode_categoricals(X)

    # Fill missing values (features only)
    X = fill_missing(X)

    # Memory optimization: halve the width of 64-bit columns. Integer columns
    # are only narrowed when every value fits in int32, because an
    # out-of-range cast would silently wrap around.
    float_cols = X.select_dtypes(include=["float64"]).columns
    int32_limits = np.iinfo(np.int32)
    int_cols = [
        col
        for col in X.select_dtypes(include=["int64"]).columns
        if X[col].min() >= int32_limits.min and X[col].max() <= int32_limits.max
    ]
    X = X.astype({
        **{col: "float32" for col in float_cols},
        **{col: "int32" for col in int_cols},
    })

    # Train/test split (fixed seed so every model sees the same split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train models
    for model_name, model in models.items():
        print(f" -> Training {model_name}...")
        rmse, mae, r2, accuracy, elapsed = train_and_evaluate(
            X_train, X_test, y_train, y_test, model
        )
        results.append({
            "Dataset": name,
            "Model": model_name,
            "RMSE": rmse,
            "MAE": mae,
            "R²": r2,
            "Accuracy (%)": accuracy,
            "Train Time (s)": elapsed
        })


=== Training on KNN Imputation ===
 -> Training Hist Gradient Boosting...
 -> Training Hist Gradient Boosting...
 -> Training Random Forest...
 -> Training Random Forest...
 -> Training XGBoost (GPU)...
 -> Training XGBoost (GPU)...

=== Training on Mean Imputation ===

=== Training on Mean Imputation ===
 -> Training Hist Gradient Boosting...
 -> Training Hist Gradient Boosting...
 -> Training Random Forest...
 -> Training Random Forest...
 -> Training XGBoost (GPU)...
 -> Training XGBoost (GPU)...

=== Training on Median Imputation ===

=== Training on Median Imputation ===
 -> Training Hist Gradient Boosting...
 -> Training Hist Gradient Boosting...
 -> Training Random Forest...
 -> Training Random Forest...
 -> Training XGBoost (GPU)...
 -> Training XGBoost (GPU)...

=== Training on Mode Imputation ===

=== Training on Mode Imputation ===
 -> Training Hist Gradient Boosting...
 -> Training Hist Gradient Boosting...
 -> Training Random Forest...
 -> Training Random Forest...
 -> Tr

In [10]:
# === 5. Results table ===
# Build the comparison table from the collected records and order it so that,
# within each dataset, the model with the lowest RMSE comes first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by=["Dataset", "RMSE"])
)
print("\n=== Final Results ===")
display(results_df)


=== Final Results ===


Unnamed: 0,Dataset,Model,RMSE,MAE,R²,Accuracy (%),Train Time (s)
13,Air Quality,Random Forest,63.599287,47.257858,0.612469,18.393782,1.539005
14,Air Quality,XGBoost (GPU),66.556653,49.506321,0.575591,17.357513,2.133077
12,Air Quality,Hist Gradient Boosting,67.392328,49.669319,0.564867,17.357513,0.392198
22,Indian Housing Prices,Random Forest,11.001242,8.728136,0.993825,85.88661,2.471357
21,Indian Housing Prices,Hist Gradient Boosting,11.182524,8.915664,0.99362,86.731001,0.441764
23,Indian Housing Prices,XGBoost (GPU),12.208667,9.874151,0.992395,83.112183,2.283739
1,KNN Imputation,Random Forest,12.985501,0.424155,0.998167,67.484806,6.625669
2,KNN Imputation,XGBoost (GPU),33.677842,4.348531,0.987672,64.492754,4.165484
0,KNN Imputation,Hist Gradient Boosting,40.518554,7.247993,0.982155,61.804582,0.938063
4,Mean Imputation,Random Forest,12.985501,0.424155,0.998167,67.484806,6.361508


In [11]:
# Persist the comparison table for later use; the row index is not written.
results_df.to_csv(
    path_or_buf="model_comparison_results.csv",
    index=False,
)