In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import joblib

In [14]:
# Load the engineered features, the PCA components and the target column.
# NOTE(review): the three files are assumed to be row-aligned (same rows, same
# order) — confirm against the notebook that wrote them.
X = pd.read_csv("features_X.csv")
target_y = pd.read_csv("target_y.csv")
pca_df = pd.read_csv("pca_df.csv")
y = target_y["Price"]

# Split both feature sets and the target in ONE call so that every train/test
# partition is built from the same row indices. Alignment no longer depends on
# two separate calls happening to share the same seed and input length, and
# train_test_split raises ValueError right here if the row counts differ.
(
    X_train, X_test,
    X_pca_train, X_pca_test,
    y_train, y_test,
) = train_test_split(X, pca_df, y, test_size=0.2, random_state=42)

In [31]:
# Candidate regressors, keyed by the label that is reported in the results table.
# Both tree ensembles are given a fixed random_state.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
])

# Accumulator for evaluation records: the evaluation cells append one dict per
# (model, feature set) pair.
results = []

In [32]:
# Using Original Features
# Drop any rows left behind by an earlier run of this cell, so that re-running
# it does not append duplicate "Original" entries to `results`.
results[:] = [row for row in results if row["Features"] != "Original"]

for model_name, model in models.items():
    # NOTE: this fits the estimator object stored in `models` in place; any
    # later fit on the same object replaces this one.
    model.fit(X_train, y_train)

    # Predict on the held-out test split
    y_pred = model.predict(X_test)

    # R2 and RMSE on the test split (RMSE = square root of the mean squared error)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Store one record per model for the comparison table
    results.append({
        "Model": model_name,
        "Features": "Original",
        "R2 Score": r2,
        "RMSE": rmse
    })

In [33]:
# Using PCA Features
# Drop any rows left behind by an earlier run of this cell, so that re-running
# it does not append duplicate "PCA" entries to `results`.
results[:] = [row for row in results if row["Features"] != "PCA"]

for model_name, model in models.items():
    # NOTE: this refits the estimator object stored in `models` in place, so a
    # fit made earlier on the original features is overwritten. After this cell
    # `models` holds the PCA-trained estimators.
    model.fit(X_pca_train, y_train)  # Train on PCA data

    # Predict on the held-out PCA test split
    y_pred = model.predict(X_pca_test)

    # R2 and RMSE on the test split (RMSE = square root of the mean squared error)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Store one record per model for the comparison table
    results.append({
        "Model": model_name,
        "Features": "PCA",
        "R2 Score": r2,
        "RMSE": rmse
    })

In [34]:
# Turn the accumulated evaluation records into a single table (one row per
# record) and render it with the notebook's rich display.
results_df = pd.DataFrame(data=results)
display(results_df)

Unnamed: 0,Model,Features,R2 Score,RMSE
0,Linear Regression,Original,0.814692,25277620.0
1,Random Forest,Original,0.994809,4230539.0
2,Gradient Boosting,Original,0.995188,4073276.0
3,Linear Regression,PCA,0.591048,37551280.0
4,Random Forest,PCA,0.957437,12114420.0
5,Gradient Boosting,PCA,0.918305,16783640.0


## Model Comparison

We trained three different regressors using two feature sets:
- The **engineered feature set** (includes manual features like `Price_per_Marla`, `Size_Category`, etc.)
- A **PCA-reduced set** of the top 3 extracted components

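The table above shows each model's performance on the held-out 20% test split, measured by R² (higher is better) and RMSE (lower is better; expressed in the same units as `Price`).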

**Key Takeaways:**
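- **Gradient Boosting** performs best on the engineered features (R² 0.9952, RMSE ≈ 4.07M), narrowly ahead of Random Forest (R² 0.9948, RMSE ≈ 4.23M). On the PCA features, however, **Random Forest** is the strongest model (R² 0.957 vs 0.918 for Gradient Boosting).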
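- Engineered features capture domain knowledge better: every model scores higher on them than on the PCA components (e.g. Linear Regression R² 0.815 vs 0.591). Caveat: if `Price_per_Marla` is computed from the target `Price`, it leaks the target into the features and the engineered-feature scores are optimistic — confirm how it is derived before relying on them.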
