# 1. Imports & Configuration

import pandas as pd: Imports the pandas library for data manipulation and analysis, aliased as pd. Pandas is essential for working with DataFrames, which are tabular data structures.

import numpy as np: Imports the NumPy library for numerical operations, aliased as np. NumPy provides support for arrays, matrices, and mathematical functions.

import matplotlib.pyplot as plt: Imports the pyplot module from the Matplotlib library for creating static, interactive, and animated visualizations in Python, aliased as plt.

import seaborn as sns: Imports the Seaborn library for statistical data visualization, aliased as sns. Seaborn is built on top of Matplotlib and provides a high-level interface for creating attractive and informative statistical graphics.

from statsmodels.stats.outliers_influence import variance_inflation_factor: Imports the variance_inflation_factor function from the statsmodels library. This function is used to detect multicollinearity in regression analysis.

from ydata_profiling import ProfileReport: Imports the ProfileReport class from the ydata_profiling library. This is used for generating detailed reports on the dataset, including descriptive statistics, data quality issues, and visualizations.

from sklearn.compose import ColumnTransformer: Imports ColumnTransformer for applying different preprocessing steps to different columns of the DataFrame.

from sklearn.pipeline import Pipeline: Imports Pipeline for creating a sequence of data processing steps.

from sklearn.impute import SimpleImputer: Imports SimpleImputer for handling missing values in the dataset (e.g., replacing them with the mean or median).

from sklearn.preprocessing import StandardScaler, OneHotEncoder: Imports StandardScaler for standardizing numerical features (mean=0, variance=1) and OneHotEncoder for converting categorical features into a numerical format using one-hot encoding.

from sklearn.linear_model import LinearRegression: Imports the LinearRegression model for linear regression analysis.

from sklearn.ensemble import RandomForestRegressor: Imports the RandomForestRegressor model, an ensemble learning method that operates by constructing a multitude of decision trees.

from xgboost import XGBRegressor: Imports the XGBRegressor model, an implementation of gradient boosted decision trees designed for speed and performance.

from sklearn.model_selection import train_test_split, RandomizedSearchCV: Imports train_test_split for splitting the dataset into training and testing sets, and RandomizedSearchCV for hyperparameter tuning using randomized search with cross-validation.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score: Imports metrics for evaluating the performance of regression models: Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R²).

import joblib: Imports the joblib library for saving and loading models.

from scipy.stats import randint, uniform: Imports randint for generating random integers and uniform for generating random floating-point numbers, used for hyperparameter tuning.

import os: Imports the os module for interacting with the operating system, used here for creating directories.

plt.style.use('seaborn-v0_8-whitegrid'): Sets the style of Matplotlib plots to 'seaborn-v0_8-whitegrid', which provides a clean, visually appealing style with a white grid.

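np.random.seed(42): Sets the seed of NumPy's global random number generator to 42, so results that draw on that generator are reproducible. The train/test split, the randomized searches, and the tree-based models below are seeded separately through their own random_state=42 arguments.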

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from ydata_profiling import ProfileReport
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from scipy.stats import randint, uniform
import os

# Global plotting style shared by every figure produced below.
plt.style.use('seaborn-v0_8-whitegrid')
# Seeds NumPy's global RNG only; estimators and the data split further down
# receive their own random_state arguments.
np.random.seed(42)


# 2. Data Loading & Validation

try...except block: This is used to handle potential errors when loading the dataset. If the file 'Data.csv' is not found, it prints an error message and exits the script.

df = pd.read_csv('Data.csv'): Reads the CSV file named 'Data.csv' into a pandas DataFrame named df.

print("✅ Dataset loaded successfully"): Prints a success message if the file is loaded without errors.

print(f"📊 Dataset shape: {df.shape}\n"): Prints the shape (number of rows and columns) of the DataFrame.

required_columns: Defines a list of column names that are expected to be present in the dataset.

missing_columns = set(required_columns) - set(df.columns): Checks if any of the required columns are missing from the DataFrame.

if missing_columns:: If there are missing columns, it prints an error message and exits.

binary_cols: Defines a list of columns that should contain binary values ('yes' or 'no').

for col in binary_cols:: Iterates through each column in binary_cols.

invalid = ~df[col].isin(['yes', 'no']): Checks for invalid values (anything other than 'yes' or 'no') in the binary columns.

if invalid.any():: If invalid values are found, it prints an error message and exits.

In [3]:
# Load the dataset. Only the read itself is guarded so that an unrelated
# failure in the success messages cannot be mistaken for a missing file.
try:
    df = pd.read_csv('Data.csv')
except FileNotFoundError:
    print("❌ Error: 'Data.csv' not found in current directory")
    # SystemExit(1) instead of the site-provided exit(): it is always
    # available and reports failure through a non-zero exit status.
    raise SystemExit(1) from None
else:
    print("✅ Dataset loaded successfully")
    print(f"📊 Dataset shape: {df.shape}\n")

# Validate required columns
required_columns = [
    'price', 'area', 'bedrooms', 'bathrooms', 'stories',
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus'
]
missing_columns = set(required_columns) - set(df.columns)
if missing_columns:
    print(f"❌ Missing required columns: {missing_columns}")
    raise SystemExit(1)

# Validate binary columns: every value must be exactly 'yes' or 'no'
# (missing values fail the isin() test and are therefore reported too).
binary_cols = [
    'mainroad', 'guestroom', 'basement',
    'hotwaterheating', 'airconditioning', 'prefarea'
]
for col in binary_cols:
    invalid = ~df[col].isin(['yes', 'no'])
    if invalid.any():
        print(f"❌ Invalid values in {col}: {df[col][invalid].unique()}")
        raise SystemExit(1)

✅ Dataset loaded successfully
📊 Dataset shape: (545, 13)



# 3. Generate EDA Report

df_eda = df.copy(): Creates a copy of the original DataFrame for EDA to avoid modifying the original data.

categorical_cols: Defines a list of categorical columns.

for col in categorical_cols:: Converts the specified columns to lowercase strings and then to the 'category' data type.

profile = ProfileReport(...): Generates a standard EDA report using ydata_profiling.

title: Sets the title of the report.

explorative=True: Enables explorative mode for more detailed analysis.

config_file=None: Uses default configuration.

profile.to_file("House_Price_EDA.html"): Saves the generated report to an HTML file.

os.makedirs('eda_plots', exist_ok=True): Creates a directory named 'eda_plots' if it doesn't exist. exist_ok=True prevents an error if the directory already exists.

html_content: Initializes an HTML string to store the content for a custom categorical analysis report.

create_percent_table(series): Defines a function to create a DataFrame with counts and percentages of each category in a given pandas Series.

for col in categorical_cols:: Iterates through each categorical column to create visualizations.

Adds HTML content for each column's distribution table and visualization.

sns.countplot(): Creates a count plot for binary features, showing the counts of each category.

sns.barplot(): Creates a bar plot for multi-class features, showing the counts of each category.

plt.savefig(): Saves the generated plot as a PNG image in the 'eda_plots' directory.

plt.close(): Closes the current Matplotlib figure to free up memory.

with open("Categorical_Analysis.html", "w") as f:: Opens a file named "Categorical_Analysis.html" in write mode and writes the html_content to it.



In [4]:
print("\n🔍 Generating EDA reports...")
# Work on a copy so the dtype changes below never touch the modelling frame.
df_eda = df.copy()

# Normalise the categorical columns to lowercase text, then store them with
# pandas' 'category' dtype.
categorical_cols = ['furnishingstatus', *binary_cols]
for cat_col in categorical_cols:
    lowered = df_eda[cat_col].astype(str).str.lower()
    df_eda[cat_col] = lowered.astype('category')

# Standard ydata-profiling report, written straight to disk.
profile = ProfileReport(
    df_eda, title="House Price Analysis", explorative=True, config_file=None
)
profile.to_file("House_Price_EDA.html")

# Custom categorical analysis: output folder plus the opening of the HTML page.
print("📊 Generating enhanced categorical visualizations...")
os.makedirs('eda_plots', exist_ok=True)

html_content = (
    "<html>\n"
    "<head><title>Enhanced Categorical Analysis</title></head>\n"
    '<body style="font-family: Arial; padding: 20px;">\n'
    "<h1>Enhanced Categorical Analysis</h1>\n"
)

def create_percent_table(series):
    """Tabulate how often each distinct value of *series* occurs.

    Missing values are counted as their own category. The returned DataFrame
    is indexed by value and has two columns: 'Count' (absolute frequency) and
    'Percentage (%)' (share of all rows, rounded to one decimal place).
    """
    frequency = series.value_counts(dropna=False)
    share = series.value_counts(normalize=True, dropna=False)
    share_pct = (share * 100).round(1)
    return pd.DataFrame({'Count': frequency, 'Percentage (%)': share_pct})

# Build one report section (distribution table + saved plot) per categorical
# column. Fragments are collected in a list and joined once at the end.
sections = []
for col in categorical_cols:
    sections.append("<div style='margin: 40px 0; border-top: 2px solid #eee; padding: 20px;'>")
    sections.append(f"<h2>{col.title()}</h2>")

    # Value counts table
    sections.append("<h3>Distribution</h3>")
    sections.append(create_percent_table(df_eda[col]).to_html(classes='data-table', border=0))

    # Visualization
    plt.figure(figsize=(12, 6))

    if df_eda[col].nunique() == 2:  # Binary features
        ax = sns.countplot(x=df_eda[col], order=df_eda[col].value_counts().index)
        plt.title(f"{col.title()} Distribution", fontsize=14)
        plt.xlabel('')

        # Add percentages on bars (share of all rows in the column)
        total = len(df_eda[col])
        for p in ax.patches:
            percentage = f'{100 * p.get_height()/total:.1f}%'
            ax.annotate(percentage,
                        (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='center',
                        xytext=(0, 5),
                        textcoords='offset points')
    else:  # Multi-class features
        plot_data = df_eda[col].value_counts().reset_index()
        plot_data.columns = ['Category', 'Count']

        ax = sns.barplot(x='Count', y='Category', data=plot_data, palette='Blues_d')
        plt.title(f"{col.title()} Distribution", fontsize=14)
        plt.xlabel('Count')
        plt.ylabel('')

        # Add values on bars, placed just past the end of each bar
        for p in ax.patches:
            width = p.get_width()
            ax.text(width + 5, p.get_y() + p.get_height()/2,
                    f'{int(width)}',
                    ha='left', va='center')

    plt.tight_layout()
    img_path = f'eda_plots/{col}_distribution.png'
    plt.savefig(img_path, bbox_inches='tight', dpi=100)
    plt.close()  # release the figure before the next column is drawn

    # The image is referenced by relative path, so the HTML file must stay
    # next to the 'eda_plots' directory.
    sections.append(f"<img src='{img_path}' style='max-width: 800px; margin: 20px 0;'>")
    sections.append("</div>")

html_content += "".join(sections)

# Table styling is emitted before </body> so that nothing trails the closing
# body tag (the original appended it after </body>).
html_content += """<style>
.data-table {
    width: auto !important;
    margin: 15px 0;
    border-collapse: collapse;
}
.data-table th, .data-table td {
    padding: 8px 12px;
    border: 1px solid #ddd;
}
.data-table th {
    background-color: #f8f9fa;
}
</style>
</body>
</html>"""

# Explicit UTF-8 keeps the write independent of the platform's default
# encoding should a category label contain non-ASCII characters.
with open("Categorical_Analysis.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("✅ Generated two reports:")
print("- House_Price_EDA.html (Standard ydata Profiling)")
print("- Categorical_Analysis.html (Custom Visualizations)")


🔍 Generating EDA reports...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

📊 Generating enhanced categorical visualizations...
✅ Generated two reports:
- House_Price_EDA.html (Standard ydata Profiling)
- Categorical_Analysis.html (Custom Visualizations)


# 4. Data Preprocessing

df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0}).astype(int): Converts the binary columns ('yes'/'no') to numerical (1/0) format.

numeric_features: Defines a list of numerical features.

categorical_cols: Defines a list of categorical features (only 'furnishingstatus' in this case).

preprocessor = ColumnTransformer(...): Creates a ColumnTransformer to apply different preprocessing steps to numerical and categorical features.

('num', Pipeline(...), numeric_features): Defines a pipeline for numerical features:

SimpleImputer(strategy='median'): Imputes missing values using the median.

StandardScaler(): Standardizes numerical features by removing the mean and scaling to unit variance.

('cat', OneHotEncoder(...), categorical_cols): Applies one-hot encoding to categorical features.

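handle_unknown='ignore': Encodes categories that were not seen during fitting as all zeros instead of raising an error. Note that, combined with drop='first', an unknown category is encoded exactly like the dropped (first) category.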

drop='first': Drops the first category to avoid multicollinearity (dummy variable trap).

remainder='passthrough': Keeps the remaining columns (those not specified in 'num' or 'cat') as they are.

In [5]:
# Convert binary features to 0/1
# Map the yes/no flags onto 1/0 integers in the modelling frame.
df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0}).astype(int)

numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories']
categorical_cols = ['furnishingstatus']

# Numeric columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical column: one-hot encoding with the first level dropped.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')

# Columns named in neither list are forwarded untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_encoder, categorical_cols),
    ],
    remainder='passthrough',
)

# 5. Model Training & Evaluation

X = df.drop('price', axis=1): Creates the feature matrix X by dropping the 'price' column (the target variable).

y = df['price']: Creates the target variable vector y containing the house prices.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42): Splits the data into training and testing sets using an 80/20 split and a random state for reproducibility.

models: Defines a dictionary containing the models to be trained and evaluated. Each model has a name, a model object, and a dictionary of hyperparameters to tune.

LinearRegression: A simple linear regression model.

RandomForestRegressor: A random forest regressor with hyperparameters for the number of trees (n_estimators), maximum depth of trees (max_depth), and minimum samples required to split a node (min_samples_split).

XGBRegressor: An XGBoost regressor with hyperparameters for the number of trees, maximum depth, and learning rate.

results: An empty dictionary to store the results of each model.

best_model = None: Initializes a variable to store the best model found.

best_error_ratio = float('inf'): Initializes a variable to store the best error ratio (initialized to infinity).

for name, config in models.items():: Iterates through each model in the models dictionary.

pipeline = Pipeline(...): Creates a pipeline that first applies the preprocessor and then trains the specified regressor.

search = RandomizedSearchCV(...): Performs a randomized search for hyperparameter tuning using cross-validation.

n_iter=20: Specifies the number of parameter settings that are sampled.

cv=5: Uses 5-fold cross-validation.

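scoring='neg_mean_absolute_error': Uses negative MAE as the scoring metric. scikit-learn always maximizes the score, so the MAE is negated: a higher score (closer to zero) means a lower error and is better.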

n_jobs=-1: Uses all available CPU cores for parallel processing.

random_state=42: Sets the random seed for reproducibility.

search.fit(X_train, y_train): Fits the RandomizedSearchCV object to the training data.

best_estimator = search.best_estimator_: Stores the best-performing model found during the search.

y_train_pred = best_estimator.predict(X_train): Predicts on the training set using the best model.

y_test_pred = best_estimator.predict(X_test): Predicts on the test set using the best model.

results[name] = {...}: Stores the best estimator, best parameters, and evaluation metrics for both the training and test sets in the results dictionary.

R²: R-squared, a measure of how well the model fits the data (higher is better).

MAE: Mean Absolute Error, the average absolute difference between the predicted and actual values (lower is better).

RMSE: Root Mean Squared Error, the square root of the average squared difference between the predicted and actual values (lower is better).

Error Ratio: The sum of absolute errors divided by the sum of actual values, providing a relative measure of error (lower is better).

if results[name]['Test Metrics']['Error Ratio'] < best_error_ratio:: Checks if the current model's error ratio on the test set is lower than the best error ratio found so far.

If it is, updates best_error_ratio and best_model.

print("\n📊 Final Model Evaluation Metrics:"): Prints a header for the results.

for model, data in results.items():: Iterates through the results of each model and prints the best parameters and evaluation metrics.

best_model.fit(X, y): Retrains the best model on the entire dataset (X, y).

print(f"\n🎯 Best Model: {type(best_model.named_steps['regressor']).__name__}"): Prints the name of the best model.

In [6]:
# Separate the target from the predictors, then hold out 20% of rows for testing.
y = df['price']
X = df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search spaces; the 'regressor__' prefix addresses the pipeline step that
# carries the estimator.
random_forest_space = {
    'regressor__n_estimators': randint(100, 500),
    'regressor__max_depth': randint(2, 20),
    'regressor__min_samples_split': randint(2, 20),
}
xgboost_space = {
    'regressor__n_estimators': randint(100, 500),
    'regressor__max_depth': randint(3, 10),
    'regressor__learning_rate': uniform(0.01, 0.3),
}

# Candidate models, each paired with the distributions to sample from.
models = {
    'LinearRegression': {'model': LinearRegression(), 'params': {}},
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': random_forest_space,
    },
    'XGBoost': {
        'model': XGBRegressor(random_state=42),
        'params': xgboost_space,
    },
}

def _regression_metrics(y_true, y_pred):
    """Return R², MAE, RMSE and the error ratio for one set of predictions.

    The error ratio is the sum of absolute errors divided by the sum of the
    true values.
    """
    return {
        'R²': r2_score(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'Error Ratio': np.sum(np.abs(y_true - y_pred)) / np.sum(y_true),
    }


results = {}
best_model = None
best_error_ratio = float('inf')

for name, config in models.items():
    print(f"\n⚙️ Tuning {name}...")
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', config['model'])
    ])

    # An empty search space holds exactly one candidate; asking for 20
    # samples from it only makes scikit-learn warn and fall back to one.
    n_iter = 20 if config['params'] else 1

    search = RandomizedSearchCV(
        pipeline,
        config['params'],
        n_iter=n_iter,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)

    # Store best estimator and metrics
    best_estimator = search.best_estimator_
    y_train_pred = best_estimator.predict(X_train)
    y_test_pred = best_estimator.predict(X_test)

    results[name] = {
        'Best Estimator': best_estimator,
        'Best Params': search.best_params_,
        'Train Metrics': _regression_metrics(y_train, y_train_pred),
        'Test Metrics': _regression_metrics(y_test, y_test_pred),
    }

    # NOTE(review): the winner is chosen by its error ratio on the held-out
    # test set, so the reported test metrics of the chosen model are
    # optimistically biased. Consider ranking by search.best_score_ instead.
    if results[name]['Test Metrics']['Error Ratio'] < best_error_ratio:
        best_error_ratio = results[name]['Test Metrics']['Error Ratio']
        best_model = best_estimator

# Report each model's tuned parameters followed by its train and test metrics.
print("\n📊 Final Model Evaluation Metrics:")
for model_name, summary in results.items():
    print(f"\n=== {model_name} ===")
    print(f"Best Parameters: {summary['Best Params']}")
    for heading, metrics_key in (("Training Set", 'Train Metrics'),
                                 ("Test Set", 'Test Metrics')):
        scores = summary[metrics_key]
        print(f"\n{heading}:")
        print(f"R²: {scores['R²']:.3f} | MAE: {scores['MAE']:,.2f}")
        print(f"RMSE: {scores['RMSE']:,.2f} | Error Ratio: {scores['Error Ratio']:.4f}")

# Refit the winning pipeline on every available row (train + test).
best_model.fit(X, y)
winner_cls = type(best_model.named_steps['regressor'])
print(f"\n🎯 Best Model: {winner_cls.__name__}")


⚙️ Tuning LinearRegression...





⚙️ Tuning RandomForest...

⚙️ Tuning XGBoost...

📊 Final Model Evaluation Metrics:

=== LinearRegression ===
Best Parameters: {}

Training Set:
R²: 0.686 | MAE: 719,242.89
RMSE: 984,051.92 | Error Ratio: 0.1528

Test Set:
R²: 0.653 | MAE: 970,043.40
RMSE: 1,324,506.96 | Error Ratio: 0.1937

=== RandomForest ===
Best Parameters: {'regressor__max_depth': 13, 'regressor__min_samples_split': 7, 'regressor__n_estimators': 485}

Training Set:
R²: 0.880 | MAE: 426,009.21
RMSE: 607,401.47 | Error Ratio: 0.0905

Test Set:
R²: 0.599 | MAE: 1,041,018.30
RMSE: 1,423,643.49 | Error Ratio: 0.2079

=== XGBoost ===
Best Parameters: {'regressor__learning_rate': 0.013979488347959958, 'regressor__max_depth': 3, 'regressor__n_estimators': 415}

Training Set:
R²: 0.807 | MAE: 555,798.79
RMSE: 771,961.38 | Error Ratio: 0.1181

Test Set:
R²: 0.648 | MAE: 962,949.88
RMSE: 1,333,239.67 | Error Ratio: 0.1923

🎯 Best Model: XGBRegressor


# 6. Feature Importance Analysis

plot_feature_importance(model): Defines a function to visualize feature importance.

preprocessor = model.named_steps['preprocessor']: Extracts the preprocessor from the trained pipeline.

feature_names = preprocessor.get_feature_names_out(): Gets the names of the features after preprocessing.

regressor = model.named_steps['regressor']: Extracts the regressor (model) from the pipeline.

if hasattr(regressor, 'feature_importances_'):: Checks if the regressor has a feature_importances_ attribute (common in tree-based models).

If it does, it gets the feature importances from this attribute.

elif hasattr(regressor, 'coef_'):: Checks if the regressor has a coef_ attribute (common in linear models).

If it does, it uses the coefficients as a measure of importance.

else:: If neither attribute is found, it prints a warning and returns.

indices = np.argsort(importances)[-10:]: Gets the indices of the top 10 most important features.

plt.figure(figsize=(10, 6)): Creates a Matplotlib figure with a specified size.

plt.title('Top 10 Feature Importances'): Sets the title of the plot.

plt.barh(...): Creates a horizontal bar plot of the feature importances.

clean_names = [name.split('__')[-1] for name in feature_names[indices]]: Cleans the feature names by removing any prefixes added by the preprocessor (e.g., "num__" or "cat__").

plt.yticks(...): Sets the y-axis tick labels to the cleaned feature names.

plt.xlabel('Relative Importance'): Sets the x-axis label.

plt.tight_layout(): Adjusts the plot layout to prevent labels from overlapping.

plt.savefig('feature_importance.png'): Saves the plot as a PNG image.

plt.close(): Closes the Matplotlib figure.

print("✅ Feature importance plot generated successfully"): Prints a success message.

except Exception as e:: Handles any errors during the process and prints an error message.

plot_feature_importance(best_model): Calls the function to generate the feature importance plot for the best model.

In [7]:
def plot_feature_importance(model):
    """Plot the ten most influential features of a fitted pipeline.

    Args:
        model: Fitted pipeline exposing ``named_steps`` with a
            'preprocessor' step (providing ``get_feature_names_out()``) and
            a 'regressor' step (providing ``feature_importances_`` or
            ``coef_``).

    Features are ranked by the absolute value of their importance, so a
    large negative linear coefficient counts as influential; the bars show
    the signed values. The figure is saved to 'feature_importance.png'.
    Any failure is reported on stdout instead of being raised.
    """
    try:
        # Get preprocessor from the trained pipeline
        fitted_preprocessor = model.named_steps['preprocessor']
        feature_names = np.asarray(fitted_preprocessor.get_feature_names_out())

        # Get importance values
        regressor = model.named_steps['regressor']
        if hasattr(regressor, 'feature_importances_'):
            importances = regressor.feature_importances_
        elif hasattr(regressor, 'coef_'):
            importances = regressor.coef_
        else:
            print("⚠️ Model doesn't support feature importance analysis")
            return
        # Work on a flat float array regardless of how the estimator
        # stores its values.
        importances = np.ravel(np.asarray(importances, dtype=float))

        # Rank by magnitude: a plain argsort on signed coefficients would
        # push the most negative (often most influential) ones out of the
        # top ten.
        indices = np.argsort(np.abs(importances))[-10:]
        plt.figure(figsize=(10, 6))
        plt.title('Top 10 Feature Importances')
        plt.barh(range(len(indices)), importances[indices], align='center')

        # Clean feature names by removing transformer prefixes such as 'num__'
        clean_names = [name.split('__')[-1] for name in feature_names[indices]]
        plt.yticks(range(len(indices)), clean_names)
        plt.xlabel('Relative Importance')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()
        print("✅ Feature importance plot generated successfully")
    except Exception as e:
        # Deliberately best-effort: a failed plot must not abort the run.
        print(f"⚠️ Error generating feature importance: {str(e)}")

# Plot feature importances for the pipeline selected as best_model.
print("\n🔍 Feature Importance Analysis")
plot_feature_importance(best_model)


🔍 Feature Importance Analysis
✅ Feature importance plot generated successfully


# 7. Collinearity Check

check_collinearity(model): Defines a function to check for multicollinearity among numerical features using Variance Inflation Factor (VIF).

preprocessor = model.named_steps['preprocessor']: Extracts the preprocessor from the pipeline.

X_processed = preprocessor.transform(X_train): Applies the preprocessing steps to the training data.

vif_data = pd.DataFrame(...): Creates a DataFrame to store the VIF values.

variance_inflation_factor(...): Calculates the VIF for each numerical feature.

X_processed[:, :len(numeric_features)]: Selects only the numerical features from the processed data.

for i in range(len(numeric_features)): Iterates through each numerical feature.

print(vif_data.to_string(index=False)): Prints the VIF values in a tabular format without the index.

except Exception as e:: Handles potential errors during the process and prints an error message.

if 'LinearRegression' in results:: Checks if a LinearRegression model was trained.

If it was, it calls the check_collinearity function to perform the multicollinearity check.

In [8]:
def check_collinearity(model):
    """Print variance inflation factors for the numeric features.

    Args:
        model: Fitted pipeline whose 'preprocessor' step is used to
            transform the module-level ``X_train``.

    Only the first ``len(numeric_features)`` transformed columns are
    analysed; this presumes the numeric transformer is listed first in the
    preprocessor (verify if the ColumnTransformer layout changes). Any
    failure is reported on stdout instead of being raised.
    """
    try:
        # Process training data
        fitted_preprocessor = model.named_steps['preprocessor']
        X_processed = fitted_preprocessor.transform(X_train)
        # The transformer output can be a scipy sparse matrix; densify it so
        # the column slice handed to statsmodels is a plain array.
        if hasattr(X_processed, 'toarray'):
            X_processed = X_processed.toarray()

        # Calculate VIF for numerical features only
        numeric_block = X_processed[:, :len(numeric_features)]
        vif_data = pd.DataFrame({
            'Feature': numeric_features,
            'VIF': [variance_inflation_factor(numeric_block, i)
                    for i in range(len(numeric_features))]
        })

        print("\n📊 Collinearity Analysis (VIF > 5 indicates high collinearity)")
        print(vif_data.to_string(index=False))

    except Exception as e:
        # The former "install tabulate" hint was dropped: nothing in this
        # function uses tabulate, so it pointed users at the wrong cause.
        print(f"⚠️ Collinearity check failed: {str(e)}")

# Run the VIF check only when the linear baseline was actually trained.
linear_entry = results.get('LinearRegression')
if linear_entry is not None:
    print("\n🔍 Checking multicollinearity for linear model")
    check_collinearity(linear_entry['Best Estimator'])


🔍 Checking multicollinearity for linear model

📊 Collinearity Analysis (VIF > 5 indicates high collinearity)
  Feature      VIF
     area 1.049350
 bedrooms 1.328012
bathrooms 1.253273
  stories 1.240142


# 8. Prediction Interface

joblib.dump(best_model, 'best_house_price_model.joblib'): Saves the best-performing model to a file named 'best_house_price_model.joblib' using the joblib library. This allows you to load and reuse the trained model later without having to retrain it.

print("\n💾 Best model saved as best_house_price_model.joblib"): Prints a message indicating that the model has been saved.

validate_input(feature, value): Defines a function to validate and convert user input for each feature.

if feature in numeric_features:: Checks if the feature is numerical.

Converts the input value to a float.

Performs specific validation based on the feature (e.g., bedrooms must be between 0 and 10).

Raises a ValueError if the input is invalid.

if feature in binary_cols:: Checks if the feature is binary.

Converts the input to lowercase and removes leading/trailing whitespace.

Converts 'yes'/'1' to 1 and 'no'/'0' to 0.

Raises a ValueError if the input is not 'yes', 'no', '1', or '0'.

if feature == 'furnishingstatus':: Checks if the feature is 'furnishingstatus'.

Validates that the input is one of the allowed values ('furnished', 'semi-furnished', 'unfurnished').

Raises a ValueError if the input is invalid.

return value: Returns the validated and converted value.

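except ValueError as e:: Catches any ValueError raised during validation and re-raises it with the feature name prefixed to the message ("Invalid <feature>: ..."). Nothing is printed here; reporting the error is left to the caller.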

predict_price(): Defines a function to provide a user interface for predicting house prices.

print("\n🏠 Property Price Prediction Interface"): Prints a header for the interface.

feature_values = {}: Initializes an empty dictionary to store the feature values entered by the user.

for feature in X.columns:: Iterates through each feature in the feature matrix X.

while True:: Enters a loop that continues until valid input is received.

value = input(f"{feature}: "): Prompts the user to enter a value for the current feature.

validated = validate_input(feature, value): Calls the validate_input function to validate and convert the input.

feature_values[feature] = [validated]: Stores the validated value in the feature_values dictionary.

break: Exits the loop if the input is valid.

except ValueError as e:: Handles ValueError exceptions raised during validation and prints an error message.

try...except block: Handles potential errors during model loading and prediction.

model = joblib.load('best_house_price_model.joblib'): Loads the saved best model from the file.

prediction = model.predict(pd.DataFrame(feature_values))[0]: Creates a DataFrame from the user input and uses the loaded model to predict the house price.

print(f"\n💵 Predicted Price: ${prediction:,.2f}"): Prints the predicted price with a leading dollar sign, commas as thousands separators, and two decimal places.

except Exception as e:: Handles any exceptions during the process and prints an error message.

print("\n🚀 Production-Ready Price Prediction System!"): Prints a final message indicating that the system is ready.

In [9]:
# Persist the selected pipeline (preprocessor + regressor) so that
# predict_price() can reload it from disk.
joblib.dump(best_model, 'best_house_price_model.joblib')
print("\n💾 Best model saved as best_house_price_model.joblib")

def validate_input(feature, value):
    """Validate one raw user entry and convert it to the model's format.

    Args:
        feature: Column name the entry belongs to.
        value: Raw text as typed by the user.

    Returns:
        A float for numeric features, 1/0 for binary features, the
        normalised (stripped, lowercase) label for 'furnishingstatus', and
        the value unchanged for any other feature.

    Raises:
        ValueError: If the entry is not acceptable; the message is prefixed
            with "Invalid <feature>: ".
    """
    import math  # local import keeps the function self-contained

    try:
        if feature in numeric_features:
            val = float(value)
            # float() happily parses 'nan' and 'inf', which would slip past
            # the range comparisons below.
            if not math.isfinite(val):
                raise ValueError("Must be a finite number")
            if feature == 'bedrooms' and not 0 <= val <= 10:
                raise ValueError("Bedrooms must be 0-10")
            if feature == 'bathrooms' and val < 0:
                raise ValueError("Bathrooms can't be negative")
            if feature == 'area' and val < 100:
                raise ValueError("Area must be ≥100 sqft")
            return val

        if feature in binary_cols:
            clean_val = str(value).lower().strip()
            if clean_val in ['yes', '1']:
                return 1
            if clean_val in ['no', '0']:
                return 0
            raise ValueError(f"Must be yes/no or 0/1 for {feature}")

        if feature == 'furnishingstatus':
            valid = ['furnished', 'semi-furnished', 'unfurnished']
            clean_val = str(value).strip().lower()
            if clean_val not in valid:
                raise ValueError(f"Must be one of {valid}")
            # Return the normalised label, not the raw text: the check is
            # case-insensitive, so 'Unfurnished' used to pass unchanged.
            # Presumably the training data uses these lowercase labels —
            # TODO confirm against Data.csv.
            return clean_val

        return value
    except ValueError as e:
        raise ValueError(f"Invalid {feature}: {str(e)}") from e

def predict_price():
    """Prompt for every feature on stdin and print the predicted price.

    Each feature in ``X.columns`` is requested repeatedly until
    ``validate_input`` accepts the entry. The saved pipeline is then loaded
    from 'best_house_price_model.joblib' and applied to the single-row
    frame; load or prediction errors are printed rather than raised.
    """
    print("\n🏠 Property Price Prediction Interface")
    collected = {}

    for feature in X.columns:
        while True:
            try:
                raw = input(f"{feature}: ")
                # One-element list: each feature becomes a one-row column.
                collected[feature] = [validate_input(feature, raw)]
            except ValueError as err:
                print(f"❌ Error: {str(err)}")
                continue
            break

    try:
        loaded = joblib.load('best_house_price_model.joblib')
        single_row = pd.DataFrame(collected)
        prediction = loaded.predict(single_row)[0]
        print(f"\n💵 Predicted Price: ${prediction:,.2f}")
    except Exception as err:
        print(f"❌ Prediction failed: {str(err)}")

# Interactive predictions are enabled: this call blocks on stdin until every
# feature has been entered. Comment it out for non-interactive runs.
predict_price()

print("\n🚀 Production-Ready Price Prediction System!")


💾 Best model saved as best_house_price_model.joblib

🏠 Property Price Prediction Interface
❌ Error: Invalid furnishingstatus: Must be one of ['furnished', 'semi-furnished', 'unfurnished']

💵 Predicted Price: $3,028,940.75

🚀 Production-Ready Price Prediction System!
