In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load
from sklearn.pipeline import Pipeline

# Load the prepared dataset and discard the leftover "Unnamed: 0" column
# that came along with the CSV.
df = pd.read_csv("../house_data_done.csv").drop(columns=["Unnamed: 0"])

# Separate the target ("price") from the predictor columns.
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# log1p-transformed copies of both target splits (the model is fit on the logged target).
y_train_logged, y_test_logged = np.log1p(y_train), np.log1p(y_test)

class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends hand-crafted housing features.

    Expects a DataFrame that contains the columns ``bedroom``, ``bathroom``,
    ``toilets``, ``parking_lot``, ``serviced``, ``extras`` and
    ``Stable Electricity``. The one-hot columns ``state_Lagos`` and
    ``town_Ikoyi`` are optional: when one is absent it is treated as 0 for
    every row instead of raising.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    @staticmethod
    def _indicator(X, column):
        """Return a boolean Series that is True where ``X[column] == 1``.

        If ``column`` is not present, an all-False Series aligned to ``X``'s
        index is returned, so callers can always chain Series methods such as
        ``.astype`` (a bare scalar ``False`` would not support that).
        """
        if column in X.columns:
            return X[column] == 1
        return pd.Series(False, index=X.index)

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns added.

        The input frame is not modified. Column creation order is kept stable
        because a downstream model fitted on the output relies on it.
        """
        X = X.copy()

        # Room count, scaled by 3 when "Stable Electricity" is 1 (by 1 when it is 0).
        X["house_lux_combo"] = (
            (X["bedroom"] + X["bathroom"] + X["toilets"])
            * (1 + X["Stable Electricity"] * 2)
        )

        # Bucket `extras` into (-1, 2], (2, 4], (4, 6], (6, 16]. Because pd.cut
        # returns a categorical with all four labels, get_dummies always emits
        # the same four columns; values outside the bins get 0 in every bucket.
        X["extras_bucket"] = pd.cut(
            X["extras"],
            bins=[-1, 2, 4, 6, 16],
            labels=["low", "medium", "high", "very_high"]
        )
        X = pd.get_dummies(X, columns=["extras_bucket"], dtype=int)

        X["parking_ratio"] = X["parking_lot"] / (X["bedroom"] + 1)
        X["Visitors toilet"] = X["toilets"] - X["bathroom"]
        X["is_self_contain"] = (X["bedroom"] == 1).astype(int)
        X["luxury_scores"] = X["extras"] + X["serviced"] + X["Stable Electricity"] + X["parking_lot"]

        # Previously `X.get("state_Lagos", 0) == 1` collapsed to a plain bool
        # when the column was missing, and `.astype` then raised AttributeError.
        X["is_lagos"] = self._indicator(X, "state_Lagos").astype(int)

        # Rows with parking or servicing get at least 1 extra. This runs after
        # extras_bucket and luxury_scores, so those use the unclipped value,
        # while top_tier_houses below sees the clipped one.
        X.loc[
            (X["parking_lot"] == 1) | (X["serviced"] == 1),
            "extras"
        ] = X["extras"].clip(lower=1)

        X["top_tier_houses"] = (
            ((X["bedroom"] > 4) | (X["bathroom"] > 4)) &
            (X["extras"] > 5) &
            (X["house_lux_combo"] > 10) &
            ((X["is_lagos"] == 1) | self._indicator(X, "town_Ikoyi"))
        ).astype(int)

        return X

        
class RestoreColumnNames(BaseEstimator, TransformerMixin):
    """Wrap a ColumnTransformer's array output back into a named DataFrame.

    Parameters
    ----------
    preprocessor : ColumnTransformer
        The transformer whose output is being labelled. Its fitted
        ``transformers_`` attribute is read at transform time, so it must be
        the very same (fitted) instance that produced ``X``.
        NOTE(review): a cloned pipeline would hand this step an unfitted copy;
        confirm before using it inside cross-validation utilities.
    """

    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def _resolve_names(self, cols):
        """Translate a column selection into column names.

        Entries may be names or integer positions (depending on how the
        ColumnTransformer stores them); positions are looked up in the
        preprocessor's ``feature_names_in_`` so the result never contains
        bare integers when input names are available.
        """
        input_names = getattr(self.preprocessor, "feature_names_in_", None)
        resolved = []
        for col in cols:
            is_position = isinstance(col, (int, np.integer)) and not isinstance(col, bool)
            if is_position and input_names is not None:
                resolved.append(input_names[col])
            else:
                resolved.append(col)
        return resolved

    def transform(self, X):
        """Return ``X`` as a DataFrame labelled with the output feature names.

        Names are collected in the order of ``preprocessor.transformers_``.
        """
        feature_names = []
        for name, transformer, cols in self.preprocessor.transformers_:
            # Dropped columns contribute nothing to the output. The check is on
            # the transformer entry (the old code compared the column list to
            # 'drop', which could never match).
            if isinstance(transformer, str) and transformer == 'drop':
                continue

            if name != 'remainder' and hasattr(transformer, 'get_feature_names_out'):
                feature_names.extend(transformer.get_feature_names_out(cols))
            else:
                # Passthrough / remainder columns keep their input names.
                feature_names.extend(self._resolve_names(cols))

        return pd.DataFrame(X, columns=feature_names)


# 2. Preprocessing
state_and_town_feat = ["state", "town"]
title_feat = ["title"]

# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, the same as the dropped first
# category. Confirm this is acceptable for unseen states/towns.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# TargetEncoder cross-fits on shuffled folds during fit_transform; without a
# seed the encoded training values (and therefore the fitted model) change
# from run to run, so it is seeded like the other stochastic steps.
t_encoder = TargetEncoder(target_type='continuous', cv=5, random_state=42)

preprocessor = ColumnTransformer(
    [("One_hot", one_hot, state_and_town_feat),
     ("T_Encoder", t_encoder, title_feat)],
    remainder="passthrough"
)

# 3. Pipeline (Preprocessing + Feature Engineering + Model)
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# RestoreColumnNames receives the same `preprocessor` instance that the
# pipeline fits, so it can read the fitted feature names at transform time.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("restore_names", RestoreColumnNames(preprocessor)),
    ("feature_engineering", FeatureEngineering()),
    ("model", model)
])

# 4. Fit on the log1p target (y_train_logged is computed right after the
#    train/test split above; it is not recomputed here).
pipeline.fit(X_train, y_train_logged)

# 5. Save
dump(pipeline, "house_price_pipeline.joblib")
    

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



['house_price_pipeline.joblib']

In [23]:
def HousePrediction(df, filename, log_target=True):
    """
    Load a saved pipeline and return its price predictions for ``df``.

    df : pandas DataFrame holding the same columns as the training data.
    filename : Path to the joblib file containing the fitted pipeline.
    log_target : When True, predictions are treated as log1p values and are
        converted back with expm1, then rounded to one decimal place.
        When False, the pipeline's raw predictions are returned.

    Note: joblib files are pickle-based, so only load files you trust.
    """
    fitted_pipeline = load(filename)
    raw_predictions = fitted_pipeline.predict(df)

    # Guard clause: nothing to undo when the target was not log-transformed.
    if not log_target:
        return raw_predictions

    return np.around(np.expm1(raw_predictions), 1)

In [24]:
import pandas as pd
from joblib import dump, load
# One example listing, written as plain feature -> value pairs.
sample_listing = {
    "bedroom": 3,
    "parking_lot": 1,
    "bathroom": 3,
    "toilets": 3,
    "town": "Ajah",
    "state": "Lagos",
    "serviced": 0,
    "extras": 3,
    "Stable Electricity": 1,
    "title": "flat apartment",
}

# Wrap every value in a one-element list so the frame has exactly one row.
data = {feature: [value] for feature, value in sample_listing.items()}
df = pd.DataFrame(data)
df

Unnamed: 0,bedroom,parking_lot,bathroom,toilets,town,state,serviced,extras,Stable Electricity,title
0,3,1,3,3,Ajah,Lagos,0,3,1,flat apartment


In [25]:
# Price estimate for the sample listing, converted back from the log1p scale.
HousePrediction(df=df, filename="house_price_pipeline.joblib", log_target=True)

array([2354127.1])