In [1]:
# train_and_export.py
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.compose import TransformedTargetRegressor

from xgboost import XGBRegressor
import category_encoders as ce

# --- Step 1: read the feature-selected dataset and split features / target ---
DATA_CSV = '../../Dataset/gurgaon_properties_post_feature_selection_v2.csv'
# furnishing_type is stored as a numeric code in the CSV; swap in readable labels.
FURNISHING_LABELS = {0.0: 'unfurnished', 1.0: 'semifurnished', 2.0: 'furnished'}

df = pd.read_csv(DATA_CSV)
df['furnishing_type'] = df['furnishing_type'].replace(FURNISHING_LABELS)
y = df['price']
X = df.drop(columns=['price'])

# --- Step 2: preprocessing (intended to mirror the CV pipeline) ---
# Note that 'sector' is fed to both the ordinal and the target encoder, and
# 'agePossession' to both the ordinal and the one-hot encoder, so each of those
# source columns yields more than one output feature.
numeric_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
ordinal_cols = ['property_type', 'sector', 'balcony', 'agePossession',
                'furnishing_type', 'luxury_category', 'floor_category']

scale_step = ('num', StandardScaler(), numeric_cols)
ordinal_step = (
    'cat',
    # Categories unseen at fit time are encoded as -1 instead of raising.
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    ordinal_cols,
)
onehot_step = (
    'cat1',
    OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
    ['agePossession'],
)
target_step = ('target_enc', ce.TargetEncoder(), ['sector'])

preprocessor = ColumnTransformer(
    transformers=[scale_step, ordinal_step, onehot_step, target_step],
    remainder='passthrough',  # columns not listed above are forwarded untouched
)

# --- Step 3: tuned hyper-parameters for the two base learners ---
best_rf_params = dict(
    n_estimators=454,
    max_depth=39,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
best_xgb_params = dict(
    n_estimators=791,
    learning_rate=0.0266,
    max_depth=7,
    subsample=0.9518,
    colsample_bytree=0.7703,
    gamma=0.00923,
    reg_alpha=1.108,
    reg_lambda=1.4735,
    min_child_weight=4,
    random_state=42,
    n_jobs=-1,
)

# Each base learner gets its own preprocessing + model pipeline.
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**best_rf_params)),
])
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(**best_xgb_params)),
])

# --- Step 4: weighted blend of the two pipelines ---
# Blend weights as reported by the earlier Optuna search.
w_rf = 0.4519
w_xgb = 0.5481
blend = VotingRegressor(
    estimators=[('rf', rf_pipe), ('xgb', xgb_pipe)],
    weights=[w_rf, w_xgb],
)

# --- Step 5: fit on log1p(price), report predictions on the original price scale ---
final_model = TransformedTargetRegressor(
    regressor=blend,
    func=np.log1p,         # applied to y before fitting
    inverse_func=np.expm1  # applied to predictions before they are returned
)

# --- Step 6: train on the complete dataset (no hold-out split in this cell) ---
final_model.fit(X, y)

# --- Step 7: persist the fitted model and the feature-column order it expects ---
joblib.dump(final_model, 'gurgaon_price_model.joblib', compress=3)
feature_names = pd.Series(X.columns)
feature_names.to_json('expected_columns.json', orient='values')
print("Saved: gurgaon_price_model.joblib and expected_columns.json")

Saved: gurgaon_price_model.joblib and expected_columns.json


In [None]:
import json
import warnings

import joblib
import pandas as pd

# Load the fitted model and the training-time feature order.
# NOTE: joblib artifacts are pickles; only load files that you produced yourself.
model = joblib.load('gurgaon_price_model.joblib')
with open('expected_columns.json') as columns_file:  # closes the handle deterministically
    expected_columns = pd.Index(json.load(columns_file))

# Prepare input.
# Categorical values must be spelled exactly as in the training CSV
# (e.g. 'flat'/'house', lowercase 'sector NN', 'Mid Floor'). The training
# encoders are configured to tolerate unknown categories, so a misspelled
# value does not raise -- it is silently encoded as "unknown" and the
# prediction becomes meaningless.
payload = [{
    "bedRoom": 3,
    "bathroom": 3,
    "built_up_area": 1800,
    "servant room": 0,
    "store room": 0,
    "property_type": "flat",
    "sector": "sector 92",
    "balcony": "3+",
    "agePossession": "Relatively New",
    "furnishing_type": "semifurnished",
    "luxury_category": "High",
    "floor_category": "Mid Floor"
}]
X_new = pd.DataFrame(payload)

# Reindex to the expected column order. reindex() fills any column that is
# absent from the payload with NaN, so surface that instead of hiding it.
missing_columns = expected_columns.difference(X_new.columns)
if len(missing_columns) > 0:
    warnings.warn(
        f"Payload is missing expected columns (they will be NaN): {list(missing_columns)}"
    )
X_new = X_new.reindex(columns=expected_columns)

# Predict price. The model inverts its log1p target transform internally, so
# the result is in the same unit as the training `price` column.
# NOTE(review): sample prices such as 0.82 suggest crore INR rather than
# plain rupees -- confirm the unit before presenting it to users.
pred = model.predict(X_new)
print(pred)

In [None]:
import json
import warnings

import joblib
import pandas as pd

# Load the fitted model and the training-time feature order.
# NOTE: joblib artifacts are pickles; only load files that you produced yourself.
model = joblib.load('gurgaon_price_model.joblib')
with open('expected_columns.json') as columns_file:  # closes the handle deterministically
    expected_columns = pd.Index(json.load(columns_file))

# Prepare input.
# Categorical values must be spelled exactly as in the training CSV
# (e.g. 'flat'/'house', lowercase 'sector NN', 'Mid Floor'). The training
# encoders are configured to tolerate unknown categories, so a misspelled
# value does not raise -- it is silently encoded as "unknown" and the
# prediction becomes meaningless.
payload = [{
    "bedRoom": 3,
    "bathroom": 3,
    "built_up_area": 1800,
    "servant room": 0,
    "store room": 0,
    "property_type": "flat",
    "sector": "sector 92",
    "balcony": "3+",
    "agePossession": "Relatively New",
    "furnishing_type": "semifurnished",
    "luxury_category": "High",
    "floor_category": "Mid Floor"
}]
X_new = pd.DataFrame(payload)

# Reindex to the expected column order. reindex() fills any column that is
# absent from the payload with NaN, so surface that instead of hiding it.
missing_columns = expected_columns.difference(X_new.columns)
if len(missing_columns) > 0:
    warnings.warn(
        f"Payload is missing expected columns (they will be NaN): {list(missing_columns)}"
    )
X_new = X_new.reindex(columns=expected_columns)

# Predict price. The model inverts its log1p target transform internally, so
# the result is in the same unit as the training `price` column.
# NOTE(review): sample prices such as 0.82 suggest crore INR rather than
# plain rupees -- confirm the unit before presenting it to users.
pred = model.predict(X_new)
print(pred)

In [2]:
# Re-read the dataset for inspection. This rebinds `df`, so the
# furnishing_type column holds the CSV's original values again.
dataset_path = '../../Dataset/gurgaon_properties_post_feature_selection_v2.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows to check column names and value spellings.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
# Frequency of each property type, most common first.
df.loc[:, 'property_type'].value_counts()

property_type
flat     2804
house     750
Name: count, dtype: int64