In [5]:
# train_and_export.py
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.compose import TransformedTargetRegressor

from xgboost import XGBRegressor
import category_encoders as ce

# 1) Load the feature-selected dataset and turn the numeric furnishing codes into labels
furnishing_labels = {0.0: 'unfurnished', 1.0: 'semifurnished', 2.0: 'furnished'}
df = pd.read_csv('../../Dataset/gurgaon_properties_post_feature_selection_v2.csv')
df['furnishing_type'] = df['furnishing_type'].replace(furnishing_labels)

# `price` is the target; every other column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

# 2) Preprocessor
# 'sector' and 'agePossession' are each listed under two transformers, so the
# model receives two encodings of each (ordinal + target mean, ordinal + one-hot).
# Every encoder is configured to tolerate categories it did not see during fit.
numeric_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
ordinal_cols = ['property_type', 'sector', 'balcony', 'agePossession',
                'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_cols),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector']),
    ],
    remainder='passthrough',  # columns not named above are forwarded unchanged
)

# 3) Tuned hyper-parameters for the two base learners
best_rf_params = dict(
    n_estimators=454,
    max_depth=39,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
best_xgb_params = dict(
    n_estimators=791,
    learning_rate=0.0266,
    max_depth=7,
    subsample=0.9518,
    colsample_bytree=0.7703,
    gamma=0.00923,
    reg_alpha=1.108,
    reg_lambda=1.4735,
    min_child_weight=4,
    random_state=42,
    n_jobs=-1,
)

# Each base learner gets its own preprocessing + model pipeline.
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**best_rf_params)),
])
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(**best_xgb_params)),
])

# 4) Weighted blend of the two pipelines
w_rf, w_xgb = 0.4519, 0.5481
blend = VotingRegressor(
    estimators=[('rf', rf_pipe), ('xgb', xgb_pipe)],
    weights=[w_rf, w_xgb],
)

# 5) Fit on log1p(price); predictions are mapped back with expm1
final_model = TransformedTargetRegressor(
    regressor=blend,
    func=np.log1p,
    inverse_func=np.expm1,
)

# 6) Train on the complete dataset (no hold-out split in this cell)
final_model.fit(X, y)

# 7) Persist the fitted model together with the feature column order
joblib.dump(final_model, 'gurgaon_price_model.joblib', compress=3)
pd.Series(X.columns).to_json('expected_columns.json', orient='values')
print("Saved: gurgaon_price_model.joblib and expected_columns.json")

Saved: gurgaon_price_model.joblib and expected_columns.json


In [None]:
import pandas as pd
import json

# Summarise every expected feature column: example labels for text-like
# columns, dtype and value range for everything else.
#
# This cell does not load any data itself; it relies on `df` already being
# defined by the training cell, so run that cell first.
#
# NOTE(review): the export cells visible in this notebook write
# expected_columns.json into the working directory, while this cell reads from
# Saved_Model/. Confirm which location is intended and align the paths.
with open("Saved_Model/expected_columns.json", "r") as f:
    expected_columns = json.load(f)

summary = {}

for col in expected_columns:
    col_dtype = df[col].dtype
    # Comparing against "object" alone misses pandas string and categorical
    # dtypes, which would then be summarised with min/max like numbers.
    is_categorical = (
        pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or isinstance(col_dtype, pd.CategoricalDtype)
    )
    if is_categorical:
        # First 5 distinct non-null values in order of appearance
        # (not the 5 most frequent).
        summary[col] = df[col].dropna().unique()[:5].tolist()
    else:
        # For everything else, record the dtype and the observed range.
        summary[col] = f"{col_dtype} (min: {df[col].min()}, max: {df[col].max()})"

# Save the summary as JSON for later use
with open("Saved_Model/expected_columns_with_examples.json", "w") as f:
    json.dump(summary, f, indent=4)

print(json.dumps(summary, indent=4))


In [None]:
import joblib
import pandas as pd
import json

# Load the fitted model and the training-time feature order.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts that were
# produced by this project.
model = joblib.load('gurgaon_price_model.joblib')
with open('expected_columns.json') as f:  # context manager closes the handle
    expected_columns = pd.Index(json.load(f))

# Prepare input.
# Categorical values must use exactly the labels present in the training data
# (see the df.head() / value_counts output in this notebook: 'flat',
# 'sector 56', '3+', 'Relatively New', 'High', 'Mid Floor', ...). The encoders
# are configured to tolerate unknown labels, so a misspelt or differently-cased
# value does not raise; it is silently treated as "unknown" and distorts the
# prediction.
payload = [{
    "bedRoom": 3,
    "bathroom": 3,
    "built_up_area": 1800,
    "servant room": 0,
    "store room": 0,
    "property_type": "flat",
    "sector": "sector 56",
    "balcony": "3+",
    "agePossession": "Relatively New",
    "furnishing_type": "semifurnished",
    "luxury_category": "High",
    "floor_category": "Mid Floor"
}]
X_new = pd.DataFrame(payload)

# reindex() fills absent columns with NaN without complaint, so report them
# before aligning to the training column order.
missing_cols = expected_columns.difference(X_new.columns)
if len(missing_cols) > 0:
    print(f"Warning: payload is missing columns {list(missing_cols)}; they will be NaN")
X_new = X_new.reindex(columns=expected_columns)

# The saved model inverts its own log1p target transform, so the prediction is
# in the unit of the training `price` column. The previewed prices are values
# like 0.82 and 1.6, i.e. not raw INR (presumably crore; confirm against the
# dataset documentation).
pred = model.predict(X_new)
print(pred)

In [None]:
import joblib
import pandas as pd
import json

# Load the fitted model and the training-time feature order.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts that were
# produced by this project.
model = joblib.load('gurgaon_price_model.joblib')
with open('expected_columns.json') as f:  # context manager closes the handle
    expected_columns = pd.Index(json.load(f))

# Prepare input.
# Categorical values must use exactly the labels present in the training data
# (see the df.head() / value_counts output in this notebook: 'flat',
# 'sector 56', '3+', 'Relatively New', 'High', 'Mid Floor', ...). The encoders
# are configured to tolerate unknown labels, so a misspelt or differently-cased
# value does not raise; it is silently treated as "unknown" and distorts the
# prediction.
payload = [{
    "bedRoom": 3,
    "bathroom": 3,
    "built_up_area": 1800,
    "servant room": 0,
    "store room": 0,
    "property_type": "flat",
    "sector": "sector 56",
    "balcony": "3+",
    "agePossession": "Relatively New",
    "furnishing_type": "semifurnished",
    "luxury_category": "High",
    "floor_category": "Mid Floor"
}]
X_new = pd.DataFrame(payload)

# reindex() fills absent columns with NaN without complaint, so report them
# before aligning to the training column order.
missing_cols = expected_columns.difference(X_new.columns)
if len(missing_cols) > 0:
    print(f"Warning: payload is missing columns {list(missing_cols)}; they will be NaN")
X_new = X_new.reindex(columns=expected_columns)

# The saved model inverts its own log1p target transform, so the prediction is
# in the unit of the training `price` column. The previewed prices are values
# like 0.82 and 1.6, i.e. not raw INR (presumably crore; confirm against the
# dataset documentation).
pred = model.predict(X_new)
print(pred)

In [6]:
# train_and_export.py
import joblib
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.compose import TransformedTargetRegressor
from xgboost import XGBRegressor
import category_encoders as ce

# 1) Read the dataset and map the numeric furnishing codes to readable labels
furnishing_labels = {
    0.0: 'unfurnished',
    1.0: 'semifurnished',
    2.0: 'furnished',
}
df = pd.read_csv('../../Dataset/gurgaon_properties_post_feature_selection_v2.csv')
df['furnishing_type'] = df['furnishing_type'].replace(furnishing_labels)

# Split into target (`price`) and features (all remaining columns).
y = df['price']
X = df.drop(columns=['price'])

# 2) Preprocessor
# 'sector' and 'agePossession' each feed two transformers, so both encodings
# of each column reach the model. All encoders tolerate unseen categories.
scaled_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
ordinal_cols = ['property_type', 'sector', 'balcony', 'agePossession',
                'furnishing_type', 'luxury_category', 'floor_category']
column_steps = [
    ('num', StandardScaler(), scaled_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_cols),
    ('cat1', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['agePossession']),
    ('target_enc', ce.TargetEncoder(), ['sector']),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# 3) Hyper-parameters for the two base learners
best_rf_params = dict(
    n_estimators=454,
    max_depth=39,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
best_xgb_params = dict(
    n_estimators=791,
    learning_rate=0.0266,
    max_depth=7,
    subsample=0.9518,
    colsample_bytree=0.7703,
    gamma=0.00923,
    reg_alpha=1.108,
    reg_lambda=1.4735,
    min_child_weight=4,
    random_state=42,
    n_jobs=-1,
)

# 4) One preprocessing + estimator pipeline per learner, then a weighted blend
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**best_rf_params)),
])
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(**best_xgb_params)),
])

w_rf, w_xgb = 0.4519, 0.5481
blend = VotingRegressor(
    estimators=[('rf', rf_pipe), ('xgb', xgb_pipe)],
    weights=[w_rf, w_xgb],
)

# 5) Train against log1p(price); predictions come back through expm1
final_model = TransformedTargetRegressor(
    regressor=blend,
    func=np.log1p,
    inverse_func=np.expm1,
)

# 6) Fit on the full dataset
print("Training final blended model...")
final_model.fit(X, y)

# 7) Persist the fitted model and the feature column order
joblib.dump(final_model, 'gurgaon_price_model.joblib', compress=3)
pd.Series(X.columns).to_json('expected_columns.json', orient='values')

# 8) Generate a per-column summary for the Flask form:
#    - text-like columns -> up to MAX_EXAMPLES distinct non-null values
#    - all other columns -> min / max / mean as plain floats (None when NaN)
print("Generating detailed JSON for Flask form...")
MAX_EXAMPLES = 10
column_info = {}

for col in X.columns:
    series = df[col]
    col_dtype = series.dtype
    # Comparing against 'object' alone misses pandas string and categorical
    # dtypes; such columns would fall into the numeric branch and float()
    # would raise on their text values.
    is_categorical = (
        pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or isinstance(col_dtype, pd.CategoricalDtype)
    )
    if is_categorical:
        # Slicing already copes with fewer than MAX_EXAMPLES values, so no
        # separate length check is needed.
        column_info[col] = {
            "type": "categorical",
            "examples": series.dropna().unique()[:MAX_EXAMPLES].tolist()
        }
    else:
        # Compute each statistic once, then convert to JSON-friendly values.
        stats = {"min": series.min(), "max": series.max(), "mean": series.mean()}
        column_info[col] = {
            "type": "numeric",
            **{name: (None if pd.isnull(value) else float(value))
               for name, value in stats.items()}
        }

# 9) Save this summary
with open('expected_columns_with_examples.json', 'w') as f:
    json.dump(column_info, f, indent=4)

print("\n✅ Exported files:")
print("- gurgaon_price_model.joblib")
print("- expected_columns.json")
print("- expected_columns_with_examples.json")


Training final blended model...
Generating detailed JSON for Flask form...

✅ Exported files:
- gurgaon_price_model.joblib
- expected_columns.json
- expected_columns_with_examples.json


In [7]:
# Preview the first five rows of the training frame (labels as the model sees them).
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [9]:
# Remove the row cap on pandas' display so every sector count is printed.
# This changes the display option for the rest of the session.
pd.options.display.max_rows = None
df['sector'].value_counts(dropna=False)


sector
sohna road           163
sector 85            108
sector 102           107
sector 92            100
sector 69             93
sector 90             89
sector 65             87
sector 81             87
sector 109            86
sector 79             76
sector 83             68
sector 104            67
sector 33             67
sector 37d            63
sector 86             62
sector 2              61
sector 50             61
sector 107            59
sector 108            58
sector 43             58
sector 89             57
sector 95             55
sector 48             55
sector 70a            54
sector 56             53
sector 37             52
sector 70             49
sector 84             49
sector 67             47
sector 49             45
sector 4              45
sector 113            43
sector 103            42
sector 99             42
sector 66             41
sector 82             41
sector 61             41
sector 26             40
sector 28             39
sector 106        

In [10]:
import pandas as pd

# Uses the `df` already loaded by the training cell.

# Distinct non-null sector labels in order of first appearance, as plain strings.
unique_sectors = df['sector'].dropna().unique()
unique_sectors_list = list(map(str, unique_sectors))

# Same shape as one categorical entry of the column summary.
output = dict(type="categorical", examples=unique_sectors_list)

print(output)


{'type': 'categorical', 'examples': ['sector 36', 'sector 89', 'sohna road', 'sector 92', 'sector 102', 'gwal pahari', 'sector 108', 'sector 105', 'sector 26', 'sector 109', 'sector 28', 'sector 65', 'sector 12', 'sector 85', 'sector 70a', 'sector 30', 'sector 107', 'sector 3', 'sector 2', 'sector 41', 'sector 4', 'sector 62', 'sector 49', 'sector 81', 'sector 66', 'sector 86', 'sector 48', 'sector 51', 'sector 37', 'sector 111', 'sector 67', 'sector 113', 'sector 13', 'sector 61', 'sector 69', 'sector 67a', 'sector 37d', 'sector 82', 'sector 53', 'sector 74', 'sector 52', 'sector 43', 'sector 14', 'sector 25', 'sector 95', 'sector 56', 'sector 83', 'sector 104', 'sector 88a', 'sector 55', 'sector 50', 'sector 84', 'sector 91', 'sector 76', 'sector 82a', 'sector 78', 'manesar', 'sector 93', 'sector 7', 'sector 71', 'sector 110', 'sector 33', 'sector 70', 'sector 103', 'sector 90', 'sector 38', 'sector 79', 'sector 112', 'sector 22', 'sector 59', 'sector 99', 'sector 9', 'sector 58', 's

In [11]:
import pandas as pd

# Uses the `df` already loaded by the training cell.

# Collect every distinct non-null sector label and coerce each to str.
unique_sectors = df['sector'].dropna().unique()
unique_sectors_list = list(str(label) for label in unique_sectors)

# Wrap the entry under its column name.
sector_entry = {"type": "categorical", "examples": unique_sectors_list}
output = {"sector": sector_entry}

print(output)


{'sector': {'type': 'categorical', 'examples': ['sector 36', 'sector 89', 'sohna road', 'sector 92', 'sector 102', 'gwal pahari', 'sector 108', 'sector 105', 'sector 26', 'sector 109', 'sector 28', 'sector 65', 'sector 12', 'sector 85', 'sector 70a', 'sector 30', 'sector 107', 'sector 3', 'sector 2', 'sector 41', 'sector 4', 'sector 62', 'sector 49', 'sector 81', 'sector 66', 'sector 86', 'sector 48', 'sector 51', 'sector 37', 'sector 111', 'sector 67', 'sector 113', 'sector 13', 'sector 61', 'sector 69', 'sector 67a', 'sector 37d', 'sector 82', 'sector 53', 'sector 74', 'sector 52', 'sector 43', 'sector 14', 'sector 25', 'sector 95', 'sector 56', 'sector 83', 'sector 104', 'sector 88a', 'sector 55', 'sector 50', 'sector 84', 'sector 91', 'sector 76', 'sector 82a', 'sector 78', 'manesar', 'sector 93', 'sector 7', 'sector 71', 'sector 110', 'sector 33', 'sector 70', 'sector 103', 'sector 90', 'sector 38', 'sector 79', 'sector 112', 'sector 22', 'sector 59', 'sector 99', 'sector 9', 'sec