In [None]:
import pandas as pd

In [None]:
# Load the raw listings from the CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Bengaluru_House_Data.csv")
# Show the available column names, then preview the first rows.
print(df.columns)
df.head(n=5)

In [None]:
# Dtypes and non-null counts per column.
df.info()
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the columns this notebook does not use (missing ones are ignored),
# rename "bath" to "bathroom", and discard every row that still has a
# missing value.
df = (
    df.drop(
        columns=["society", "balcony", "availability", "area_type"],
        errors="ignore",
    )
    .rename(columns={"bath": "bathroom"})
    .dropna()
)
# Confirm that no missing values remain.
df.isnull().sum()

In [None]:

# Cast the bathroom count to int and derive "bhk" from the leading token of
# the "size" text; the raw "size" column is dropped once it has been parsed.
df = df.assign(
    bathroom=df["bathroom"].astype(int),
    bhk=df["size"].str.split().str[0].astype(int),
).drop(columns="size")
df.head()


In [None]:
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Plain numbers (``"2100"``, ``1500``, ``"-5"``, ``"1e-3"``) are converted
    directly. A range written as ``"low - high"`` is replaced by the midpoint
    of its two bounds.

    Args:
        x: The raw cell value (string or number).

    Returns:
        The parsed value as a float, or None when the entry is neither a
        plain number nor a two-sided numeric range (for example text with
        units, or a malformed range such as ``"1000 - "``).
    """
    # Try the plain-number path first so that a leading minus sign or an
    # exponent ("1e-3") is not mistaken for a range separator.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    bounds = str(x).split('-')
    if len(bounds) != 2:
        return None
    try:
        return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
        # One of the bounds is not numeric; treat the entry as unparseable.
        return None

# Parse total_sqft into numbers, then drop the rows whose area could not be
# parsed (the converter returns None for those).
df = df.assign(
    total_sqft=df["total_sqft"].apply(convert_sqft_to_num)
).dropna()
df.head()

In [None]:
# Temporary helper column: area per bedroom (total_sqft divided by bhk).
df = df.assign(sqft_per_bhk=df["total_sqft"].div(df["bhk"]))


In [None]:
# Keep only listings with at least 300 sqft per bedroom, then discard the
# helper column (ignored if it is already gone).
df = df.loc[df["sqft_per_bhk"].ge(300)].drop(
    columns=["sqft_per_bhk"], errors="ignore"
)
df.head()

In [None]:
# Number of listings per location.
location_stats = df["location"].value_counts()

# Locations with 10 or fewer listings (note: the bound is inclusive).
location_stats_less_10 = location_stats[location_stats <= 10]

# Collapse every rare location into a single "other" bucket. Membership is
# tested explicitly against the index (the location names) of the rare-count
# Series and applied in one vectorized step, instead of a per-row lambda that
# relies on `x in Series` checking the index rather than the values.
is_rare = df["location"].isin(location_stats_less_10.index)
df["location"] = df["location"].mask(is_rare, "other")
df.head()

In [None]:
# One-hot encode "location". drop_first=True omits the indicator for the
# first category level, so that level is represented by all indicators being 0.
df = pd.get_dummies(data=df, columns=["location"], drop_first=True)
df.head(n=5)

In [None]:
# Trim the most expensive listings: keep rows strictly below the 99th
# percentile of price.
df = df.loc[df["price"].lt(df["price"].quantile(0.99))]

In [None]:
import numpy as np
# Target: log1p(price). Predictions therefore come out in log space and must
# be mapped back with expm1.
y = np.log1p(df["price"])
# Features: every remaining column except the target.
X = df.drop(columns="price")

In [None]:
import sys
# Install xgboost by invoking pip through the interpreter that runs this
# kernel (sys.executable). NOTE(review): the version is not pinned.
!"{sys.executable}" -m pip install xgboost

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the gradient-boosted regressor, collected in one place.
xgb_params = {
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
model = XGBRegressor(**xgb_params)
# Fit on the log1p-transformed target.
model.fit(X_train, y_train)

In [None]:
# Undo the log1p transform on both the held-out targets and the model's
# predictions so they can be compared in the original price units.
y_true = np.expm1(y_test)
pred = np.expm1(model.predict(X_test))

In [None]:
# Root-mean-squared error on the held-out set, in original price units.
mse = mean_squared_error(y_true, pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

In [None]:
import os

# joblib.dump does not create parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs("../backend/model", exist_ok=True)

# Persist the fitted regressor. It was trained on log1p(price), so whoever
# loads it must apply expm1 to its predictions to get prices back.
joblib.dump(model, "../backend/model/house_model.pkl")
# Persist the training feature names in order, so a consumer can rebuild an
# input row with the same column layout the model was fit on.
joblib.dump(X.columns.tolist(), "../backend/model/columns.pkl")
print("Model saved!")