In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import onnx
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [25]:
# Load the raw housing dataset and show a quick preview of its contents.
df = pd.read_csv("india_housing_prices.csv")

preview = df.head()
print("Initial Data:\n", preview)
print("\nColumns:", df.columns)

Initial Data:
    ID        State      City      Locality      Property_Type  BHK  \
0   1   Tamil Nadu   Chennai   Locality_84          Apartment    1   
1   2  Maharashtra      Pune  Locality_490  Independent House    3   
2   3       Punjab  Ludhiana  Locality_167          Apartment    2   
3   4    Rajasthan   Jodhpur  Locality_393  Independent House    2   
4   5    Rajasthan    Jaipur  Locality_466              Villa    4   

   Size_in_SqFt  Price_in_Lakhs  Price_per_SqFt  Year_Built  ...  \
0          4740          489.76            0.10        1990  ...   
1          2364          195.52            0.08        2008  ...   
2          3642          183.79            0.05        1997  ...   
3          2741          300.29            0.11        1991  ...   
4          4823          182.90            0.04        2002  ...   

  Age_of_Property  Nearby_Schools  Nearby_Hospitals  \
0              35              10                 3   
1              17               8            

In [26]:
def _count_amenities(value):
    """Return the number of non-empty, comma-separated entries in ``value``.

    Missing values (NaN/None) count as zero. Blank tokens, such as those
    produced by whitespace-only strings or trailing/doubled commas, are
    ignored instead of being counted as an amenity.
    """
    if pd.isna(value):
        return 0
    return sum(1 for token in str(value).split(',') if token.strip())


# Number of amenities listed for each property.
df['num_amenities'] = df['Amenities'].apply(_count_amenities)

# Constant rescalings of size and age, kept under the column names the
# feature list expects. The factors 2 and 0.1 are arbitrary; the
# StandardScaler applied before training cancels any constant factor.
df['Size_in_SqFt_scaled'] = df['Size_in_SqFt'] * 2
df['Age_of_Property_scaled'] = df['Age_of_Property'] * 0.1

# Encode categorical variables as integer codes.
# `label_encoders` keeps the fitted encoder for each column so the codes can
# be mapped back to the original category names later.
label_encoders = {}
categorical_cols = ['Public_Transport_Accessibility', 'City', 'State']

for col in categorical_cols:
    raw_col = f'{col}_raw'
    # Preserve the original values the first time this cell runs and always
    # encode from that copy. Encoding df[col] in place would make a re-run
    # fit the encoder on the stringified codes ('0', '1', '10', '2', ...),
    # silently permuting the codes and losing the real class names.
    if raw_col not in df.columns:
        df[raw_col] = df[col]

    le = LabelEncoder()
    # astype(str) turns missing values into the literal category 'nan'.
    df[col] = le.fit_transform(df[raw_col].astype(str))
    label_encoders[col] = le

In [27]:
# Convert the textual flag columns to 1/0. Values that are missing or not
# listed in the mapping fall back to 0.
binary_maps = {
    'Parking_Space': {'Yes': 1, 'No': 0},
    'Security': {'Yes': 1, 'No': 0},
    'Availability_Status': {'Ready_to_Move': 1, 'Under_Construction': 0},
}

for col, mapping in binary_maps.items():
    # Also map the encoded values onto themselves. Without this, re-running
    # the cell would send every already-encoded 1/0 to NaN and fillna(0)
    # would then silently turn the whole column into zeros.
    idempotent_mapping = {**mapping, **{code: code for code in mapping.values()}}
    df[col] = df[col].map(idempotent_mapping).fillna(0)


In [28]:
# Columns fed to the model, in the order the exported model will expect them.
features = [
    'BHK', 'Size_in_SqFt_scaled', 'Age_of_Property_scaled',
    'Nearby_Schools', 'Nearby_Hospitals', 'num_amenities',
    'Public_Transport_Accessibility', 'Parking_Space', 'Security',
    'Availability_Status', 'City', 'State'
]

# Rows without a price carry no label to learn from or to score against.
# Filling them with the global mean would fabricate targets and use a
# statistic computed over rows that later land in the test split, so drop
# them instead.
labelled_rows = df.dropna(subset=['Price_in_Lakhs'])

# Missing feature values are replaced with 0.
X = labelled_rows[features].fillna(0).values
y = labelled_rows['Price_in_Lakhs'].values

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Learn the standardization statistics from the training rows only, then
# apply the same transform to both splits.
scaler_X = StandardScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

In [None]:
# Random-forest hyperparameters collected in one place for easy tuning.
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 5,
    'random_state': 42,
}
model = RandomForestRegressor(**rf_params)

# Fit on the standardized training features.
model.fit(X_train_scaled, y_train)


In [None]:
# Score the held-out split: RMSE is in the target's units, R² is unitless.
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print("RMSE:", rmse)
print("R² Score:", r2)

In [None]:
# Pair each feature name with the forest's importance score and list them
# from most to least important.
importance = {name: weight for name, weight in zip(features, model.feature_importances_)}
ranked_names = sorted(importance, key=importance.get, reverse=True)
sorted_importance = [(name, importance[name]) for name in ranked_names]

print("\nFeature Importances:")
for feat, score in sorted_importance:
    print(f"{feat}: {score:.4f}")


In [None]:
# The forest was fitted on the output of `scaler_X`, so exporting the forest
# alone would produce a graph that only works on pre-standardized inputs,
# and the scaler is not saved anywhere else. Bundle the two already-fitted
# steps so the ONNX model accepts raw feature rows (same column order as
# `features`) and applies the scaling itself.
export_pipeline = Pipeline([('scaler', scaler_X), ('model', model)])

# One float32 input tensor: any batch size, one column per feature.
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onnx_model = convert_sklearn(export_pipeline, initial_types=initial_type)
with open("price_predictor.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

print("\n✅ Model exported to price_predictor.onnx")

In [None]:
# Write actual vs. predicted prices for the test split side by side so they
# can be loaded into Tableau.
prediction_columns = {"Actual": y_test, "Predicted": y_pred}
output_df = pd.DataFrame(prediction_columns)
output_df.to_csv("predictions_for_tableau.csv", index=False)

print("✅ Predictions saved to predictions_for_tableau.csv")