In [23]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("house_prices.csv")

# Initial inspection
df.head()
df.info()
df.describe()
df.shape


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Index              187531 non-null  int64  
 1   Title              187531 non-null  object 
 2   Description        184508 non-null  object 
 3   Amount(in rupees)  187531 non-null  object 
 4   Price (in rupees)  169866 non-null  float64
 5   location           187531 non-null  object 
 6   Carpet Area        106858 non-null  object 
 7   Status             186916 non-null  object 
 8   Floor              180454 non-null  object 
 9   Transaction        187448 non-null  object 
 10  Furnishing         184634 non-null  object 
 11  facing             117298 non-null  object 
 12  overlooking        106095 non-null  object 
 13  Society            77853 non-null   object 
 14  Bathroom           186703 non-null  object 
 15  Balcony            138596 non-null  object 
 16  Ca

(187531, 21)

In [24]:
# Drop irrelevant columns
df = df.drop([
    'Index','Title','Description',
    'Dimensions','Plot Area',
    'Amount(in rupees)'
], axis=1)

# Drop rows where target is missing
df = df.dropna(subset=['Price (in rupees)'])

# Drop high missing or useless column
df = df.drop('Society', axis=1)
df = df.drop('Status', axis=1)


In [25]:
# Convert text-number columns BEFORE handling missing values
df['Bathroom'] = df['Bathroom'].str.extract('(\d+)').astype(float)
df['Balcony'] = df['Balcony'].str.extract('(\d+)').astype(float)
df['Carpet Area'] = df['Carpet Area'].str.extract('(\d+)').astype(float)
df['Super Area'] = df['Super Area'].str.extract('(\d+)').astype(float)
df['Floor'] = df['Floor'].str.extract('(\d+)').astype(float)
df['Car Parking'] = df['Car Parking'].str.extract('(\d+)').astype(float)


  df['Bathroom'] = df['Bathroom'].str.extract('(\d+)').astype(float)
  df['Balcony'] = df['Balcony'].str.extract('(\d+)').astype(float)
  df['Carpet Area'] = df['Carpet Area'].str.extract('(\d+)').astype(float)
  df['Super Area'] = df['Super Area'].str.extract('(\d+)').astype(float)
  df['Floor'] = df['Floor'].str.extract('(\d+)').astype(float)
  df['Car Parking'] = df['Car Parking'].str.extract('(\d+)').astype(float)


In [26]:
# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)

for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [27]:
# Feature-Label separation (AFTER all cleaning)
y = df['Price (in rupees)']
X = df.drop('Price (in rupees)', axis=1)

In [28]:
# Encode categorical variables
X = pd.get_dummies(X, drop_first=True)

print("Final shape:", X.shape)

Final shape: (169866, 118)


In [29]:
# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# Model training
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

In [32]:
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [31]:

# Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 1028.0654861822084
RMSE: 44546.775781735974
R2 Score: -0.002860888534579331
