<a href="https://colab.research.google.com/github/Luckynirwan12/Delhi-House-Price-Prediction/blob/main/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **House Price Prediction**

### Importing Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

### Importing the Delhi House Price Dataset

In [None]:
# Absolute path of the raw CSV file that holds the Delhi house price data
DATA_PATH = '/content/Delhi House Price.csv'

# Load the CSV into a DataFrame and preview its first rows
house_price_df = pd.read_csv(DATA_PATH)
house_price_df.head()

In [None]:
# Check the number of rows and columns; .shape is a (rows, columns) tuple
house_price_df.shape

In [None]:
# Summarise the data: per-column dtype and non-null count, plus memory usage
house_price_df.info()

In [None]:
# Count the missing (null/NaN) values in each column
house_price_df.isnull().sum()

In [None]:
# Discard every row that contains at least one missing value
house_price_df = house_price_df.dropna(axis=0, how='any')

In [None]:
# Re-check the per-column null counts after the rows with missing values were dropped
house_price_df.isnull().sum()

In [None]:
# Print the frequency of every distinct value, one column at a time,
# with a dashed separator line after each column
for _, column_values in house_price_df.items():
  print(column_values.value_counts())
  print('-' * 20)

In [None]:
# Drop unnecessary columns.
# The result is reassigned (instead of using inplace=True) and errors='ignore'
# is passed so that re-running this cell does not raise KeyError once the
# columns have already been removed.
house_price_df = house_price_df.drop(
    columns=['Status', 'Transaction', 'Per_Sqft'],
    errors='ignore',
)

In [None]:
# Confirm which columns remain (and their dtypes) after dropping columns
house_price_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
house_price_df.describe()

In [None]:
# Inspect how many rows each locality has; localities that appear
# 10 times or fewer are replaced with 'Other' in the next cell
house_price_df['Locality'].value_counts()

In [None]:
# Group localities that appear 10 times or fewer under a single 'Other' label.
# The counts are computed once up front; calling value_counts() inside a
# per-row lambda would recount the whole column for every row.
locality_counts = house_price_df['Locality'].value_counts()
is_rare_locality = house_price_df['Locality'].map(locality_counts) <= 10
house_price_df['Locality'] = house_price_df['Locality'].mask(is_rare_locality, 'Other')

In [None]:
# Re-check the locality distribution after rare localities were replaced with 'Other'
house_price_df['Locality'].value_counts()

### Detect and Remove the Outliers

In [None]:
# Custom threshold-based outlier removal
def remove_custom_outliers(df, max_area=3000, max_parking=5, max_price=5e7):
    """Return the rows of ``df`` whose Area, Parking and Price are within limits.

    A row is kept only when all three values are less than or equal to their
    threshold (the bounds are inclusive). Rows with NaN in any of the three
    columns are dropped, because a comparison with NaN evaluates to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Area', 'Parking' and 'Price'.
    max_area : float, default 3000
        Largest allowed value of 'Area'.
    max_parking : float, default 5
        Largest allowed value of 'Parking'.
    max_price : float, default 5e7
        Largest allowed value of 'Price'.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame (an independent copy) holding only the rows within
        all three limits; the original index labels are preserved.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Build one boolean mask instead of filtering the frame three times
    within_limits = (
        (df['Area'] <= max_area)
        & (df['Parking'] <= max_parking)
        & (df['Price'] <= max_price)
    )
    # Copy so that later edits to the result never touch (or warn about) ``df``
    return df.loc[within_limits].copy()

# Filter the cleaned DataFrame with the outlier thresholds defined above
final_price_df = remove_custom_outliers(house_price_df)

# Compare row counts before vs. after to see how many rows were removed
print(f"Original rows: {house_price_df.shape[0]}")
print(f"After outlier removal: {final_price_df.shape[0]}")

In [None]:
# Summary statistics of the numeric columns after outlier removal
final_price_df.describe()

In [None]:
# Preview the first rows of the filtered data
final_price_df.head()

In [None]:
# Save the outlier-filtered DataFrame to 'house_price_df.csv' without the index column
final_price_df.to_csv('house_price_df.csv', index=False)

### Train Model

In [None]:
# Target: the price column; features: every remaining column
y = final_price_df['Price']
X = final_price_df.drop(columns=['Price'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns that must be converted to integer codes
cat_cols = ['Furnishing', 'Locality', 'Type']

# Fit a separate LabelEncoder per column and keep every fitted encoder.
# Re-fitting one shared encoder replaces its learned classes on each
# iteration, so only the last column's mapping would survive and the other
# columns could not be encoded consistently for new data at prediction time.
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # column name -> fitted encoder, for later reuse

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train an XGBoost regressor with fixed hyperparameters and a fixed seed
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = xgb_model.predict(X_test)

# Evaluation
# mean_squared_error returns the MSE (in squared price units); take its square
# root so that the number printed under the "RMSE" label really is the RMSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)
print("R² Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))


### Deploy Model

In [None]:
import pickle

# Serialize the trained regressor to a pickle file, opened in binary write mode
MODEL_PATH = "xgb_house_price_model.pkl"

with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(xgb_model, model_file)