In [71]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score

In [3]:
# Load the raw housing CSV into `data` and preview the first five rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after DATA_PATH is pointed at a local copy of the file.
DATA_PATH = "C:/Users/mahik/OneDrive/Desktop/pbl/Bengaluru_House_Data.csv.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
# (rows, columns) of the raw frame as loaded from the CSV.
data.shape

(13320, 9)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns; the differing counts already hint at missing values.
data.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [6]:
# Column labels of the raw frame.
data.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [7]:
# Distinct values present in the 'area_type' column.
data['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [8]:
# Build the working frame `data1` without the four columns that are not used
# as model inputs further down, then show its (rows, columns) shape.
data1 = data.drop(columns=['area_type', 'society', 'balcony', 'availability'])
data1.shape

(13320, 5)

In [23]:
# Column labels of the working frame.
# NOTE(review): the saved output of this cell lists a 'bhk' column that no
# cell above creates, and its execution count is out of sequence, so the
# recorded output reflects stale kernel state. Re-run after a kernel restart.
data1.columns

Index(['location', 'size', 'total_sqft', 'bath', 'price', 'bhk'], dtype='object')

In [9]:
# Number of missing values per column of the working frame.
data1.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [35]:
# Drop every row that is missing a value in any of the modelling columns.
# 'size' was listed twice in the original subset; each column appears once here.
# Rebinding the name instead of using inplace=True leaves the previous frame
# untouched and keeps the cell safe to re-run.
data1 = data1.dropna(subset=['location', 'size', 'total_sqft', 'bath', 'price'])

In [42]:
# Replace strings such as "2 BHK" / "4 Bedroom" with their leading integer.
# The source column is taken from the cleaned frame `data1`: the raw
# `data['size']` still contains NaN, on which a per-row `x.split(...)` raises
# AttributeError. Casting to str first also lets the cell be re-run after the
# column has already been converted to integers. int64 is requested explicitly
# so the dtype does not depend on the platform's default integer width.
data1['size'] = (
    data1['size']
    .astype(str)
    .str.split(' ')
    .str[0]
    .astype('int64')
)


In [47]:
# Convert 'total_sqft' to a numeric dtype; entries that cannot be parsed as a
# number become NaN (errors='coerce').
# The input is the working frame's own column rather than the raw `data`
# column, so the result does not rely on implicit index alignment between the
# two frames.
data1['total_sqft'] = pd.to_numeric(data1['total_sqft'], errors='coerce')

In [48]:
# Remove the rows whose 'total_sqft' is NaN.
data1 = data1.dropna(subset=['total_sqft'])

In [49]:
# Map each distinct location label to an integer code and store the codes back
# in the 'location' column. `encoder` keeps the fitted label <-> code mapping.
# NOTE(review): the encoder is fitted on the full frame, before the
# train/test split performed further down.
encoder = LabelEncoder()
location_codes = encoder.fit_transform(data1['location'])
data1['location'] = location_codes

In [50]:
# NOTE(review): 'bhk' is selected as a feature, yet no visible cell creates it,
# so on a fresh kernel the selection below raises KeyError. The guard rebuilds
# the column from the parsed 'size' column and leaves an already-present 'bhk'
# untouched. This presumes 'bhk' held the same bedroom count as 'size' —
# confirm against the notebook's history.
if 'bhk' not in data1.columns:
    data1['bhk'] = data1['size']

# Feature matrix and regression target (price).
X = data1[['location', 'size', 'total_sqft', 'bath', 'bhk']]
y = data1['price']

In [51]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [52]:
# Median-impute the numeric feature columns: the medians are learned from the
# training split and then applied to both splits.
# NOTE(review): the fitting/prediction cells below use X_train / X_test
# directly, so X_train_numeric and X_test_numeric are not consumed anywhere in
# the visible notebook.
numeric_features = ['size', 'total_sqft', 'bath', 'bhk']
imputer = SimpleImputer(strategy='median')
X_train_numeric, X_test_numeric = (
    imputer.fit_transform(X_train[numeric_features]),
    imputer.transform(X_test[numeric_features]),
)

In [66]:
# Instantiate the three regressors that make up the ensemble; all share the
# same fixed seed so repeated runs give the same models.
seed_kwargs = {'random_state': 42}
rf_model = RandomForestRegressor(**seed_kwargs)
gb_model = GradientBoostingRegressor(**seed_kwargs)
xgb_model = XGBRegressor(**seed_kwargs)

In [67]:
# Train each regressor on the training split. The XGBoost fit stays as the
# final expression so the cell still displays the same object as before.
for estimator in (rf_model, gb_model):
    estimator.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

In [68]:
# Predict prices for the held-out rows with each fitted regressor, in the
# order random forest, gradient boosting, XGBoost.
rf_pred, gb_pred, xgb_pred = (
    fitted.predict(X_test) for fitted in (rf_model, gb_model, xgb_model)
)

In [69]:
# Equal-weight ensemble: stack the three prediction vectors row-wise and
# average them element by element.
ensemble_pred = np.stack([rf_pred, gb_pred, xgb_pred]).mean(axis=0)

In [70]:
# Error of the averaged predictions on the held-out split.
mse = mean_squared_error(y_true=y_test, y_pred=ensemble_pred)
print("Ensemble Mean Squared Error:", mse)

mae = mean_absolute_error(y_true=y_test, y_pred=ensemble_pred)
print("Ensemble Mean Absolute Error:", mae)

Ensemble Mean Squared Error: 8547.139095020157
Ensemble Mean Absolute Error: 36.37898346606146


In [72]:
# Coefficient of determination of the averaged predictions on the held-out
# split.
r_squared = r2_score(y_true=y_test, y_pred=ensemble_pred)
print("Ensemble R-squared:", r_squared)

Ensemble R-squared: 0.7139643635897177
