# **Import Library**

In [81]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

# **Read Dataset**

In [2]:
data = pd.read_csv('/content/Bengaluru_House_Data.csv')

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


# **Dataset Cleaning**

In [6]:
def print_value_counts(data):
  for column in data.columns:
    print(f"\nValue counts for column '{column}':")
    print(data[column].value_counts())

print_value_counts(data)


Value counts for column 'area_type':
area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

Value counts for column 'availability':
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64

Value counts for column 'location':
location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                

In [7]:
data.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [8]:
new_data = data.drop(['area_type', 'availability', 'society', 'balcony'], axis=1)
new_data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [9]:
new_data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
new_data['location'].value_counts()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,540
Sarjapur Road,399
Electronic City,302
Kanakpura Road,273
Thanisandra,234
...,...
3rd Stage Raja Rajeshwari Nagar,1
Chuchangatta Colony,1
"Electronic City Phase 1,",1
Chikbasavanapura,1


In [12]:
new_data['location'] = new_data['location'].fillna('Sarjapur Road')

In [13]:
data['size'].value_counts()

Unnamed: 0_level_0,count
size,Unnamed: 1_level_1
2 BHK,5199
3 BHK,4310
4 Bedroom,826
4 BHK,591
3 Bedroom,547
1 BHK,538
2 Bedroom,329
5 Bedroom,297
6 Bedroom,191
1 Bedroom,105


In [14]:
new_data['size'] = new_data['size'].fillna('2 BHK')

In [15]:
new_data['bath'] = new_data['bath'].fillna(new_data['bath'].median())

In [16]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [17]:
new_data['bhk'] = new_data['size'].str.split().str.get(0).astype(int)

In [18]:
new_data[new_data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [19]:
new_data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [20]:
def convertRanege(x):

  temp = x.split('-')
  if len(temp) == 2:
    return (float(temp[0]) + float(temp[1]))/2
  try:
    return float(x)
  except:
    return None

In [21]:
new_data['total_sqft'] = new_data['total_sqft'].apply(convertRanege)

In [22]:
new_data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


# Price Per Square feet

In [23]:
new_data['Price_per_sqft'] = new_data['price'] * 100000 / new_data['total_sqft']

In [24]:
new_data['Price_per_sqft']

Unnamed: 0,Price_per_sqft
0,3699.810606
1,4615.384615
2,4305.555556
3,6245.890861
4,4250.000000
...,...
13315,6689.834926
13316,11111.111111
13317,5258.545136
13318,10407.336319


In [25]:
new_data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,Price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [26]:
new_data['location'].value_counts()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,540
Sarjapur Road,399
Electronic City,302
Kanakpura Road,273
Thanisandra,234
...,...
Mango Garden Layout,1
Milk Colony,1
"Basnashankari,6th stage,",1
Near ullas theater,1


In [27]:
new_data['location'] = new_data['location'].apply(lambda x: x.strip())
location_count = new_data['location'].value_counts()

In [28]:
location_count_less_10 = location_count[location_count <= 10]
location_count_less_10

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
1st Block Koramangala,10
Dairy Circle,10
Nagadevanahalli,10
Sadashiva Nagar,10
Naganathapura,10
...,...
Xavier Layout,1
Ramanagara Channapatna,1
Maheswari Nagar,1
Hsr layout sector3,1


In [29]:
new_data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,Price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [30]:
(new_data['total_sqft'] / new_data['bhk']).describe()

Unnamed: 0,0
count,13274.0
mean,575.074878
std,388.205175
min,0.25
25%,473.333333
50%,552.5
75%,625.0
max,26136.0


In [31]:
new_data = new_data[((new_data['total_sqft'] / new_data['bhk']) >= 300)]
new_data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,Price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [32]:
def remove_outliers_sqft(df):
  df_output = pd.DataFrame()
  for key, subdf in df.groupby('location'):
    m = np.mean(subdf.Price_per_sqft)
    st = np.std(subdf.Price_per_sqft)

    gen_df = subdf[(subdf.Price_per_sqft > (m - st)) & (subdf.Price_per_sqft <= (m + st))]
    df_output = pd.concat([df_output, gen_df], ignore_index=True)
  return df_output
new_data = remove_outliers_sqft(new_data)
new_data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,Price_per_sqft
count,9324.0,9324.0,9324.0,9324.0,9324.0
mean,1509.649779,2.460854,94.45964,2.559846,5725.704373
std,899.036354,0.950378,110.669062,0.849226,2532.744932
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4260.30352
50%,1287.5,2.0,67.0,2.0,5188.45727
75%,1650.0,3.0,100.0,3.0,6405.370803
max,30400.0,14.0,2912.0,10.0,35000.0


In [33]:
def bhk_remove_outliers(df):
  exclude_indices = np.array([])
  for location, location_df in df.groupby('location'):
    bhk_stats = {}
    for bhk, bhk_df in location_df.groupby('bhk'):
      bhk_stats[bhk] = {
          'mean': np.mean(bhk_df.Price_per_sqft),
          'std': np.std(bhk_df.Price_per_sqft),
          'count': bhk_df.shape[0]
      }

      print(location, bhk_stats)
      for bhk, bhk_df in location_df.groupby('bhk'):
        stats = bhk_stats.get(bhk-1)
        if stats and stats['count'] > 5:
          exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.Price_per_sqft < (stats['mean'])].index.values)
  return df.drop(exclude_indices, axis='index')

In [34]:
new_data = bhk_remove_outliers(new_data)

1st Block BEL Layout {3: {'mean': np.float64(5519.480519480519), 'std': 0.0, 'count': 1}}
1st Block HBR Layout {1: {'mean': np.float64(7500.0), 'std': 0.0, 'count': 1}}
1st Block HBR Layout {1: {'mean': np.float64(7500.0), 'std': 0.0, 'count': 1}, 4: {'mean': np.float64(4761.9047619047615), 'std': 0.0, 'count': 1}}
1st Block HRBR Layout {2: {'mean': np.float64(5360.0), 'std': 0.0, 'count': 1}}
1st Block HRBR Layout {2: {'mean': np.float64(5360.0), 'std': 0.0, 'count': 1}, 3: {'mean': np.float64(3478.2608695652175), 'std': 0.0, 'count': 1}}
1st Block Jayanagar {2: {'mean': np.float64(11983.805668016194), 'std': 0.0, 'count': 1}}
1st Block Jayanagar {2: {'mean': np.float64(11983.805668016194), 'std': 0.0, 'count': 1}, 3: {'mean': np.float64(11756.16905248807), 'std': 701.6243657657865, 'count': 3}}
1st Block Jayanagar {2: {'mean': np.float64(11983.805668016194), 'std': 0.0, 'count': 1}, 3: {'mean': np.float64(11756.16905248807), 'std': 701.6243657657865, 'count': 3}, 4: {'mean': np.float

In [35]:
new_data.shape

(7535, 7)

In [36]:
new_data = new_data.drop(['size', 'Price_per_sqft'], axis=1)

# *Cleaned Dataset*

In [37]:
new_data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block BEL Layout,1540.0,3.0,85.0,3
1,1st Block HBR Layout,600.0,1.0,45.0,1
2,1st Block HBR Layout,3150.0,4.0,150.0,4
3,1st Block HRBR Layout,2300.0,3.0,80.0,3
4,1st Block HRBR Layout,1250.0,2.0,67.0,2


In [60]:
new_data.to_csv('Clean_data')

# Define Target Veriable

In [56]:
X = new_data.drop('price', axis=1)
y = new_data['price']

# Data Spliting

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Apply Encoding

In [75]:
colms_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ['location']),remainder='passthrough')

# Apply Scaling

In [76]:
scaler = StandardScaler()

# Apply Randomforest Model

In [77]:
random = RandomForestRegressor()

In [78]:
pipe = make_pipeline(colms_trans, StandardScaler(with_mean=False), random)

In [79]:
pipe.fit(X_train, y_train)

# Random forest Result

In [80]:
pred_rf = pipe.predict(X_test)
print('R2 Score', r2_score(y_test, pred_rf))

R2 Score 0.6869046137700419


In [82]:
import pickle

# Save Model using Pickle

In [83]:
# Saving the trained model
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(pipe, file)

print("Model saved successfully!")

Model saved successfully!
