House Price Prediction

In [None]:
import pandas as pd
import numpy as np

dataset = "bengaluru_house_prices.csv"

df = pd.read_csv(dataset)
print(df.isna().sum())



In [None]:
# Dropping unnecessary features

c = ["area_type","availability","society","balcony"]
df.drop(columns=c,inplace=True)

In [None]:
# Getting the number of BHK in standard formate

size = []

for i in df['size']:
    size = df['size'].str.strip().str.get(0).astype(int)

df['size']=size.astype(int)   

In [None]:
# Filling nan values using imputer

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")
df[['size']] = imputer.fit_transform(df[['size']])

c = SimpleImputer(strategy="most_frequent")
df[['location']] = c.fit_transform(df[['location']])

i = SimpleImputer(strategy='mean')
df[['total_sqft']] = i.fit_transform(df[["total_sqft"]])

In [None]:
df.iloc[2500:2505] # for printing a specific range of values

df.dropna(subset=['bath'],inplace=True) # removing nan values of bath

In [None]:
#helper function to remove range

def range_fix(x):
    temp = x.split("-")
    if len(temp)==2:
        return (float(temp[0])+float(temp[1]))//2

    try:
        return x
    except :
        return None
    
df['total_sqft'] = df['total_sqft'].apply(range_fix)

# force convertion from object to float
df['total_sqft'] = pd.to_numeric(df['total_sqft'],errors='coerce')

# Adding a new feature for square feet rate
df['sqft_rate'] = ((df['price'] * 1000000) / df['total_sqft']).round(2)

In [None]:
for c in df.columns:
    print(c , " : ",df[c].dtype)

In [None]:
#taking a count of all location

location_count = df['location'].value_counts()

#replace location name with others if there occurance is less then 10

location_count_10 = location_count[location_count<10].index.to_list()
df['location'] = df['location'].apply(lambda x : "Other" if x in location_count_10 else x)


In [None]:
# Handling outliers

# outliers exists in bhk total_sqft:1,53k , 

data = df[(df['total_sqft']/df['size'])>=300]
data = data.rename(columns={'size':'BHK','bath':'Bathroom','location':'Location','total_sqft':'Size'}) # renaming columns

import numpy as np

def remove_outlier_sqft(df):
    df_output= pd.DataFrame()
    for key , subdf in df.groupby('location'):
        m = np.mean(subdf.sqft_rate)
        st = np.std(subdf.sqft_rate)
        gen_df = subdf[(subdf.sqft_rate>(m-st)) & (subdf.sqft_rate<=(m+st))]
        df_output = pd.concat([df_output,gen_df],ignore_index=True)
    return df_output
data = remove_outlier_sqft(data)
data.describe()

def bhk_outlier_remover(df):
    excluder_indices = np.array([])
    for location , location_df in df.groupby('location'):
        bhk_stat = []
        for bhk , bhk_df in location_df.groupby('BHK'):
            bhk_stat[BHK]={
                'mean' : np.mean(bhk_df.sqft_rate),
                'std' : np.std(bhk_df.sqft_rate),
                'count' : bhk_df.shaper[0]
            }
        for bhk , bhk_df in location_df.groupby('BHK'):
            stats = bhk_stat.get(bhk-1)
            if stats and stats['count']>5:
              excluder_indices= np.append(excluder_indices,bhk_df[bhk_df.sqft_rate<(stats['mean'])].index.values)  
    return df.drop(excluder_indices,axis='index')

Cleaned data

In [None]:
data = data.drop(columns=(('sqft_rate')),inplace=True)
data.to_csv('Cleaned Data')

In [None]:
data['Location'].unique()

Training the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression , Lasso , Ridge
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

import pandas as pd

X = data.drop(columns=['Cost'])
y = data.Cost

X_train , X_test , y_train , y_test = train_test_split(X,y,random_state=42,test_size=0.2)

print(X_train.shape,"\n",X_test.shape,'\n',y_train.shape,'\n',y_test.shape)

In [None]:
column_transformer = make_column_transformer((OneHotEncoder(sparse_output=False),['Location']),remainder='passthrough')
scaler = StandardScaler()
lr = LinearRegression()
pipe = make_pipeline(column_transformer,scaler,lr)
pipe.fit(X_train,y_train)

In [209]:
y_pred_lr = pipe.predict(X_test)
print(r2_score(y_test,y_pred_lr))

0.43923361801065997


In [None]:
print(y_pred_lr)

[ 52.80772757 -33.49937313  58.08194659 ... 136.41245288 286.26727451
 115.67174124]
