# Bangalore House Price Prediction - Supervised Regression Problem

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset from a CSV file (path is relative to the working directory).
path = r"Bangalore house data.csv"
df= pd.read_csv(path)
# Cell output: (number of rows, number of columns) of the raw data.
df.shape


# Exploratory Data Analysis

In [None]:
# Print the column names, non-null counts and dtypes of the raw data.
df.info()

In [None]:
# We have only 3 numerical features - bath, balcony and price
# 4 categorical features - area_type, size, society, and total_sqft
# Target feature =======>>>>>> price >>>>>>
# Price is in lakh

In [None]:
# Summary statistics of the raw data.
df.describe()
# Author's observation: compare the 75% value with the max value - the difference is huge.

In [None]:
# Plot the pairwise relationships between the columns of df.
sns.pairplot(df)

# Author's observation: bath and price show a slightly linear relationship, with some outliers.

In [None]:
# value count of each feature
def value_count(df):
  """Print the value counts of every column of ``df``.

  For each column, the result of ``value_counts()`` is printed, followed by
  a dashed separator line. Nothing is returned.
  """
  separator = "--------------------------------"
  for column_name in df.columns:
    column_counts = df[column_name].value_counts()
    print(column_counts)
    print(separator)

In [None]:
# Print the value counts of every column of the raw data.
value_count(df)

In [None]:
# Correlation heatmap of the bath, balcony and price columns.
num_vars = ["bath", "balcony", "price"]
correlations = df[num_vars].corr()
sns.heatmap(correlations, cmap="coolwarm", annot=True)

# Author's observation: bath correlates with price more strongly than balcony does.

## Data cleaning

In [None]:
df.isnull().sum() # number of missing values in each column

In [None]:
df.isnull().mean()*100 # percentage of missing values in each column

# Author's observation: society has 41.3% missing values (needs to be dropped).

In [None]:
# Visualize the missing values with a heatmap to see where in the data they occur.

# plt.figure(figsize=(16,9))
sns.heatmap(df.isnull())

In [None]:
# Drop the 'society' feature: the author reports that 41.3% of its values are missing.
df2 = df.drop(columns=['society'])
df2.shape

In [None]:
# Number of missing values per column after dropping 'society'.
df2.isnull().sum()

In [None]:
# Fill the missing 'balcony' values with the column mean
# (the author reports that about 4.5% of the values are missing).
balcony_mean = df2['balcony'].mean()
df2['balcony'] = df2['balcony'].fillna(balcony_mean)
df2.isnull().sum()

In [None]:
# Drop every row of df2 that still contains a missing value;
# the author notes that only a very small share of values is missing.
df3 = df2.dropna()
df3.shape

In [None]:
# Check the number of missing values per column after dropna().
df3.isnull().sum()

In [None]:
# Preview the first rows of the cleaned data.
df3.head()

In [None]:
# Column names, non-null counts and dtypes of the cleaned data.
df3.info()

## Feature Engineering

In [None]:
# Show all the columns and all the rows whenever a DataFrame is displayed.
for display_option in ("display.max_columns", "display.max_rows"):
  pd.set_option(display_option, None)

### Converting the categorical 'total_sqft' feature to numeric

In [None]:
df3['total_sqft'].value_counts()

# Here we observe that 'total_sqft' contains string values in different formats:
# float- or int-like values: 1689.28, 817
# range values: 540 - 740
# a number followed by a unit: 142.84Sq. Meter, 117Sq. Yards, 1Grounds

# The best strategy is to convert it into a number by splitting it.

In [None]:
def _parse_total_sqft(raw_value):
  """Convert one raw 'total_sqft' entry to a float, or NaN if it cannot be parsed.

  Accepted forms:
    * a plain number such as '1689.28' or '817';
    * a range such as '540 - 740', which is replaced by the mean of its
      first and last '-'-separated parts.
  Anything else (for example '142.84Sq. Meter') yields np.nan.
  """
  try:
    return float(raw_value)
  except (TypeError, ValueError):
    pass  # not a plain number; fall through to the range form
  try:
    bounds = raw_value.split('-')
    return (float(bounds[0]) + float(bounds[-1])) / 2
  except (AttributeError, TypeError, ValueError):
    # Only parsing failures become NaN. Unrelated exceptions (for example
    # KeyboardInterrupt) are deliberately not swallowed.
    return np.nan


# One parsed value per row of df3, in row order.
total_sqft_int = [_parse_total_sqft(str_val) for str_val in df3['total_sqft']]

In [None]:
# Reset the index of the DataFrame so that it runs 0..n-1 again
df4 = df3.reset_index(drop=True) # drop=True - don't add the old index as a column in df

In [None]:
# Join df4 with the total_sqft_int list as a new 'total_sqft_int' column (matched on the index).
total_sqft_frame = pd.DataFrame({'total_sqft_int': total_sqft_int})
df5 = df4.join(total_sqft_frame)
df5.head()

In [None]:
# Number of missing values per column, including the new 'total_sqft_int'.
df5.isnull().sum()

In [None]:
# Drop the rows that contain a missing value
# (this includes rows whose total_sqft could not be converted and became NaN).
df6 = df5.dropna()
df6.shape

In [None]:
# Column names, non-null counts and dtypes after adding 'total_sqft_int'.
df6.info()

## Working on <<<< Size >>>> feature

In [None]:
df6['size'].value_counts()

# The size feature shows the number of rooms.

In [None]:
"""
In the size feature we assume that
2 BHK == 2 Bedroom == 2 RK,
so we take only the number and remove the suffix text.
"""
# Extract the leading room count from every 'size' entry of df6.
size_int = []
for str_val in df6['size']:
  # The room count is the first space-separated token, e.g. "2 BHK" -> "2".
  leading_token = str_val.split(" ")[0]
  try:
    size_int.append(int(leading_token))
  except ValueError:
    # The token is not an integer: record the entry as missing and report it.
    # Only the int() parsing failure is caught, so unrelated errors still surface.
    size_int.append(np.nan)
    print("Noice = ",str_val)

In [None]:
# Reset the index to 0..n-1 (the old index is discarded, not kept as a column).
df6 = df6.reset_index(drop=True)

In [None]:
# Join df6 with the size_int list as a new 'bhk' column (matched on the index).
bhk_frame = pd.DataFrame({'bhk': size_int})
df7 = df6.join(bhk_frame)
df7.shape

In [None]:
# Preview the last rows to check the new 'bhk' column.
df7.tail()

In [None]:
# Here we assume that 1 BHK requires a minimum area of 350 sqft;
# show the rows whose area per BHK is below that.
df7[df7['total_sqft_int']/df7['bhk'] < 350]


In [None]:
# Remove the rows whose area per BHK is below 350 sqft.
# The mask is negated rather than written as ">= 350" so that rows where the
# ratio is NaN (the "< 350" comparison is False for NaN) are kept.
sqft_per_bhk = df7['total_sqft_int'] / df7['bhk']
too_small = sqft_per_bhk < 350
df8 = df7[~too_small]
df8.shape

In [None]:
# Create a new feature: price per square foot.

# 'price' is in lakh, so convert it to rupees (x 100000) and divide by total_sqft_int.
# df8 was produced by filtering df7, so assigning a column to it in place can
# raise pandas' SettingWithCopyWarning; assign() returns a new DataFrame instead.
df8 = df8.assign(price_per_sqft=df8['price']*100000 / df8['total_sqft_int'])
df8.head()

In [None]:
df8.price_per_sqft.describe()

# Author's observation: there is a huge difference between the min and max price_per_sqft.

# Working on <<<< Bath >>>> feature

In [None]:
# Distinct values that occur in the 'bath' column.
df8.bath.unique()

In [None]:
# Show the rows that have more than bhk + 2 bathrooms.
df8[df8.bath > df8.bhk+2]

In [None]:
# Keep only the rows with fewer than bhk + 2 bathrooms
# (for whole-number counts: at most bhk + 1 bathrooms).
bath_limit = df8['bhk'] + 2
df9 = df8[df8['bath'] < bath_limit]
df9.shape

In [None]:
# Preview the first rows after the bathroom filter.
df9.head()

# Categorical Variable Encoding

In [None]:
# Drop the raw text columns; the code above derived 'bhk' and 'total_sqft_int' from them.
df10 = df9.drop(columns=["size", "total_sqft"])
df10.head()

In [None]:
# Column names, non-null counts and dtypes before encoding the categorical features.
df10.info()

## Working on <<<<<< area_type >>>>> feature

In [None]:
# Number of rows for each 'area_type' category.
df10['area_type'].value_counts()

In [None]:
# Apply one-hot encoding to the 'area_type' feature on a copy of df10.
# Each listed category gets an indicator column named "area_type" + category
# (no separator); rows whose area_type is none of these get 0 in all three.
df11 = df10.copy()
area_type_categories = ["Super built-up  Area","Built-up  Area","Plot  Area"]
for cat_var in area_type_categories:
  is_category = df11['area_type'] == cat_var
  df11["area_type"+cat_var] = np.where(is_category, 1, 0)
df11.shape

In [None]:
# Preview the first two rows with the new area_type indicator columns.
df11.head(2)

In [None]:
# Column names, non-null counts and dtypes after encoding 'area_type'.
df11.info()

# Working on <<<< Location >>>> feature

In [None]:
# Number of rows for each distinct location.
location_value_count = df11['location'].value_counts()
location_value_count

In [None]:
# Keep only the locations that occur at least 20 times.
is_frequent_location = location_value_count >= 20
location_gert_20 = location_value_count[is_frequent_location].index
location_gert_20

In [None]:
# For every location that occurs at least 20 times, add an indicator column
# 'location_<name>' that is 1 where the row's location matches and 0 otherwise
# (one-hot encoding). Rows with rarer locations get 0 in all of these columns.
# All indicator columns are built first and attached with a single concat:
# inserting them one at a time fragments the DataFrame and can trigger
# pandas' PerformanceWarning when there are many locations.
location_flags = pd.DataFrame(
  {'location_'+cat_var: np.where(df11['location']==cat_var, 1,0)
   for cat_var in location_gert_20},
  index=df11.index,
)
df12 = pd.concat([df11, location_flags], axis=1)
df12.shape

In [None]:
# Preview the first rows with the new location indicator columns.
df12.head()

## Drop categorical variable

In [None]:
# Drop the original text columns now that their indicator columns exist.
df13 = df12.drop(columns=["area_type", 'location'])
df13.shape

In [None]:
# Column names, non-null counts and dtypes of the final data.
df13.info()

In [None]:
# Preview the first rows of the final data.
df13.head()

In [None]:
# Write the processed DataFrame to 'data.csv' without the index column.
df13.to_csv('data.csv', index=False)