In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
df = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
df.head()

In [3]:
df.shape

In [4]:
df.groupby(['area_type'])['area_type'].value_counts()

In [5]:
df=df.drop(['area_type', 'availability', 'society','balcony'],axis=1)

In [6]:
df.head()

In [7]:
df.isnull().sum()

In [8]:
df = df.dropna()

In [9]:
df.isna().sum()

In [10]:
df['size'].value_counts()
#we need to convert all Bedroom to BHK

In [11]:
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))

In [12]:
df['bhk'].value_counts()

In [13]:
df['total_sqft'].unique()

In [14]:
def avg(x):
    """Convert one raw `total_sqft` entry to a float.

    A value written as a range with a single hyphen ("1000 - 1200") is
    replaced by the midpoint of its two ends. Anything else is parsed as a
    plain number.

    Parameters
    ----------
    x : str or number
        The raw cell value.

    Returns
    -------
    float or None
        The parsed area, or None when the value cannot be read as a number
        or a numeric range (for example "34.46Sq. Meter").
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "-5" or "1e-3" also split into
                # two tokens); fall through and try the value as a whole.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures mean "unparseable"; anything else should
        # surface instead of being silently swallowed.
        return None
    

In [15]:
df['total_sqft'] = df['total_sqft'].apply(avg)

In [16]:
df['total_sqft'].unique()

In [17]:
# Let us now do some feature engineering, we will create a column called price per sq_feet
df['price_per_sqft'] = df['price']*100000/df['total_sqft']
df.head(10)

In [18]:
# Let us go through location data
df['location'].value_counts()

In [19]:
# Strip surrounding whitespace from every location name and store the result
# back on the frame; without the assignment the stripped values are discarded
# and names that differ only by whitespace stay separate categories.
df['location'] = df['location'].apply(lambda x: x.strip())
df['location']

In [20]:
location_stats = df.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats


In [21]:
# Usually to deal with categorical data we would use one-hot encoding, but in this case that would lead to an additional
# 1304 columns, which is far too many features, so we are going to take all the locations that have 10 or fewer occurrences
# and relabel them as 'other' to simplify our ML algorithm
len(location_stats[location_stats <= 10])

In [22]:
location_less_than_10 = location_stats[location_stats <= 10]
location_less_than_10

In [23]:
df['location'] = df['location'].apply(lambda x: 'other' if x in location_less_than_10 else x)
len(df['location'].unique())
# Now we will only have 242 columns when we convert to OHE (One-Hot-Encoding)

In [24]:
df['location'].value_counts()

In [25]:
# Now let us do some Outlier detection
# Let us look at sq_ft per bedroom 
df[df['total_sqft']/df['bhk'] < 300]


In [26]:
df = df[~(df['total_sqft']/df['bhk'] < 300)]
df.shape

In [27]:
df['price_per_sqft'].describe()
# Since we are building a generic model we will remove the extreme cases.
# Let us write a function that can do this based on standard deviation
# We will keep the data that is within 1 standard deviation of the mean, so about 68% of the data
# All of this will be done per location

In [28]:
def remove_pps_outliers(df, n_std=1):
    """Drop rows whose price per square foot is far from their location's mean.

    For every `location` group, the mean and the population standard
    deviation (numpy's default, ddof=0) of `price_per_sqft` are computed and
    only rows strictly inside ``mean - n_std*std < price_per_sqft <
    mean + n_std*std`` are kept. Rows with a missing `price_per_sqft` fail
    both comparisons and are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `location` and `price_per_sqft`.
    n_std : number, optional
        Half-width of the accepted band in standard deviations (default 1).

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped by location in sorted-key order, with a
        fresh 0..n-1 index. An empty frame is returned when `df` has no
        groups.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        mean = np.mean(pps)
        spread = n_std * np.std(pps)
        kept.append(subdf[(pps > (mean - spread)) & (pps < (mean + spread))])

    if not kept:
        # pd.concat refuses an empty list; mirror the original's empty result.
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per group.
    return pd.concat(kept, ignore_index=True)
    
    
df = remove_pps_outliers(df)
df.shape

In [29]:
""" Let us visualize if there are any houses of similar sqft where the price of a 2 BHK > 3BHK, this could be cause of 
many reasons such as spacial amenities etc. """

def plot_scatter_chart(df,location):
    """Scatter total price against total area for one location's 2 and 3 BHK homes.

    The 2 BHK rows are drawn as blue circles and the 3 BHK rows as green
    plus signs, using the pyplot state machine. As a side effect the global
    default figure size is set to 15x10. Nothing is returned.
    """
    in_location = df.location == location
    panels = [
        (df[(df.bhk == rooms) & in_location], colour, marker, label)
        for rooms, colour, marker, label in (
            (2, 'b', 'o', '2 BHK'),
            (3, 'green', '+', '3 BHK'),
        )
    ]

    matplotlib.rcParams['figure.figsize'] = (15,10)
    for homes, colour, marker, label in panels:
        plt.scatter(homes.total_sqft, homes.price, color=colour, marker=marker, label=label)

    plt.xlabel('Total Square Footage')
    plt.ylabel('Total Price in Lakhs')
    plt.title(location)
    plt.legend()
    

In [30]:
plot_scatter_chart(df,'Whitefield')

In [31]:
plot_scatter_chart(df,'Rajaji Nagar')

In [32]:
plot_scatter_chart(df,'Electronic City')

In [33]:
""" Since this is a general model, we will remove the 'outliers' where the price of a 3BHK is greater than the
price of a 2BHK given that they have the same square footage"""
def remove_bhk_outliers(df, min_count=5):
    """Drop homes priced per square foot below the typical smaller home nearby.

    Within each `location`, the mean `price_per_sqft` and the row count are
    computed for every `bhk` value. A row with `bhk == n` is removed when its
    `price_per_sqft` is strictly below the mean of the `bhk == n - 1` group
    in the same location, provided that reference group has more than
    `min_count` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `location`, `bhk` and `price_per_sqft`.
        Rows are removed by index label, so every row sharing a removed
        label is dropped.
    min_count : int, optional
        The reference group must have more than this many rows to be used
        (default 5).

    Returns
    -------
    pandas.DataFrame
        `df` without the flagged rows; the remaining index labels are kept.
    """
    # Labels are collected in a plain list so they keep their original type
    # (accumulating them in a float numpy array would coerce integer labels).
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Materialise the per-bhk groups once; they are needed for both passes.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
            for bhk, bhk_df in bhk_groups
        }

        # Compare every bhk group with the group that has one bedroom fewer.
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count'] > min_count:
                exclude_labels.extend(bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index)

    return df.drop(exclude_labels,axis='index')


In [34]:
df = remove_bhk_outliers(df)
df.shape

In [35]:
plot_scatter_chart(df,'Electronic City')

In [36]:
plot_scatter_chart(df,'Whitefield')

In [37]:
matplotlib.rcParams['figure.figsize'] = (20,10)
plt.hist(df.price_per_sqft,rwidth=0.8)
plt.xlabel("Price per Square Feet")
plt.ylabel("Count")

#Sort of like a normal dist

In [38]:
#Let us look at bathrooms
plt.hist(df.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [39]:
""" It would be considered abnormal if the number of bathrooms were to exceed the number of rooms by >2 """
df[df['bath']-df['bhk'] >2]

In [40]:
# Keep homes with at most two more bathrooms than bedrooms, i.e. drop exactly
# the rows previewed in the previous cell (bath - bhk > 2). The earlier strict
# `<` also discarded homes with bath == bhk + 2.
df = df[df.bath <= df.bhk + 2]
df.shape

In [41]:
df.head(10)

In [42]:
""" Let us create a new df where we drop price_per_sqft since it served as a way to detect outliers and size since we 
already have bhk"""
df10 = df.drop(['size','price_per_sqft'],axis='columns')
df10.head(10)

In [43]:
""" We are going to use K-fold cross validation and grid search cv to come up with the best parameters 

K-fold cross validation: https://machinelearningmastery.com/k-fold-cross-validation/

GridSearch CV: https://www.mygreatlearning.com/blog/gridsearchcv/


Let us convert our categorical location data into data our model can use, using OHE, aka dummies in pandas"""
dummies = pd.get_dummies(df10.location)
dummies.head(10)

In [44]:
df11 = pd.concat([df10,dummies.drop(['other'],axis='columns')],axis='columns')
df11.head(5)

In [45]:
df12 = df11.drop(['location'],axis='columns')
df12.head(5)

In [46]:
X = df12.drop('price',axis='columns')
X.head(5)

In [47]:
y = df12.price
y.head(5)

In [49]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [54]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,Y_train)
lr.score(X_test,Y_test)

In [61]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)

cross_val_score(LinearRegression(),X, y,cv=cv)

In [65]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model(X,y):
    """Grid-search three regressors and report the best setting of each.

    Linear regression, lasso and a decision tree are each tuned with
    GridSearchCV over a small parameter grid, using 5 shuffled 80/20 splits
    (random_state=0).

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators
    y : target values aligned with `X`

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns 'model', 'best params' and
        'best score' (the search's `best_params_` and `best_score_`).
    """
    algos = {
        'linear regression' : {
            'model': LinearRegression(),
            'params':{
                # The `normalize` option was removed from LinearRegression in
                # scikit-learn 1.2, so the grid tunes `fit_intercept` instead.
                'fit_intercept':[True,False]
            }
        },
        'lasso':{
            'model': Lasso(),
            'params':{
                'alpha':[1,2],
                'selection':['random','cyclic']
            }
        },
        'decision tree':{
            'model':DecisionTreeRegressor(),
            'params': {
                # 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and
                # the old spelling was removed in 1.2.
                'criterion':['squared_error','friedman_mse'],
                'splitter':['best','random']
            }
        }
    }

    scores=[]
    cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state = 0)
    for algo_name,config in algos.items():
        gs = GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model':algo_name,
            'best params':gs.best_params_,
            'best score':gs.best_score_
        })

    return pd.DataFrame(scores,columns=['model','best params','best score'])


find_best_model(X,y)

In [66]:
X.columns

In [88]:
np.where(X.columns=='Indira Nagar')[0][0]

In [95]:
def predict_price(bath,bhk,location,sqft):
    """Predict a home's price with the fitted model `lr`.

    One feature row is built with the same width as the notebook-level
    frame `X`: position 0 holds the area, position 1 the bathroom count,
    position 2 the bedroom count, and the column named `location` (if any)
    is set to 1.

    Parameters
    ----------
    bath : number
        Number of bathrooms.
    bhk : number
        Number of bedrooms.
    location : str
        Name of a location column in `X`. A location without a column of
        its own (for example one that was folded into 'other') leaves every
        location flag at 0 instead of raising IndexError.
    sqft : number
        Total area in square feet.

    Returns
    -------
    The first element of `lr.predict` for the constructed row.
    """
    matches = np.where(X.columns == location)[0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if matches.size > 0:
        x[matches[0]] = 1

    return lr.predict([x])[0]

In [96]:
predict_price(2,2,'Indira Nagar',1500)

In [103]:
predict_price(2,5,'Whitefield',2000)