In [1]:
import json  # will be needed for saving preprocessing details

import numpy as np  

import pandas as pd 

import matplotlib.pyplot as plt 

%matplotlib inline  

import matplotlib 

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression 

import joblib    # for saving algorithm and preprocessing objects

In [2]:
matplotlib.rcParams["figure.figsize"]=(20,10) 

df= pd.read_csv(r"Bengaluru_House_Data.csv") 

df.groupby('area_type')['area_type'].agg('count') 

df2=df.drop(['area_type','society','balcony','availability'],axis='columns') 

df2.isnull().sum() 

df3=df2.dropna() 

df3['size'].unique() 

df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0])) 

df3[df3.bhk>20] 

def is_float(x): 

    try: 

        float(x)  #try to convert a value if not possible then it is a exception 

    except: 

        return False  

     

    return True 

df3[~df3['total_sqft'].apply(is_float)].head(10) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0]))


Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [3]:
def convert_range(x): 

    Tokens = x.split('-') 

    if len(Tokens)==2: 

        return (float(Tokens[0]) +float(Tokens[1]))/2 

    try: 

        return float(x) 

    except: 

        return None

In [4]:
df4=df3.copy() 

df4['total_sqft'] = df4['total_sqft'].apply(convert_range) 

In [5]:
df5 = df4.copy() 

df5['price_per_sqft'] = df['price']*100000/df5['total_sqft'] 

df5.location = df5.location.apply(lambda x: x.strip()) 

In [6]:
loc_stats=df5.groupby('location') ['location'].agg('count').sort_values(ascending=False) 

len(loc_stats[loc_stats<=10].unique())

10

In [7]:
loc_less_than_10 = loc_stats[loc_stats<=10]
loc_less_than_10

location
Basapura                 10
1st Block Koramangala    10
Gunjur Palya             10
Kalkere                  10
Sector 1 HSR Layout      10
                         ..
1 Giri Nagar              1
Kanakapura Road,          1
Kanakapura main  Road     1
Karnataka Shabarimala     1
whitefiled                1
Name: location, Length: 1052, dtype: int64

In [8]:
df5.location = df5.location.apply(lambda x: 'other' if x in loc_less_than_10 else x ) 

df5[df5.total_sqft/df5.bhk<300].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.98081
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,100.0,3,20000.0
78,Kaval Byrasandra,2 BHK,460.0,1.0,22.0,2,4782.608696
89,Rajaji Nagar,6 Bedroom,710.0,6.0,160.0,6,22535.211268
119,Hennur Road,2 Bedroom,276.0,3.0,23.0,2,8333.333333
129,Vishwapriya Layout,7 Bedroom,950.0,7.0,115.0,7,12105.263158
149,other,6 Bedroom,1034.0,5.0,185.0,6,17891.682785


In [9]:
df6 = df5[~(df5.total_sqft/df5.bhk<300)] 

In [10]:
def remove_outliers(df):         #take dataframe as input 

    df_out = pd.DataFrame() 

    for key, subdf in df.groupby('location'): 

        m = np.mean(subdf.price_per_sqft) 

        st = np.std(subdf.price_per_sqft) 

        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))] 

        df_out = pd.concat([df_out,reduced_df] , ignore_index=True) 

    return df_out      

In [11]:
df7 = remove_outliers(df6) 

 

{ 

    '1':{ 

        'mean':4000, 

        'std':2000, 

        'count':34 

    }, 

    '2':{ 

         'mean':4300, 

        'std':2300, 

        'count':22 

    } 

} 

{'1': {'mean': 4000, 'std': 2000, 'count': 34},
 '2': {'mean': 4300, 'std': 2300, 'count': 22}}

In [12]:
def remove_bhk(df): 

    exclude_indices = np.array([]) 

    for location, location_df in df.groupby('location'): 

        bhk_stats = {} 

        for bhk, bhk_df in location_df.groupby('bhk'): 

            bhk_stats[bhk] = { 

                 'mean': np.mean(bhk_df.price_per_sqft), 

                 'std': np.std(bhk_df.price_per_sqft), 

                 'count':bhk_df.shape[0] 

            } 

        for bhk, bhk_df in location_df.groupby('bhk'): 

            stats = bhk_stats.get(bhk-1) 

            if stats and stats['count']>5: 

                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values) 

    return df.drop(exclude_indices, axis='index')

In [13]:
df8 = remove_bhk(df7) 

df9 = df8[df8.bath<df8.bhk + 2] 

df10 = df9.drop(['size','price_per_sqft'],axis='columns') 

dummies = pd.get_dummies(df10.location) 

df11 = pd.concat([df10,dummies],axis ='columns') 

df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis ='columns') 

df12 = df11.drop('location',axis='columns') 

In [14]:
X = df12.drop('price',axis='columns') 

y = df12.price 

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10) 

In [16]:
lr_clf = LinearRegression() 

lr_clf = lr_clf.fit(X_train.values , y_train.values)    #model is trained 

lr_clf.score(X_test.values , y_test.values) 

0.8452277697874291

In [17]:
joblib.dump(lr_clf, "./lr_clf.joblib", compress=True)

['./lr_clf.joblib']

In [18]:
def predict_price(location,sqft,bath,bhk): 
    loc_index = np.where(X.columns==location)[0][0]
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1
    return lr_clf.predict([x])[0]

In [19]:
predict_price('Vijayanagar',1000, 2, 2)

62.297093745810045

In [26]:
locations = df5.location.unique()
BHK = X.bhk.unique()
BHK = np.sort(BHK)
Bath = X.bath.unique()
Bath = np.sort(Bath)
min_sqft = X.total_sqft.min()
max_sqft = X.total_sqft.max()

Variables = [locations, BHK, Bath, min_sqft, max_sqft]
joblib.dump(Variables, './Variable.joblib', compress=True)

['./Variable.joblib']