In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Mumbai_House_Prices.csv')

In [3]:
data.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [4]:
def convert_price(row):
    if row['price_unit']=='L':
        return row['price']/100    #1 cr = 100 L
    elif row['price_unit'] == 'Cr':
        return row['price']
    else:
        return np.nan

data['price_in_cr'] = data.apply(convert_price, axis=1)
    

In [5]:
data[['price', 'price_unit', 'price_in_cr']].head()

Unnamed: 0,price,price_unit,price_in_cr
0,2.5,Cr,2.5
1,52.51,L,0.5251
2,1.73,Cr,1.73
3,59.98,L,0.5998
4,94.11,L,0.9411


In [6]:
data.drop(columns=['locality','status','age','type','price','price_unit'], inplace=True)

In [7]:
data


Unnamed: 0,bhk,area,region,price_in_cr
0,3,685,Andheri West,2.5000
1,2,640,Naigaon East,0.5251
2,2,610,Borivali West,1.7300
3,2,876,Panvel,0.5998
4,2,659,Mira Road East,0.9411
...,...,...,...,...
76033,3,1527,Juhu,7.0000
76034,5,3049,Juhu,12.0000
76035,4,3313,Napeansea Road,10.0000
76036,2,1305,Bandra East,4.2500


In [8]:
data.dropna(inplace=True)

In [9]:
data['region']=data['region'].apply(lambda x:x.strip())

In [10]:
data.region.value_counts()

region
Thane West         14868
Mira Road East      9902
Dombivali           3041
Kandivali East      2568
Kharghar            2362
                   ...  
Police Colony          1
GTB Nagar              1
Bandra                 1
Sector 14 Vashi        1
Goregaon               1
Name: count, Length: 228, dtype: int64

In [11]:
region_stats = data.groupby('region')['region'].agg('count').sort_values(ascending=False)

In [12]:
region_less_than_10_entries = region_stats[region_stats<=10].index

In [13]:
region_less_than_10_entries

Index(['Sector 20 Kamothe', 'Kanjurmarg East', 'Vasind', 'Koproli',
       'Maneklal Estate', 'Dahisar West', 'kasaradavali thane west',
       'Cuffe Parade', 'Rabale', 'Palava',
       ...
       'Sector 20 Ulwe', 'Sector 18 Kharghar', 'Sector 14 Vashi',
       'Sector 11 Koparkhairane', 'Roadpali', 'Rambaug', 'Police Colony',
       'Pestom Sagar Colony', 'Pen', 'vile parle west'],
      dtype='object', name='region', length=114)

In [14]:
data['region'] = data['region'].apply(lambda x:'other' if x in region_less_than_10_entries else x)

In [15]:
data['region'].value_counts().get('other',0)

np.int64(330)

In [16]:
data['bhk'].unique()

array([ 3,  2,  5,  1,  4,  6,  9,  8, 10,  7])

In [17]:
data.area.unique()

array([ 685,  640,  610, ..., 1974, 1634, 3049], shape=(2331,))

In [18]:
data['area'].value_counts()

area
650     1662
1050    1381
1100    1151
1650    1041
750      950
        ... 
2369       1
1945       1
1442       1
2672       1
3049       1
Name: count, Length: 2331, dtype: int64

In [20]:
data.dropna(inplace=True)

In [23]:
data.describe()

Unnamed: 0,bhk,area,price_in_cr,sqft_per_bed
count,76038.0,76038.0,76038.0,76038.0
mean,2.015111,1024.53685,1.684178,513.278131
std,0.922754,670.276165,2.176655,161.417111
min,1.0,127.0,0.0449,127.0
25%,1.0,640.0,0.64,403.5
50%,2.0,872.0,1.1,502.5
75%,3.0,1179.0,1.94,600.0
max,10.0,16000.0,60.0,4930.0


In [24]:
data['sqft_per_bed'] = data['area']/data['bhk']

In [25]:
data.sqft_per_bed.describe()

count    76038.000000
mean       513.278131
std        161.417111
min        127.000000
25%        403.500000
50%        502.500000
75%        600.000000
max       4930.000000
Name: sqft_per_bed, dtype: float64

In [26]:
data2 = data[data['sqft_per_bed'] >=300 ]

In [27]:
data2

Unnamed: 0,bhk,area,region,price_in_cr,sqft_per_bed
1,2,640,Naigaon East,0.5251,320.00
2,2,610,Borivali West,1.7300,305.00
3,2,876,Panvel,0.5998,438.00
4,2,659,Mira Road East,0.9411,329.50
5,2,826,Parel,3.3000,413.00
...,...,...,...,...,...
76033,3,1527,Juhu,7.0000,509.00
76034,5,3049,Juhu,12.0000,609.80
76035,4,3313,Napeansea Road,10.0000,828.25
76036,2,1305,Bandra East,4.2500,652.50


In [31]:
data2['price_per_sqft'] = round(data2['price_in_cr']*1000000/data2['area'],2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['price_per_sqft'] = round(data2['price_in_cr']*1000000/data2['area'],2)


In [32]:
data2.price_per_sqft

1         820.47
2        2836.07
3         684.70
4        1428.07
5        3995.16
          ...   
76033    4584.15
76034    3935.72
76035    3018.41
76036    3256.70
76037    4807.69
Name: price_per_sqft, Length: 71567, dtype: float64

In [33]:
data3 = data2[data2['price_per_sqft'] >= 2000]
data3

Unnamed: 0,bhk,area,region,price_in_cr,sqft_per_bed,price_per_sqft
2,2,610,Borivali West,1.73,305.00,2836.07
5,2,826,Parel,3.30,413.00,3995.16
9,2,671,other,2.72,335.50,4053.65
13,2,605,Jogeshwari West,1.55,302.50,2561.98
16,1,450,Powai,1.39,450.00,3088.89
...,...,...,...,...,...,...
76033,3,1527,Juhu,7.00,509.00,4584.15
76034,5,3049,Juhu,12.00,609.80,3935.72
76035,4,3313,Napeansea Road,10.00,828.25,3018.41
76036,2,1305,Bandra East,4.25,652.50,3256.70


In [34]:
data3.drop(columns = ['sqft_per_bed', 'price_per_sqft'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data3.drop(columns = ['sqft_per_bed', 'price_per_sqft'], axis=1, inplace=True)


In [35]:
data3

Unnamed: 0,bhk,area,region,price_in_cr
2,2,610,Borivali West,1.73
5,2,826,Parel,3.30
9,2,671,other,2.72
13,2,605,Jogeshwari West,1.55
16,1,450,Powai,1.39
...,...,...,...,...
76033,3,1527,Juhu,7.00
76034,5,3049,Juhu,12.00
76035,4,3313,Napeansea Road,10.00
76036,2,1305,Bandra East,4.25


In [36]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler #onehoteencoder creates binary values of characters

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
from sklearn.linear_model import LinearRegression

In [39]:
from sklearn.pipeline import make_pipeline

In [40]:
from sklearn.compose import make_column_transformer #for onehotencoder to work

In [60]:
col_trans = make_column_transformer((OneHotEncoder(sparse_output=False, handle_unknown='ignore'),['region']),remainder='passthrough')

In [61]:
lr= LinearRegression()

In [62]:
scaler = StandardScaler()

In [63]:
model = make_pipeline(col_trans,scaler,lr)

In [64]:
data_input = data3.drop(columns = ['price_in_cr'])
data_output = data3[['price_in_cr']]


In [65]:
data_input

Unnamed: 0,bhk,area,region
2,2,610,Borivali West
5,2,826,Parel
9,2,671,other
13,2,605,Jogeshwari West
16,1,450,Powai
...,...,...,...
76033,3,1527,Juhu
76034,5,3049,Juhu
76035,4,3313,Napeansea Road
76036,2,1305,Bandra East


In [66]:
x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.2)

In [67]:
model.fit(x_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [68]:
model.score(x_test, y_test)

0.831865429781381

In [69]:
input = pd.DataFrame([[3,1000,"Borivali West"]],columns = ['bhk','area','region'])

In [70]:
model.predict(input)

array([[2.58214203]])

In [71]:
import pickle as pk