In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
df.shape

(13320, 9)

In [None]:
#df['area_type'].value_counts() or
df.groupby('area_type').size() # both provide same output

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
dtype: int64

In [None]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [None]:
# Finding the correlation between features

# We can either use  Visualization or .corr() Fuction for Correlation

#.corr() => 1 indicates perfect positive correlation , -1 indicates negative correlation, 0 indicates no correlation

print("Bath price correlation:",df['bath'].corr(df['price']))
print("Balcony price correlation{}".format(df['balcony'].corr(df['price'])))



Bath price correlation: 0.45634510346350105
Balcony price correlation0.12035530938067719


In [None]:
# Dropping the columns that does not have a significant impact on the prices
df1=df.drop(['area_type', 'availability','society','balcony'],axis=1)
df1.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [None]:
df1.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [None]:
df2 = df1.dropna()
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [None]:
df2.isnull().sum()
#all null values are removed

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [None]:
df2['size'].unique() # Here the  column is inconsistently labeled

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [None]:
df2['BHK'] = df2['size'].astype('str').apply(lambda x :int(x.split(' ')[0]))

df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['BHK'] = df2['size'].astype('str').apply(lambda x :int(x.split(' ')[0]))


Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [None]:
df2.dtypes

location       object
size           object
total_sqft     object
bath          float64
price         float64
BHK             int64
dtype: object

In [None]:
df2['BHK'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [None]:
# Above case we can see that some home has 43 bedrooms which seems quit odd

# Appx atleast 400 - 500 sqft is required for a 1 BHK house

df2[df2['BHK']>15]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
3379,1Hanuman Nagar,19 BHK,2000,16.0,490.0,19
3609,Koramangala Industrial Layout,16 BHK,10000,16.0,550.0,16
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43
11559,1Kasavanhalli,18 Bedroom,1200,18.0,200.0,18


In [None]:
#from using dtypes we found that still total_sqft is defined as object


df2['total_sqft'].unique()

#and by seeing the unique values we can see that some data is in the range like ' 1133-1382

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [None]:
def is_float(x):
  try:
    float(x)
  except:
    return False
  return True

  # using this code we are trying to convert the data in 'total sqft' into float , if its not valid it will throw  an exception

In [None]:
df2[~ df2['total_sqft'].apply(is_float)].head(10) #which are not converted to float are displayed

Unnamed: 0,location,size,total_sqft,bath,price,BHK
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [None]:
def convert_sqft_to_num(x) :
  tokens =x.split('-')
  if len(tokens)==2 :
    return(float(tokens[0])+float(tokens[1]))/2
  try:
     return float(x)
  except:
    return None



In [None]:
#Checking the function
print(convert_sqft_to_num('100-200'))
print(convert_sqft_to_num('34.46Sq. Meter'))
print(convert_sqft_to_num('45'))

150.0
None
45.0


In [None]:
df3 = df2.copy()

In [None]:
df3['total_sqft']= df3['total_sqft'].apply(convert_sqft_to_num)


In [None]:
df3.head(10)

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2
5,Whitefield,2 BHK,1170.0,2.0,38.0,2
6,Old Airport Road,4 BHK,2732.0,4.0,204.0,4
7,Rajaji Nagar,4 BHK,3300.0,4.0,600.0,4
8,Marathahalli,3 BHK,1310.0,3.0,63.25,3
9,Gandhi Bazar,6 Bedroom,1020.0,6.0,370.0,6


In [None]:
df3.dtypes


location       object
size           object
total_sqft    float64
bath          float64
price         float64
BHK             int64
dtype: object

In [None]:
#Price per sqft calculation

df4 = df3.copy()

df4['Price_per_sqft']= (df4['price']*100000)/df4['total_sqft']
df4.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK,Price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [None]:
len(df4['location'].unique())

# so any unique location is there which may cause so one hot coding can cause dimentional issues

1304

In [None]:
df4['location'] = df4['location'].apply(lambda x : x.strip())
location_stats = df4['location'].value_counts().sort_values(ascending=False)
location_stats

location
Whitefield                       535
Sarjapur  Road                   392
Electronic City                  304
Kanakpura Road                   266
Thanisandra                      236
                                ... 
Old Mangammanapalya Road           1
HAL Layout                         1
Gubbi Cross, Hennur Main Road      1
Jeevanhalli                        1
Abshot Layout                      1
Name: count, Length: 1293, dtype: int64

In [None]:
len(location_stats[location_stats<=10])

1052

In [None]:
location_stats_less_than_10 = location_stats[location_stats<=10]
print(location_stats_less_than_10)


location
Ganga Nagar                      10
Nagadevanahalli                  10
Dodsworth Layout                 10
1st Block Koramangala            10
Kalkere                          10
                                 ..
Old Mangammanapalya Road          1
HAL Layout                        1
Gubbi Cross, Hennur Main Road     1
Jeevanhalli                       1
Abshot Layout                     1
Name: count, Length: 1052, dtype: int64


In [None]:
len(df4['location'].unique())

1293

In [None]:
#Converting all location which is less that 10 to others

df4['location'] = df4['location'].apply(lambda x: 'Other' if x in location_stats_less_than_10 else x)
len(df4['location'].unique())

242

In [None]:
df4.location.head(15)

0     Electronic City Phase II
1             Chikka Tirupathi
2                  Uttarahalli
3           Lingadheeranahalli
4                     Kothanur
5                   Whitefield
6             Old Airport Road
7                 Rajaji Nagar
8                 Marathahalli
9                        Other
10                  Whitefield
11                  Whitefield
12          7th Phase JP Nagar
13                   Gottigere
14                    Sarjapur
Name: location, dtype: object

In [None]:
df4.head(20)

Unnamed: 0,location,size,total_sqft,bath,price,BHK,Price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0
5,Whitefield,2 BHK,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,4 BHK,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,4 BHK,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,3 BHK,1310.0,3.0,63.25,3,4828.244275
9,Other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804


In [None]:
# For a single bhk lets keep  the average sqft required is 300 sqft

df4[df4['total_sqft']/df4['BHK']<300]

#From below cases we can see the anomalies data sets where the sqft and bhk doesnt justify

Unnamed: 0,location,size,total_sqft,bath,price,BHK,Price_per_sqft
9,Other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.980810
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,Other,3 Bedroom,500.0,3.0,100.0,3,20000.000000
...,...,...,...,...,...,...,...
13277,Other,7 Bedroom,1400.0,7.0,218.0,7,15571.428571
13279,Other,6 Bedroom,1200.0,5.0,130.0,6,10833.333333
13281,Margondanahalli,5 Bedroom,1375.0,5.0,125.0,5,9090.909091
13303,Vidyaranyapura,5 Bedroom,774.0,5.0,70.0,5,9043.927649


In [None]:
df5 = df4[~(df4['total_sqft']/df4['BHK']<300)]
df5

# here the ~ will remove all the data points where it is less than 300

Unnamed: 0,location,size,total_sqft,bath,price,BHK,Price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,Other,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


In [None]:
df5['Price_per_sqft'].describe()

#There are some unlikely price per prediction data

count     12456.000000
mean       6308.502826
std        4168.127339
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: Price_per_sqft, dtype: float64

In [None]:
# Based on domain expertise suggestion

# minimum sqft of the house can be 2300

#maximum sqft lets keep 30,000

df5[(df5['Price_per_sqft']>30000)|(df5['Price_per_sqft']<2300)]



Unnamed: 0,location,size,total_sqft,bath,price,BHK,Price_per_sqft
132,Electronic City,2 BHK,880.0,1.0,16.50,2,1875.000000
365,Chandapura,1 BHK,530.0,1.0,11.66,1,2200.000000
514,Banashankari Stage III,4 Bedroom,8500.0,4.0,145.00,4,1705.882353
674,Yelahanka,3 BHK,35000.0,3.0,130.00,3,371.428571
767,Sarjapur,5 Bedroom,4360.0,4.0,90.00,5,2064.220183
...,...,...,...,...,...,...,...
12570,Bommasandra Industrial Area,2 BHK,7000.0,2.0,135.00,2,1928.571429
12574,Other,1 BHK,2559.0,1.0,55.00,1,2149.277061
13067,Other,10 Bedroom,7150.0,13.0,3600.00,10,50349.650350
13200,Other,6 Bedroom,8000.0,6.0,2800.00,6,35000.000000


In [None]:
# data frame after removing the above outliers
df6 = df5.copy()
df6 = df6 [~((df6['Price_per_sqft']>30000)|(df6['Price_per_sqft']<2300))]

df6.shape

(12417, 7)

In [None]:
df6.head()

#Now lets remove the unnecessary columns from the dataframe

Unnamed: 0,location,size,total_sqft,bath,price,BHK,Price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [None]:
df7 = df6.drop(['size','Price_per_sqft'], axis=1)
df7.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [None]:
#cleaning the locatio column
df7.location = df7.location.astype('object')
df7.locaton  = df7.location.str.strip()
df7.location.value_counts().sort_values(ascending =False)


  df7.locaton  = df7.location.str.strip()


location
Other                   2530
Whitefield               533
Sarjapur  Road           388
Electronic City          285
Kanakpura Road           264
                        ... 
Banjara Layout             8
5th Block Hbr Layout       7
Vishveshwarya Layout       7
Vishwapriya Layout         7
Marsur                     5
Name: count, Length: 242, dtype: int64

Model Building

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoded_location_data = encoder.fit_transform(df7.location.values.reshape(-1,1)).toarray()
encoded_location_data


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Convert to DataFrame with column names
encoded_loc_df = pd.DataFrame(encoded_location_data, columns=encoder.get_feature_names_out(['location']))
encoded_loc_df.head()

Unnamed: 0,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,location_5th Phase JP Nagar,location_6th Phase JP Nagar,location_7th Phase JP Nagar,location_8th Phase JP Nagar,location_9th Phase JP Nagar,...,location_Vijayanagar,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df8 = pd.concat([df7,encoded_loc_df],axis=1)
df8.head(20)

Unnamed: 0,location,total_sqft,bath,price,BHK,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,...,location_Vijayanagar,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur
0,Electronic City Phase II,1056.0,2.0,39.07,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Uttarahalli,1440.0,2.0,62.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Lingadheeranahalli,1521.0,3.0,95.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Kothanur,1200.0,2.0,51.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Whitefield,1170.0,2.0,38.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,Old Airport Road,2732.0,4.0,204.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Rajaji Nagar,3300.0,4.0,600.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Marathahalli,1310.0,3.0,63.25,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,Whitefield,1800.0,2.0,70.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# In order to avoid the dummy variable trap we are removing one column from the dummmy varaible
# And location data frame is not needed anymore , so dropping both of these columns

df9 = df8.drop(['location','location_1st Block Jayanagar'],axis=1)
df9.head()

Unnamed: 0,total_sqft,bath,price,BHK,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,location_5th Phase JP Nagar,location_6th Phase JP Nagar,...,location_Vijayanagar,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur
0,1056.0,2.0,39.07,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2600.0,5.0,120.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1440.0,2.0,62.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1521.0,3.0,95.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200.0,2.0,51.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df9.isnull().sum()

total_sqft                     878
bath                           832
price                          832
BHK                            832
location_1st Phase JP Nagar    832
                              ... 
location_Yelachenahalli        832
location_Yelahanka             832
location_Yelahanka New Town    832
location_Yelenahalli           832
location_Yeshwanthpur          832
Length: 245, dtype: int64

In [None]:
df10 = df9.dropna()
df10.isnull().sum()

total_sqft                     0
bath                           0
price                          0
BHK                            0
location_1st Phase JP Nagar    0
                              ..
location_Yelachenahalli        0
location_Yelahanka             0
location_Yelahanka New Town    0
location_Yelenahalli           0
location_Yeshwanthpur          0
Length: 245, dtype: int64

In [None]:
df10.shape

(11541, 245)

In [None]:
X =df10.drop(['price'],axis='columns')
X.head()

Unnamed: 0,total_sqft,bath,BHK,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,location_5th Phase JP Nagar,location_6th Phase JP Nagar,location_7th Phase JP Nagar,...,location_Vijayanagar,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur
0,1056.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2600.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1440.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1521.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X.dtypes

total_sqft                            float64
bath                                  float64
BHK                                   float64
location_1st Phase JP Nagar           float64
location_2nd Phase Judicial Layout    float64
                                       ...   
location_Yelachenahalli               float64
location_Yelahanka                    float64
location_Yelahanka New Town           float64
location_Yelenahalli                  float64
location_Yeshwanthpur                 float64
Length: 244, dtype: object

In [None]:
Y = df10.price
Y.head()

0     39.07
1    120.00
2     62.00
3     95.00
4     51.00
Name: price, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,Y_train)
lr_clf.score(X_test,Y_test)

0.5773744862112178

In [None]:
# using cross validation

from sklearn.model_selection import ShuffleSplit # will randomize the data samples,each of the time data points will be distributed equally
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5 , test_size=0.2, random_state=0)

cross_val_score(LinearRegression(),X,Y,cv=cv)


array([0.60267658, 0.59784372, 0.67182421, 0.56610325, 0.63071003])

In [None]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model(x, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(x, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return scores



In [None]:
find_best_model(X,Y)

[{'model': 'linear_regression',
  'best_score': 0.6138315580317537,
  'best_params': {'fit_intercept': True}},
 {'model': 'lasso',
  'best_score': 0.6224632634497812,
  'best_params': {'alpha': 1, 'selection': 'random'}},
 {'model': 'decision_tree',
  'best_score': 0.3187492020662826,
  'best_params': {'criterion': 'squared_error', 'splitter': 'best'}}]

In [None]:
#So Lasso Regression gives the best score

from sklearn.linear_model import Lasso
lasso= Lasso()
lasso.fit(X_train,Y_train)
