In [1]:
print("Hello Data")

Hello Data


### Loading Required Libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


### Loading Dataset

In [5]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [6]:
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


### Feature Engineering

In [7]:
# Extracting  the numbers from the size column and change its name to bedrooms.

df['bedrooms'] = df['size'].str.extract(r'(\d+)').astype('float')
df['Total_sqft'] = df['total_sqft'].str.extract(r'(\d+)').astype('float')
df


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bedrooms,Total_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,2.0,1056.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00,4.0,2600.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00,3.0,1440.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00,3.0,1521.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00,2.0,1200.0
...,...,...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00,5.0,3453.0
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00,4.0,3600.0
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00,2.0,1141.0
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00,4.0,4689.0


In [8]:
# Checking for NaN values in our dataframe
df.isna().sum()


area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
bedrooms          16
Total_sqft         0
dtype: int64

In [9]:
# Deleting NaN Vlues from our required columns
df.dropna(subset=['bath', 'bedrooms'], inplace=True)


In [10]:
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bedrooms,Total_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,2.0,1056.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00,4.0,2600.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00,3.0,1440.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00,3.0,1521.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00,2.0,1200.0
...,...,...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00,5.0,3453.0
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00,4.0,3600.0
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00,2.0,1141.0
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00,4.0,4689.0


In [11]:
#  Selecting our relevant features that are total area, bedrooms and bathrooms

X = df[['Total_sqft', 'bedrooms', 'bath']]
y = df['price']


In [12]:
X

Unnamed: 0,Total_sqft,bedrooms,bath
0,1056.0,2.0,2.0
1,2600.0,4.0,5.0
2,1440.0,3.0,2.0
3,1521.0,3.0,3.0
4,1200.0,2.0,2.0
...,...,...,...
13315,3453.0,5.0,4.0
13316,3600.0,4.0,5.0
13317,1141.0,2.0,2.0
13318,4689.0,4.0,4.0


In [13]:
y

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13315    231.00
13316    400.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 13247, dtype: float64

### Splitting data 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train

Unnamed: 0,Total_sqft,bedrooms,bath
7256,2400.0,9.0,8.0
9590,1174.0,2.0,2.0
12977,2800.0,7.0,6.0
3929,1100.0,2.0,2.0
3506,1200.0,5.0,5.0
...,...,...,...
12034,966.0,2.0,2.0
5223,3122.0,4.0,6.0
5423,1400.0,3.0,3.0
868,1225.0,2.0,2.0


In [16]:
y_train

7256     325.00
9590      74.00
12977    110.00
3929      38.00
3506     185.00
          ...  
12034     58.00
5223     237.00
5423      35.00
868       46.55
7316      48.60
Name: price, Length: 10597, dtype: float64

In [17]:
X_test

Unnamed: 0,Total_sqft,bedrooms,bath
64,3000.0,8.0,8.0
8682,8400.0,4.0,5.0
5968,1240.0,5.0,5.0
5794,1350.0,9.0,8.0
8837,1100.0,2.0,2.0
...,...,...,...
1731,1757.0,3.0,3.0
3884,1531.0,3.0,3.0
4981,654.0,1.0,1.0
9051,1147.0,2.0,2.0


In [18]:
y_test

64       140.00
8682    1675.00
5968     300.00
5794     200.00
8837      70.00
         ...   
1731     132.00
3884      72.00
4981      37.00
9051      75.00
6755      24.86
Name: price, Length: 2650, dtype: float64

### Training the Model

In [19]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

### Pridiction

In [20]:
y_pred = model.predict(X_test)

In [21]:
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

13215.257924961234
0.369173394409281


### Saving the Model

In [23]:
import joblib

joblib.dump(model, 'Property_Price_Predictor.pkl')

['Property_Price_Predictor.pkl']