In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('housing.csv')

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [4]:
data.shape

(20640, 10)

In [5]:
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [6]:
data=data.dropna(axis=0,how='any')

In [7]:
encoder=LabelEncoder()
data['ocean_proximity']=encoder.fit_transform(data['ocean_proximity'])
ocean_mapping={index:label for index,label in enumerate(encoder.classes_)}
print(ocean_mapping)

{0: '<1H OCEAN', 1: 'INLAND', 2: 'ISLAND', 3: 'NEAR BAY', 4: 'NEAR OCEAN'}


In [8]:
y=data['median_house_value']
X=data.drop('median_house_value',axis=1)

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)

## Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train,y_train)

In [12]:
y_pred=regressor.predict(X_test)

In [13]:
y_pred

array([ 42902.33528351, 293914.72928154, 273271.00643565, ...,
       198185.58442024, 290348.24260756, 202457.12966898])

In [14]:
y_test

10957    187200
11496    271400
17723    283700
20414    173800
13237    345400
          ...  
6866     233300
15531    182800
4466     169600
17362    193600
10869    205400
Name: median_house_value, Length: 4087, dtype: int64

In [15]:
print(f"Linear Regression:{regressor.score(X_test,y_test)}")

Linear Regression:0.6354625636136542


In [16]:
from sklearn.metrics import mean_squared_error
rmse1=mean_squared_error(y_test,y_pred)
print(rmse1)

4957414729.054353


## Decision Tree

In [17]:
from sklearn.tree import DecisionTreeRegressor
dtree=DecisionTreeRegressor()
dtree.fit(X_train,y_train)

In [18]:
print(f"Decision Tree Regression:{dtree.score(X_test,y_test)}")

Decision Tree Regression:0.565687378914254


In [19]:
y_predict=dtree.predict(X_test)

In [20]:
y_predict

array([111000., 345700., 500001., ..., 257700., 338100., 142600.])

In [21]:
rmse2=mean_squared_error(y_test,y_predict)
print(rmse2)

5906300889.499878


## Random Forest 

In [22]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf.fit(X_train,y_train)

In [23]:
print(f"Random Forest Regression:{clf.score(X_test,y_test)}")

Random Forest Regression:0.7543858835372124


In [24]:
y_prediction=clf.predict(X_test)
print(y_prediction)

[157058.   282959.01 317492.05 ... 183709.   226683.03 183039.01]


In [25]:
rmse3=mean_squared_error(y_test,y_prediction)
print(rmse3)

3340153622.317793
