In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
import statsmodels.formula.api as smf

from sklearn.metrics import mean_squared_error,r2_score
from math import sqrt

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

CSV_READ = pd.read_csv("../input/california-housing-prices/housing.csv")

In [2]:
CSV_READ.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [3]:
CSV_READ.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
CSV_READ.total_bedrooms = CSV_READ.total_bedrooms.fillna(CSV_READ.total_bedrooms.mean())
CSV_READ.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [5]:
CSV_READ['ocean_proximity'] = LabelEncoder().fit_transform(CSV_READ['ocean_proximity'])

In [6]:
names = CSV_READ.columns

scaler = StandardScaler()

scaled_df = scaler.fit_transform(CSV_READ)
scaled_df = pd.DataFrame(scaled_df, columns=names)
scaled_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,2.344766,2.129631,1.291089
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,2.332238,1.314156,1.291089
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,1.782699,1.258693,1.291089
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,0.932968,1.1651,1.291089
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,-0.012881,1.1729,1.291089


In [7]:
X_Features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity']
X = scaled_df[X_Features]
Y = scaled_df['median_house_value']

print(type(X))
print(type(Y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [8]:
print(CSV_READ.shape)
print(X.shape)
print(Y.shape)

(20640, 10)
(20640, 9)
(20640,)


In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=1)

print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(16512, 9) (16512,)
(4128, 9) (4128,)


In [10]:
linreg=LinearRegression()
linreg.fit(x_train,y_train)

y_predict = linreg.predict(x_test)

print(sqrt(mean_squared_error(y_test,y_predict)))
print((r2_score(y_test,y_predict)))

0.6056598120301221
0.6276223517950296


In [11]:
dtreg=DecisionTreeRegressor()
dtreg.fit(x_train,y_train)

y_predict = dtreg.predict(x_test)

print(sqrt(mean_squared_error(y_test,y_predict)))
print((r2_score(y_test,y_predict)))

0.6036472382378824
0.6300930203492975


In [12]:
rfreg=RandomForestRegressor()
rfreg.fit(x_train,y_train)

y_predict = rfreg.predict(x_test)

print(sqrt(mean_squared_error(y_test,y_predict)))
print((r2_score(y_test,y_predict)))

0.4269636186916928
0.8149420140937664


In [13]:
lassoreg=Lasso(alpha=0.001,normalize=True)
lassoreg.fit(x_train,y_train)

print(sqrt(mean_squared_error(y_test,lassoreg.predict(x_test))))
print('R2 Value/Coefficient of determination:{}'.format(lassoreg.score(x_test,y_test)))

0.719314096707071
R2 Value/Coefficient of determination:0.4747534206169961


In [14]:
ridgereg=Ridge(alpha=0.001,normalize=True)
ridgereg.fit(x_train,y_train)

print(sqrt(mean_squared_error(y_test,ridgereg.predict(x_test))))
print('R2 Value/Coefficient of determination:{}'.format(ridgereg.score(x_test,y_test)))

0.6056048844852343
R2 Value/Coefficient of determination:0.6276898909055972


In [15]:
elasticreg = ElasticNet(alpha=0.001,normalize=True)
elasticreg.fit(x_train,y_train)

print(sqrt(mean_squared_error(y_test,elasticreg.predict(x_test))))
print('R2 Value/Coefficient of determination:{}'.format(elasticreg.score(x_test,y_test)))

0.944358169398106
R2 Value/Coefficient of determination:0.09468529806704551
