In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

In [4]:
#transforming dataset to data frame
#feature_names --> column names
#data --> required data or independent variable x
#target --> price of house or dependent variable y

df_x = pd.DataFrame(housing.data, columns = housing.feature_names)
df_y = pd.DataFrame(housing.target)


In [5]:
#Some statistics from dataset
df_x.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [6]:
#Initializing Linear Regression
reg = linear_model.LinearRegression()

In [7]:
#Splitting for test and train dataset
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.33, random_state = 42)

In [8]:
#training the model
reg.fit(x_train, y_train)

In [9]:
#print coeffecients/ weight for each column
print(reg.coef_)

[[ 4.44870466e-01  9.55004561e-03 -1.21991503e-01  7.79144696e-01
  -7.68990808e-08 -3.29948505e-03 -4.19131153e-01 -4.34103468e-01]]


In [10]:
#print the predictions
y_pred = reg.predict(x_test)
print(y_pred)

[[0.72563462]
 [1.76650223]
 [2.70545812]
 ...
 [1.25803135]
 [1.66673014]
 [2.25826279]]


In [11]:
#print actual test values
print(y_test)

             0
20046  0.47700
3024   0.45800
15663  5.00001
20484  2.18600
9814   2.78000
...        ...
15316  1.66100
14772  0.93600
12870  1.07000
13476  1.30700
16123  3.07100

[6812 rows x 1 columns]


In [13]:
#check for accuracy (Mean Square Error (MSE))

print(np.mean((y_pred - y_test)**2))

0    0.536969
dtype: float64


In [14]:
#check for accuracy using MSE method with help of sklearn
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_pred, y_test))

0.5369686543372465
