In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load scikit-learn's bundled Boston housing data and list the fields it exposes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2
# according to the scikit-learn release notes — confirm the installed version
# before re-running this notebook on a fresh environment.
from sklearn.datasets import load_boston

boston = load_boston()
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [7]:
# Show the dataset's own multi-line description text.
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [12]:
# Predictor names shipped with the dataset; they become the DataFrame column labels.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [76]:
# Build the feature frame and leave out the two predictors excluded from this model.
# drop(columns=...) returns a new frame, so the cell is a single expression with no
# in-place mutation and gives the same result every time it is re-run.
dataset = (
    pd.DataFrame(boston.data, columns=boston['feature_names'])
    .drop(columns=['INDUS', 'AGE'])
)
dataset.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [77]:
# Attach the regression target as a new column. It lands after the features, and
# the train/test split relies on it being the last column.
target_values = boston.target
dataset["Home Value $1000's"] = target_values
dataset.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT,Home Value $1000's
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [27]:
# (rows, columns) of the modelling frame.
# NOTE(review): the saved output for this cell was produced by an earlier execution
# (In [27]) and does not match the 12 columns shown by the preceding head() —
# re-run the notebook top to bottom to refresh it.
dataset.shape

(506, 13)

In [None]:
# Pairwise scatter plots of every column against every other (histograms on the
# diagonal). The trailing semicolon keeps the PairGrid repr out of the cell output
# so that only the figure is shown.
sns.pairplot(dataset);

In [89]:
# Pairwise Pearson correlation between all columns, target included.
dataset.corr(method='pearson')

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT,Home Value $1000's
CRIM,1.0,-0.199458,-0.055295,0.417521,-0.21994,-0.377904,0.622029,0.579564,0.28825,-0.377365,0.45222,-0.385832
ZN,-0.199458,1.0,-0.042697,-0.516604,0.311991,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
CHAS,-0.055295,-0.042697,1.0,0.091203,0.091251,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
NOX,0.417521,-0.516604,0.091203,1.0,-0.302188,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
RM,-0.21994,0.311991,0.091251,-0.302188,1.0,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
DIS,-0.377904,0.664408,-0.099176,-0.76923,0.205246,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
RAD,0.622029,-0.311948,-0.007368,0.611441,-0.209847,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
TAX,0.579564,-0.314563,-0.035587,0.668023,-0.292048,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536
PTRATIO,0.28825,-0.391679,-0.121515,0.188933,-0.355501,-0.232471,0.464741,0.460853,1.0,-0.177383,0.374044,-0.507787
B,-0.377365,0.17552,0.048788,-0.380051,0.128069,0.291512,-0.444413,-0.441808,-0.177383,1.0,-0.366087,0.333461


In [80]:
# Train/test split.
# Features are every column except the last; the target ("Home Value $1000's")
# must therefore be the final column of `dataset` when this cell runs.
from sklearn.model_selection import train_test_split

# Fixed seed: without it the split, and every score/coefficient computed from it,
# changes on each run. The outputs saved in this notebook came from an unseeded
# split, so they will differ from a fresh run.
RANDOM_STATE = 42

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)

In [81]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split. fit() is left as the final
# expression so the fitted estimator's repr is displayed.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [82]:
# Predicted home values for the held-out rows; reused by the error metrics below.
y_pred = regressor.predict(X_test)

In [83]:
# Goodness of fit on the held-out split (per the scikit-learn docs, score() on a
# regressor returns the coefficient of determination R^2).
regressor.score(X_test, y_test)

0.7652869579606006

In [84]:
# R^2 on the held-out split, computed through sklearn.metrics.
# accuracy_score is a classification metric and was imported here without being
# used; r2_score is the regression counterpart and yields the same value as
# regressor.score(X_test, y_test).
from sklearn.metrics import r2_score

r2_score(y_test, regressor.predict(X_test))

0.7652869579606006

In [85]:
# One fitted slope per predictor, labelled with the feature column names
# (every column of `dataset` except the trailing target).
feature_labels = dataset.columns[:-1]
pd.DataFrame({'Coefficient': regressor.coef_}, index=feature_labels)

Unnamed: 0,Coefficient
CRIM,-0.105795
ZN,0.04734
CHAS,2.909313
NOX,-18.796163
RM,3.826847
DIS,-1.566995
RAD,0.31035
TAX,-0.011228
PTRATIO,-0.978319
B,0.009602


In [86]:
from sklearn import metrics

# Compute each error once, then report; RMSE is the square root of the MSE
# already in hand rather than a second mean_squared_error call.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error: ', mae)
print('Mean Square Error: ', mse)
print('Root Mean Square Error: ', rmse)

Mean Absolute Error:  3.1731632891983113
Mean Square Error:  14.504932111739016
Root Mean Square Error:  3.808534115869125


In [12]:
# To calculate p-value
import statsmodels.api as sm
from scipy import stats

  from pandas.core import datetools


In [87]:
# Fit OLS with statsmodels on the full (X, y) arrays to get the per-coefficient
# statistics (t, p-values, confidence intervals) printed by summary().
# add_constant supplies the intercept column, shown as `const` in the summary.
X2 = sm.add_constant(X)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     128.2
Date:                Wed, 05 Sep 2018   Prob (F-statistic):          5.74e-137
Time:                        21:05:13   Log-Likelihood:                -1498.9
No. Observations:                 506   AIC:                             3022.
Df Residuals:                     494   BIC:                             3073.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.3694      5.069      7.176      0.0