<h3>LINEAR REGRESSION</h3>

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split

In [2]:
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases — confirm the pinned version.
from sklearn.datasets import load_boston
data = load_boston()

In [3]:
# Wrap the raw arrays in pandas objects: X gets one labelled column per feature,
# y is a single-column DataFrame holding the target values.
feature_names = list(data['feature_names'])
X = pd.DataFrame(data['data'], columns=feature_names)
y = pd.DataFrame(data['target'])

In [4]:
# Preview the first rows of the feature matrix to sanity-check the column labels.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [12]:
# NOTE: the dataset is very small (see the row count below), which likely explains
# the large gap between the training and validation scores.
X.shape

(506, 13)

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Fit ordinary least squares on the training split. fit() returns the estimator
# itself, so construction and fitting can be chained; the bare name on the last
# line echoes the fitted estimator as the cell output.
reg = LinearRegression().fit(X_train, y_train)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# R-squared of the fitted model, evaluated on the full dataset (X, y).
# NOTE(review): X still contains the held-out test rows, so this is neither a pure
# training score nor a validation score — confirm whether X_train / y_train was intended.
r_squared = reg.score(X,y)
r_squared

0.7373440319905034

<h4>Formula for calculating adjusted R-squared</h4>

$R^2_{adj} = 1 - (1-R^2)\cdot\frac{n-1}{n-p-1}$, where $n$ is the number of samples and $p$ is the number of features.

In [8]:
# Adjusted R-squared penalises plain R-squared for the number of predictors, so that
# adding features with little explanatory power does not inflate the score.
# n_samples and n_features are the row and column counts of X.
n_samples, n_features = X.shape
r_adj = 1 - (1 - r_squared) * (n_samples - 1) / (n_samples - n_features - 1)
r_adj

0.7304039352748053

<h4>Feature selection</h4>

In [9]:
# Run a univariate F-test of every feature against the target to gauge each
# feature's individual explanatory power.
# The call returns two arrays: the first holds each feature's F-statistic and the
# second holds the corresponding p-values.
# By the usual convention, a useful feature should have a p-value below 0.05.
#
# y is stored as a single-column DataFrame, but f_regression expects a 1-D target;
# np.ravel flattens it to shape (n_samples,) and avoids the DataConversionWarning
# that passing a column vector triggers.
f_regression(X, np.ravel(y))

  y = column_or_1d(y, warn=True)


(array([ 89.48611476,  75.2576423 , 153.95488314,  15.97151242,
        112.59148028, 471.84673988,  83.47745922,  33.57957033,
         85.91427767, 141.76135658, 175.10554288,  63.05422911,
        601.61787111]),
 array([1.17398708e-19, 5.71358415e-17, 4.90025998e-31, 7.39062317e-05,
        7.06504159e-24, 2.48722887e-74, 1.56998221e-18, 1.20661173e-08,
        5.46593257e-19, 5.63773363e-29, 1.60950948e-34, 1.31811273e-14,
        5.08110339e-88]))

In [10]:
# Keep only the p-values (second element of f_regression's result), rounded to
# 4 decimals for display. At this precision any p-value below 5e-5 prints as 0.
#
# np.ravel flattens the single-column target to shape (n_samples,), which is what
# f_regression expects; passing the column vector directly raises a
# DataConversionWarning.
p_values = f_regression(X, np.ravel(y))[1].round(4)
p_values

  y = column_or_1d(y, warn=True)


array([0.    , 0.    , 0.    , 0.0001, 0.    , 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.    , 0.    ])

<h4>Validation test</h4>

In [11]:
# R-squared on the held-out test split, i.e. rows that were not used to fit reg.
reg.score(X_test,y_test)

0.5892223849182507