## Chapter 3: Multiple regression in action

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.datasets import load_boston
from sklearn import linear_model

%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


ImportError: cannot import name 'ss'

In [None]:
# Load the Boston housing data and assemble a single frame: predictors under
# their feature names, with the response appended as the last column, 'target'.
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names).assign(target=boston.target)
dataset
            

In [None]:
# Show the dataset's bundled description, followed by the raw loader object.
print(boston.DESCR, boston, sep="\n")

In [None]:
# Number of rows (observations) in the frame.
observations = dataset.shape[0]
observations

In [None]:
# NOTE(review): this cell repeats the description printout from a few cells
# earlier; its output is identical.
print(boston.DESCR, boston, sep="\n")

In [None]:
# Keep every column label except the last one. 'target' was appended last when
# the frame was built, so what remains are the predictor names.
variables = dataset.columns[:-1]
variables

In [None]:
# Predictor matrix: all rows, every column but the last (the appended 'target').
x = dataset.iloc[:,:-1]
x

In [None]:
# Response vector: the 'target' column taken out of the frame via `.values`.
y = dataset['target'].values
y

## Using multiple features

In [None]:
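# This section uses the two statsmodels interfaces imported in the first cell:
#   import statsmodels.api as sm            (array interface)
#   import statsmodels.formula.api as smf   (formula interface)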

In [None]:
# Array interface: a constant column is attached to the predictors first so
# the fitted model includes an intercept term.
Xc = sm.add_constant(x)
linear_regression = sm.OLS(endog=y, exog=Xc)
fitted_model = linear_regression.fit()
fitted_model.summary()

In [None]:
# Same regression through the formula interface: the response goes on the left
# of '~' and the predictors on the right, referenced by column name in `dataset`.
# The formula is split with adjacent string literals inside parentheses, so no
# source indentation ends up inside the string and no backslash continuation
# is needed.
linear_regression2 = smf.ols(
    formula=('target ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + '
             'RAD + TAX + PTRATIO + B + LSTAT'),
    data=dataset)
fitted_model2 = linear_regression2.fit()
fitted_model2.summary()

## Correlation matrix

In [None]:
# Pairwise Pearson correlations between the predictors (every column except
# the trailing 'target').
X = dataset.iloc[:, :-1]
correlation_matrix = X.corr(method='pearson')
print(correlation_matrix)

In [None]:
def visualize_correlation_matrix(data, hurdle = 0.0):
    """Draw the correlation matrix of ``data`` as a labelled heatmap.

    Parameters
    ----------
    data : array-like
        Observations in rows, variables in columns (passed to
        ``np.corrcoef`` with ``rowvar=0``).
    hurdle : float, optional
        Correlations whose absolute value is below this threshold are set
        to 0.0 before plotting, so only the stronger ones stand out.

    Notes
    -----
    Tick labels are taken from ``data.columns`` when the input has that
    attribute, so they always match the plotted matrix. Inputs without
    column labels fall back to the notebook-level ``variables``.
    """
    R = np.corrcoef(data, rowvar=0)
    # Boolean mask assignment; no need to go through np.where.
    R[np.abs(R) < hurdle] = 0.0

    labels = getattr(data, 'columns', None)
    if labels is None:
        labels = variables

    heatmap = plt.pcolor(R, cmap=mpl.cm.coolwarm, alpha=0.8)
    axes = heatmap.axes
    axes.set_frame_on(False)
    # Offset by 0.5 so each tick sits at the centre of its cell.
    axes.set_yticks(np.arange(R.shape[0]) + 0.5, minor=False)
    axes.set_xticks(np.arange(R.shape[1]) + 0.5, minor=False)
    axes.set_xticklabels(labels, minor=False)
    plt.xticks(rotation=90)
    axes.set_yticklabels(labels, minor=False)
    # Hide the tick marks with real booleans; the string 'off' is not a
    # boolean and is only honoured by old Matplotlib releases.
    plt.tick_params(axis='both', which='both', bottom=False,
                    top=False, left=False, right=False)
    plt.colorbar()
    plt.show()
    
# Heatmap of the predictor correlations; values weaker than 0.5 in absolute
# value are zeroed out by the hurdle.
visualize_correlation_matrix(X, hurdle=0.5)

In [None]:
# Scatterplot matrix over every column currently named in `variables`.
sns.pairplot(data=dataset[variables], size=2.5)
plt.tight_layout()
plt.show()

In [None]:
# Scatterplot matrix restricted to a handful of predictors so the individual
# panels stay readable. The selection must actually be passed to pairplot
# (plotting `dataset[variables]` would just repeat the full matrix), the
# column is spelled 'LSTAT', and each name is listed once.
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'TAX']
sns.pairplot(dataset[cols], size = 2.5)
plt.tight_layout()
plt.show()

## Revisiting gradient descent

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
from sklearn.preprocessing import StandardScaler
# Standardize the predictors and remember the original means and standard
# deviations; they are needed later to map the coefficients back to the
# original units.
observations = len(dataset)
variables = dataset.columns
standardization = StandardScaler().fit(X)
Xst = standardization.transform(X)
original_means = standardization.mean_
original_stds = standardization.var_ ** .5
# Append a column of ones as the last feature; its weight acts as the bias.
Xst = np.hstack((Xst, np.ones((observations, 1))))
y = dataset['target'].values

NameError: name 'dataset' is not defined

In [9]:
import random
def random_w(p):
    """Draw ``p`` starting weights from a standard normal distribution.

    Each weight is one scalar draw from NumPy's global random state.
    Returns a 1-D float array of length ``p``.
    """
    draws = (np.random.normal() for _ in range(p))
    return np.fromiter(draws, dtype=float)
def hypothesis(X,w):
    """Linear prediction: the dot product of the feature matrix ``X`` and the weights ``w``."""
    predictions = np.dot(X, w)
    return predictions
def loss(X,w,y):
    """Residuals of the linear model: ``hypothesis(X, w)`` minus the observed ``y``."""
    predicted = hypothesis(X, w)
    return predicted - y
def squared_loss(X,w,y):
    """Element-wise square of the residuals returned by ``loss(X, w, y)``."""
    residuals = loss(X, w, y)
    return residuals ** 2
def gradient(X,w,y):
    """Gradient of the halved mean squared loss with respect to each weight.

    Parameters
    ----------
    X : 2-D array indexable as ``X[:, j]``; one column per weight.
    w : sequence of weights (only its length and values are used).
    y : sequence of observed targets.

    Returns
    -------
    list
        One value per weight: the residuals multiplied by that weight's
        feature column, summed, and divided by the number of observations.
    """
    n = float(len(y))
    # The residuals do not depend on j, so compute them once instead of
    # once per coefficient.
    residuals = loss(X, w, y)
    return [np.sum(residuals * X[:, j]) / n for j in range(len(w))]
def update(X,w,y, alpha=0.01):
    """Take one gradient-descent step.

    Each weight is moved against its gradient component, scaled by the
    learning rate ``alpha``. Returns the new weights as a list.
    """
    stepped = []
    for weight, slope in zip(w, gradient(X, w, y)):
        stepped.append(weight - alpha * slope)
    return stepped
def optimize(X,y, alpha=0.01, eta = 10**-12, iterations = 1000):
    """Fit the weights of a linear model by batch gradient descent.

    Starts from random weights (``random_w``) and repeatedly applies
    ``update``. From the sixth iteration on, the run stops early as soon as
    the summed squared loss changes by at most ``eta`` between two
    consecutive iterations.

    Parameters
    ----------
    X : array with a ``shape`` attribute; ``X.shape[1]`` weights are fitted.
    y : observed targets.
    alpha : learning rate forwarded to ``update``.
    eta : tolerance on the change of the summed squared loss.
    iterations : maximum number of gradient steps.

    Returns
    -------
    (w, path)
        ``w`` is the final weight vector. ``path`` is the list of summed
        squared losses recorded every ``iterations / 20`` steps, plus the
        final loss when the run stops early.
    """
    w = random_w(X.shape[1])
    path = list()
    # Loop-invariant logging interval (about 20 checkpoints per run).
    checkpoint = iterations / 20
    for k in range(iterations):
        SSL = np.sum(squared_loss(X,w,y))
        new_w = update(X,w,y, alpha=alpha)
        new_SSL = np.sum(squared_loss(X,new_w,y))
        w = new_w
        # Converged: record the final loss and stop. This return belongs
        # inside the convergence test; at loop level it would end the run
        # after the very first step.
        if k >= 5 and abs(new_SSL - SSL) <= eta:
            path.append(new_SSL)
            return w, path
        # Periodic checkpoint; must run on every iteration, so it lives
        # inside the loop.
        if k % checkpoint == 0:
            path.append(new_SSL)
    return w, path

# Run gradient descent on the standardized design matrix and report the
# resulting weights (the last one belongs to the appended column of ones).
alpha = 0.02
w, path = optimize(Xst, y, alpha, eta=10**-12, iterations=20000)
print("These are our final standardized coefficients: "
      + ', '.join("%0.4f" % coefficient for coefficient in w))

NameError: name 'Xst' is not defined

In [None]:
# Map the standardized solution back to the original units: every slope is
# divided by its feature's standard deviation, and the bias (last weight)
# is corrected for the means that were subtracted during standardization.
unstandardized_betas = w[:-1] / original_stds
unstandardized_bias = w[-1] - np.sum((original_means / original_stds) * w[:-1])
print('%8s: %8.4f' % ('bias', unstandardized_bias))
# zip stops at the shorter sequence, so any names beyond the fitted slopes
# are simply not printed.
for varname, beta in zip(variables, unstandardized_betas):
    print('%8s: %8.4f' % (varname, beta))