# (make_regression class) in sklearn

In [37]:
# import (make_regression class) from sklearn
from sklearn.datasets import make_regression

In [38]:
#load regression data

'''
X ,y = make_regression(n_samples=100, n_features=100, n_informative=10,
                       n_targets=1, bias=0.0, effective_rank=None,
                       tail_strength=0.5, noise=0.0, shuffle=True, coef=False,
                       random_state=None)
'''

'\nX ,y = make_regression(n_samples=100, n_features=100, n_informative=10,\n                       n_targets=1, bias=0.0, effective_rank=None,\n                       tail_strength=0.5, noise=0.0, shuffle=True, coef=False,\n                       random_state=None)\n'

In [39]:
X_train, y_train = make_regression(n_samples=100, n_features=4,shuffle=True)

In [40]:
#X Data
print('X Data is \n' , X_train[:10])
print('X shape is ' , X_train.shape)

#y Data
print('y Data is \n' , y_train[:10])
print('y shape is ' , y_train.shape)

X Data is 
 [[ 1.56008465 -0.80906744  0.33970077 -1.60667062]
 [-1.43023705  1.04793992 -1.70989442 -0.06787459]
 [ 0.08675528  0.210307   -0.97916107  1.82202594]
 [ 0.66526866 -0.96102381 -1.60977503  0.16796504]
 [ 2.12014703  1.80137224 -1.33269625 -0.41325606]
 [ 0.427389   -0.05567146  1.05049111 -1.12636094]
 [-0.75092938 -0.37137736  0.13817374  1.05831239]
 [-0.79305831  1.44426035  0.61159593  0.32864633]
 [ 0.02708803 -0.41829533 -0.40416296  0.11518759]
 [-0.47718422  0.38250713 -1.26771075  0.14392035]]
X shape is  (100, 4)
y Data is 
 [ 149.26940167 -197.69601505  -25.26097726  -24.5577026   187.03548348
   84.90338357  -72.96763068  -21.39723571  -23.47683404  -95.89734833]
y shape is  (100,)


In [41]:
#Required Libraries
import copy, math
import numpy as np  

In [42]:
b_init = 785.1811367994083
w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618])
print(f"w_init shape: {w_init.shape}, b_init type: {type(b_init)}")

w_init shape: (4,), b_init type: <class 'float'>


<a name="toc_15456_3"></a>
# Model Prediction With Multiple Variables
The model's prediction with multiple variables is given by the linear model:

$$ f_{\mathbf{w},b}(\mathbf{x}) =  w_0x_0 + w_1x_1 +... + w_{n-1}x_{n-1} + b \tag{1}$$
or in vector notation:
$$ f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b  \tag{2} $$ 
where $\cdot$ is a vector `dot product`

To demonstrate the dot product, we will implement prediction using (1) and (2).

## Single Prediction, vector

In [43]:
def predict(x, w, b): 
    """
    single predict using linear regression
    Args:
      x (ndarray): Shape (n,) example with multiple features
      w (ndarray): Shape (n,) model parameters   
      b (scalar):             model parameter 
      
    Returns:
      p (scalar):  prediction
    """
    p = np.dot(x, w) + b     
    return p    

<a name="toc_15456_4"></a>
# Compute Cost With Multiple Variables
The equation for the cost function with multiple variables $J(\mathbf{w},b)$ is:
$$J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2 \tag{3}$$ 
where:
$$ f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w} \cdot \mathbf{x}^{(i)} + b  \tag{4} $$ 


In contrast to previous labs, $\mathbf{w}$ and $\mathbf{x}^{(i)}$ are vectors rather than scalars supporting multiple features.

In [44]:
def compute_cost(X, y, w, b): 
    """
    compute cost
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      cost (scalar): cost
    """
    m = X.shape[0]
    cost = 0.0
    for i in range(m):                                
        f_wb_i = np.dot(X[i], w) + b           #(n,)(n,) = scalar (see np.dot)
        cost = cost + (f_wb_i - y[i])**2       #scalar
    cost = cost / (2 * m)                      #scalar    
    return cost

<a name="toc_15456_5"></a>
# Gradient Descent With Multiple Variables
Gradient descent for multiple variables:

$$\begin{align*} \text{repeat}&\text{ until convergence:} \; \lbrace \newline\;
& w_j = w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \tag{5}  \; & \text{for j = 0..n-1}\newline
&b\ \ = b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b}  \newline \rbrace
\end{align*}$$

where, n is the number of features, parameters $w_j$,  $b$, are updated simultaneously and where  

$$
\begin{align}
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \tag{6}  \\
\frac{\partial J(\mathbf{w},b)}{\partial b}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}) \tag{7}
\end{align}
$$
* m is the number of training examples in the data set

    
*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target value


<a name="toc_15456_5.1"></a>
## Compute Gradient with Multiple Variables
An implementation for calculating the equations (6) and (7) is below. There are many ways to implement this. In this version, there is an
- outer loop over all m examples. 
    - $\frac{\partial J(\mathbf{w},b)}{\partial b}$ for the example can be computed directly and accumulated
    - in a second loop over all n features:
        - $\frac{\partial J(\mathbf{w},b)}{\partial w_j}$ is computed for each $w_j$.
   

In [45]:
def compute_gradient(X, y, w, b): 
    """
    Computes the gradient for linear regression 
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w. 
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b. 
    """
    m,n = X.shape           #(number of examples, number of features)
    dj_dw = np.zeros((n,))
    dj_db = 0.

    for i in range(m):                             
        err = (np.dot(X[i], w) + b) - y[i]   
        for j in range(n):                         
            dj_dw[j] = dj_dw[j] + err * X[i, j]    
        dj_db = dj_db + err                        
    dj_dw = dj_dw / m                                
    dj_db = dj_db / m                                
        
    return dj_db, dj_dw

<a name="toc_15456_5.2"></a>
## Gradient Descent With Multiple Variables
The routine below implements equation (5) above.

In [46]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)   ##None

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]:8.2f}   ")
        
    return w, b, J_history #return final w,b and J history for graphing

In [47]:
# initialize parameters
initial_w = np.zeros_like(w_init)
initial_b = 0.
# some gradient descent settings
iterations = 1000
alpha = 5.0e-7
# run gradient descent 
w_final, b_final, J_hist = gradient_descent(X_train, y_train, initial_w, initial_b,
                                                    compute_cost, compute_gradient, 
                                                    alpha, iterations)
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")
m,_ = X_train.shape
for i in range(m):
    print(f"prediction: {np.dot(X_train[i], w_final) + b_final:0.2f}, target value: {y_train[i]}")

Iteration    0: Cost  4896.55   
Iteration  100: Cost  4896.14   
Iteration  200: Cost  4895.73   
Iteration  300: Cost  4895.32   
Iteration  400: Cost  4894.91   
Iteration  500: Cost  4894.50   
Iteration  600: Cost  4894.09   
Iteration  700: Cost  4893.69   
Iteration  800: Cost  4893.28   
Iteration  900: Cost  4892.87   
b,w found by gradient descent: -0.01,[0.03941867 0.00882023 0.01718462 0.00767065] 
prediction: 0.04, target value: 149.2694016663034
prediction: -0.08, target value: -197.69601504800724
prediction: -0.01, target value: -25.260977260954256
prediction: -0.02, target value: -24.557702603614707
prediction: 0.07, target value: 187.03548348493334
prediction: 0.02, target value: 84.9033835692537
prediction: -0.03, target value: -72.96763068484451
prediction: -0.01, target value: -21.397235707197723
prediction: -0.02, target value: -23.476834037647805
prediction: -0.04, target value: -95.89734832555249
prediction: -0.05, target value: -90.84246692295726
prediction: -0.

In [36]:
# print(predict(X_train, w_final, b_final))