## import needed libraries

In [1]:
import numpy as np # type: ignore
from numpy.linalg import norm # type: ignore
from sklearn.datasets import make_regression # type: ignore


## import out data using sklearn library just a random data for regression

In [2]:
X, y = make_regression(n_samples=200, n_features=5, noise=1, random_state=42)


## Here is our data

In [3]:
print(X)

[[-3.85313597e-01  1.99059696e-01 -6.00216877e-01  4.62103474e-01
   6.98020850e-02]
 [ 1.30740577e-01  1.63241130e+00 -1.43014138e+00 -1.24778318e+00
  -4.40044487e-01]
 [-7.73009784e-01  2.24092482e-01  1.25924008e-02 -4.01220472e-01
   9.76760985e-02]
 [-5.76771331e-01 -5.02381094e-02 -2.38948047e-01  2.70456826e-01
  -9.07563662e-01]
 [-5.75818241e-01  6.14166700e-01  7.57507710e-01 -2.20969600e-01
  -5.30501148e-01]
 [-5.55476989e-02  1.62861555e+00 -1.38010146e+00  7.40947804e-02
  -1.70338244e+00]
 [ 1.69645637e+00 -1.28042935e+00  1.75479418e+00  6.06009951e-01
  -2.08192941e+00]
 [ 1.17944012e+00  4.91919172e-01 -1.32023321e+00 -8.98414671e-01
   1.83145877e+00]
 [-7.53736164e-01  8.22060160e-01  1.89679298e+00  4.12780927e-01
  -2.45388116e-01]
 [-5.30996955e-01  2.89168644e-01  2.45530014e+00  4.92451264e-01
  -6.37739984e-01]
 [ 2.59882794e-01  1.30714275e+00 -1.60748323e+00  2.27459935e-01
   1.84633859e-01]
 [-2.81784609e-01  1.57957215e+00 -5.22860027e-01  1.45338448e+00

## when dealing with regression problems you need to scale (normalize) your data 

## we will apply the min_max scaler to our data 

In [4]:
minimum = np.array([np.min(X[:, i]) for i in range(X.shape[1])])
maximum = np.array([np.max(X[:, i]) for i in range(X.shape[1])])
X_scaled = X.copy()


for i in range(X_scaled.shape[0]):
    X_scaled[i,:] = (X_scaled[i,:] - minimum) / (maximum - minimum)


## Here is our scaled data.

In [5]:
print(X_scaled)

[[0.34522048 0.60682563 0.45524742 0.55192097 0.47484168]
 [0.42495104 0.9071746  0.31219033 0.19889515 0.3858609 ]
 [0.28532128 0.61207108 0.56087957 0.37367778 0.47970638]
 [0.31564019 0.55458699 0.51752063 0.51235331 0.30426729]
 [0.31578745 0.69380845 0.68928331 0.41089265 0.37007399]
 [0.39616944 0.90637922 0.32081589 0.4718121  0.16537733]
 [0.66685471 0.29680888 0.86118919 0.58163212 0.09931156]
 [0.58697551 0.66819232 0.3311356  0.27102631 0.78229414]
 [0.28829906 0.73737109 0.88566604 0.54173776 0.41983323]
 [0.32271235 0.62570734 0.98193793 0.55818662 0.35135817]
 [0.44490356 0.83901681 0.28162127 0.5034761  0.49488265]
 [0.36121575 0.89610252 0.4685817  0.75658236 0.38932655]
 [0.49638594 0.80804363 0.37236218 0.38198875 0.57015543]
 [0.56147212 0.61876331 0.72809897 0.55408757 0.75332579]
 [0.43150809 0.74456652 0.59561308 0.36432622 0.24524741]
 [0.51010406 0.45387414 0.47434252 0.97822871 0.64489128]
 [0.45691131 0.53249295 0.76000379 0.47439184 0.5070622 ]
 [0.49542121 0

## Now let's get to the real work

## So you want to develop a linear model that takes a single record with x number of features and predict a target we want this target to be as accurate as possible to the true value 


![SegmentLocal](linearform11.png "segment")

## If you have data and basically you want to fit a line to your data that achieves the least error that the linear model we want.

## If you have data and basically you want to fit a line to your data that achieves the least error that the linear model we want.

## we wil use the MSE (mean squared error)

![SegmentLocal](costform.png "segment")

![SegmentLocal](gradient_descent_parameter_a.gif "segment")

## So we will use the MSE as cost function that 

## remember you can use f = wx + b or f = wx it's all about if you want interception to control the line starting point

In [6]:
## intialization of weights (using random numbers) we will use fit interception
## so we got y =mx+c :: one feature = 2 parameter  
## so our equation num of parameters = num of features + 1 (or num of features only if you will not use the interception)

w = np.array([1,1,1,1,1,1])

In [7]:

def gradient_descent_linear_regression(X, t_label, w, learning_rate = 0.01, precision = 0.0001, max_iter = 1000): 
    ## note the (learning_rate, precision and max_iter) all of them are hyper parameters it's better to try different values of them to achieve the best hyper parameter to get the lowest cost value
    x = np.c_[np.ones(X.shape[0]),X] ## adding new columns of ones to make interception parameter to control the hyperplane location like in sklearn : fit_intercept = True
    y_gt = t_label
    m = x.shape[0]
    initial_weights = w
    
    last_weight = initial_weights + 10 ## inital with any numbers but make sure  to be not close to initial_weights to avoid early convergence 
    weight_hist, cost_hist, iteration_history = [],  [], []
    cur_weight = initial_weights.copy()
    
    def cost_func(w):
        y_pdnew = np.dot(x, w)
        cost = 0
        for i in range(m):
            err = ((y_gt[i] - y_pdnew[i]) ** 2)
            cost += err

        return float((1/(2*m)) * cost) ## MSE cost 


    def gradients(weights): ## partial deravatives of the MSE function based on each feature (gradients)
        y_pd = np.dot(x,weights)
        g = 0
        for i in range(m):
            grad = (y_pd[i] - y_gt[i]) * x[i]
            g+= grad
        return (1/m )* g

    iter = 0
    while np.linalg.norm(last_weight - cur_weight) > precision and iter < max_iter: ## stopping criteria if the change in weights converged the learning will stop  but  you must stay less than the maximum number of iterations.
        
        last_weight = cur_weight 
        weight_hist.append(last_weight) ## history of our parameters
        cost_hist.append(cost_func(last_weight)) ## cost history with each iteration
        iteration_history.append(iter) ## to show which iteration we stopped
        cur_weight = cur_weight - (learning_rate *gradients(cur_weight)) ## the update of weights operation
        
        cost = cost_func(last_weight) ## new update means new cost and error update
        
        
        iter += 1
    return cost

In [8]:
def trial_1():
    cost = gradient_descent_linear_regression(X_scaled,y,w) ## use same hyper parameters
    print(cost)
trial_1()

3128.364966228765


In [9]:
def trial_2():
    cost = gradient_descent_linear_regression(X_scaled,y,w,0.01,0.0001,10000) ## increased the number max iterations
    print(cost)
trial_2()

172.3982712641052


In [10]:
def trial_3():
    cost = gradient_descent_linear_regression(X_scaled,y,w,0.01,0.00001,100000) ## increased the number max iterations and decreased the precision
    print(cost)
trial_3()

0.48897077463957433


## Remember the lower the cost the better