## Linear Regression

$$
f(x) = \beta_0 + \sum^p_{j=1}x_j \beta_j 
$$
We want to minimize the residual sum of squares (RSS). To do so, we first need to write it down.
$$
\begin{aligned}
S(\beta) &= \sum^N_{i = 1}(y_i - f(x_i))^2 \\
        &= \sum^N_{i = 1} \big( y_i - \beta_0 - \sum^p_{j=1}x_{ij}\beta_j \big)^2
\end{aligned}
$$
* We add a constant column of ones to $X$ as column 0 (so that $x_{i0} = 1$ multiplies $\beta_0$); the formula then becomes:
$$
\begin{aligned}
S(\beta) &= \sum^N_{i=1} \big(y_i - \sum^p_{\color{red}{j=0}}x_{ij}\beta_j \big) ^2 \\
&= (y - X\beta)^T (y - X\beta) \\
&= y^Ty - y^T X\beta - \beta^T X^T y + \beta^T X^T X \beta
\end{aligned}
$$
* Minimizing and getting the estimated $\beta$ formula
$$
\begin{aligned}
\frac{\partial S}{\partial \beta} &= \frac{\partial (y^Ty - y^T X\beta - \beta^T X^T y + \beta^T X^T X \beta)}{\partial \beta} \\
&= -2X^T y + 2X^T X \beta = 0 \\
\implies X^T X \beta &= X^T y \\
\hat{\beta} &= (X^T X)^{-1} X^T y
\end{aligned}
$$
* To predict the target value for a new vector $x$:
$$
\hat{y} = \hat{f}(x) = (1:x)^T \hat{\beta}
$$

In [1]:
import numpy as np

In [2]:
X = np.array([[-1.75,  1.15,  0.98,  0.22, -0.19, -0.46, -0.58,  0.67, -0.53,-0.44],
                 [3.34, 2.75, 3.51, 1.93, 3.26, 3.44, 3.82, 2.9 , 4.03, 1.88]]).T
X

array([[-1.75,  3.34],
       [ 1.15,  2.75],
       [ 0.98,  3.51],
       [ 0.22,  1.93],
       [-0.19,  3.26],
       [-0.46,  3.44],
       [-0.58,  3.82],
       [ 0.67,  2.9 ],
       [-0.53,  4.03],
       [-0.44,  1.88]])

In [3]:
y = np.array([-19.56,  10.54,   5.53,   0.5 ,  -5.24,  -7.54,  -9.71,   5.26, -10.69,  -5.1])
y

array([-19.56,  10.54,   5.53,   0.5 ,  -5.24,  -7.54,  -9.71,   5.26,
       -10.69,  -5.1 ])

In [4]:
class LinearRegression():
    """Ordinary least-squares linear regression with an intercept term.

    The fitted coefficients are stored in ``self.beta`` with the intercept at
    index 0, followed by one coefficient per column of the training matrix.
    """

    def __init__(self):
        # Set by fit(); None means the model has not been fitted yet.
        self.beta = None
    
    def fit(self, X, y, pinv=False):
        """
        Estimate beta from the normal equations X^T X beta = X^T y.

        Args:
        X -> matrix of values, shape (N, p); a column of ones is prepended
             internally so that beta[0] acts as the intercept
        y -> corresponding target values, with N rows
        pinv -> flag that indicates the use of pseudo inverse matrix calculation
                (needed when X^T X is singular, e.g. perfectly collinear columns)
        
        Return:
        Beta (intercept first, then one coefficient per column of X)

        Raises:
        ValueError -> X is not 2-D, or X and y have a different number of rows
        numpy.linalg.LinAlgError -> X^T X is singular and pinv is False
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (samples x features), got {X.ndim}-D")
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError("X and y must have the same number of rows")
        # Prepend the constant column that multiplies beta_0.
        X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)
        XTX = X.T @ X
        XTy = X.T @ y
        if not pinv:
            # Solve the linear system instead of forming the explicit inverse:
            # same solution, better numerical behaviour, and it still raises
            # LinAlgError when X^T X is singular.
            self.beta = np.linalg.solve(XTX, XTy)
        else:
            # The pseudo-inverse yields the minimum-norm solution when X^T X
            # cannot be inverted.
            self.beta = np.linalg.pinv(XTX) @ XTy
        return self.beta
    
    def predict(self, X):
        """
        Args:
        X -> matrix of values, shape (N, p), or a single sample of shape (p,)
        
        return
        predicted value y_hat (one value per row of X, or a scalar for a
        single sample)

        Raises:
        RuntimeError -> the model has not been fitted yet
        """
        if self.beta is None:
            raise RuntimeError("LinearRegression.predict called before fit")
        X = np.asarray(X, dtype=float)
        # beta[0] is the intercept; the remaining entries weight the features.
        return self.beta[0] + np.dot(X, self.beta[1:])

    def variance(self, X, y):
        """
        Estimate the residual variance as RSS / (N - p - 1).

        Args:
        X -> matrix of values, shape (N, p)
        y -> corresponding target values

        Return:
        Variance        

        Raises:
        ValueError -> X is not 2-D, or there are not enough rows to leave a
                      positive number of degrees of freedom (N - p - 1 <= 0)
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (samples x features), got {X.ndim}-D")
        N, p = X.shape
        dof = N - p - 1
        if dof <= 0:
            raise ValueError(
                f"need more than p + 1 = {p + 1} samples to estimate the variance, got {N}"
            )
        residuals = y - self.predict(X)
        return np.sum(residuals ** 2) / dof

        

In [5]:
lr = LinearRegression()

In [6]:
beta = lr.fit(X, y) #same results of beta as the professor

In [7]:
lr.predict([-1.75, 3.34])

-20.48600365967664

In [8]:
def mse(y, y_pred):
    """Mean squared error between targets and predictions.

    Args:
    y -> true target value(s): a scalar or an array-like
    y_pred -> predicted value(s), broadcastable against y

    Return:
    Mean of the squared differences (y - y_pred)**2
    """
    # Coerce both inputs so plain Python lists work too; subtracting two
    # lists directly raises TypeError.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y - y_pred) ** 2)

In [9]:
y[0]

-19.56

In [10]:
mse(y[0], lr.predict([-1.75, 3.34]))

0.857482777734534

### Correlated columns

In [11]:
X2 = np.ones((10,2))

In [12]:
# Set the second column to the constant 2 directly. Assigning the value
# (instead of doubling the column in place) keeps this cell idempotent:
# re-running it cannot turn the column into 4, 8, ...
X2[:, 1] = 2

In [13]:
X2

array([[1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.]])

In [14]:
# Targets are the row sums of X2 plus a small random perturbation scaled by 0.01.
# NOTE(review): np.random.seed is only called in a later cell, so this noise
# (and every output derived from y2) changes on each fresh run.
y2 = X2.sum(axis=1) + np.random.random((X2.shape[0])) * 0.01
y2

array([3.00402179, 3.0032624 , 3.00240856, 3.00295596, 3.00565385,
       3.00195023, 3.00258199, 3.00719405, 3.0085365 , 3.00646698])

In [15]:
lr2 = LinearRegression()

In [16]:
# This fit is expected to fail: the collinear columns make X^T X singular.
# Catch and show the error so that "Restart & Run All" does not stop here.
try:
    lr2.fit(X2, y2)
except np.linalg.LinAlgError as err:
    print(f"LinAlgError: {err}")

LinAlgError: Singular matrix

* The error is caused by the perfectly collinear columns, which make $X^T X$ singular. One way around it is to use the pseudo-inverse instead:

In [17]:
lr2.fit(X2, y2, pinv=True)

array([0.50075054, 0.50075054, 1.00150108])

In [18]:
lr2.predict([1, 2])

3.0045032293615934

### Real dataset

In [19]:
from sklearn.datasets import load_iris, load_breast_cancer

In [20]:
np.random.seed(42)

In [21]:
X_breast,y_breast = load_breast_cancer(return_X_y=True)

In [22]:
X_breast.shape

(569, 30)

In [23]:
y_breast.shape

(569,)

In [24]:
# Glue the target vector onto the feature matrix as the last column so that
# features and labels stay aligned when the rows are shuffled.
breast_cancer_dataset = np.column_stack((X_breast, y_breast))
breast_cancer_dataset.shape

(569, 31)

In [25]:
np.random.shuffle(breast_cancer_dataset)

In [26]:
breast_cancer_dataset.shape

(569, 31)

In [27]:
train_set = breast_cancer_dataset[:530]
train_set.shape

(530, 31)

In [28]:
test_set = breast_cancer_dataset[530:]
test_set.shape

(39, 31)

In [29]:
X_train, y_train = train_set[:,:30], train_set[:,-1]
print(X_train.shape, "\n",y_train.shape)

(530, 30) 
 (530,)


In [30]:
X_test, y_test = test_set[:,:30], test_set[:,-1]
print(X_test.shape, "\n",y_test.shape)

(39, 30) 
 (39,)


In [31]:
lr3 = LinearRegression()

In [32]:
lr3.fit(X_train, y_train)

array([ 2.98138703e+00,  1.66052735e-01, -7.95833945e-03, -1.64299058e-02,
       -2.78771833e-04,  4.59180117e-03,  3.55216500e+00, -1.55700652e+00,
       -1.55677866e+00, -9.20449605e-02,  3.21729801e+00, -4.47340609e-01,
        2.86419452e-03,  2.08389151e-02,  9.89951065e-04, -1.50522111e+01,
        4.86825779e-01,  3.36112709e+00, -1.09886925e+01, -2.68350513e+00,
        1.12946630e+01, -1.97167639e-01, -5.05767537e-03,  2.96539457e-03,
        9.96599344e-04, -6.68910603e-01,  1.89288509e-02, -4.06618832e-01,
       -4.06044512e-01, -4.72963544e-01, -5.58660442e+00])

In [33]:
y_pred = lr3.predict(X_test)

In [34]:
mse(y_test, y_pred)

0.05468007839899052

In [35]:
# Show each true label next to the model's prediction for it.
for real, predicted in zip(y_test, y_pred):
    print(f"real = {real} predicted = {predicted}")

real = 1.0 predicted = 0.9588469449417425
real = 1.0 predicted = 1.2130753702572001
real = 0.0 predicted = -0.15107299165410604
real = 1.0 predicted = 0.7058384310126482
real = 1.0 predicted = 0.7255783401265514
real = 1.0 predicted = 1.1931227611440174
real = 1.0 predicted = 0.8519273650919374
real = 1.0 predicted = 1.0945338824282227
real = 1.0 predicted = 0.8834561879541742
real = 1.0 predicted = 0.972540777906302
real = 1.0 predicted = 0.9427805694384017
real = 1.0 predicted = 1.0875409772320794
real = 0.0 predicted = -0.3843998927895975
real = 1.0 predicted = 1.1500305175964367
real = 1.0 predicted = 1.2230480677916873
real = 1.0 predicted = 1.0438041629429422
real = 1.0 predicted = 0.8361557309856069
real = 1.0 predicted = 1.1155767343618397
real = 1.0 predicted = 0.8276986031831406
real = 0.0 predicted = 0.4330782180242605
real = 1.0 predicted = 0.5419091887401639
real = 1.0 predicted = 0.7322288047297554
real = 0.0 predicted = -0.13117762487675266
real = 1.0 predicted = 1.10201

### Coefficient significance

In [37]:
# Residual variance estimate RSS / (N - p - 1) of the first toy model `lr`.
# NOTE(review): despite its name, `alpha` holds a variance here, not a
# significance level.
alpha = lr.variance(X, y)