- Ten baseline variables (age, sex, body mass index, average blood pressure, and six blood serum measurements) were obtained for each of n = 442 diabetes patients, along with the response of interest: a quantitative measure of disease progression one year after baseline.

- Number of Instances: 442

- Number of Attributes: First 10 columns are numeric predictive values

- Target: Column 11 is a quantitative measure of disease progression one year after baseline

- Attribute Information: 
1. age - age in years
2. sex - gender
3. bmi - body mass index
4. bp - average blood pressure
5. s1 - tc, total serum cholesterol
6. s2 - ldl, low-density lipoproteins
7. s3 - hdl, high-density lipoproteins
8. s4 - tch, total cholesterol / HDL
9. s5 - ltg, possibly log of serum triglycerides level
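10. s6 - glu, blood sugar level

Note: the values returned by `load_diabetes` are not in these raw units. By default each feature column is mean-centered and scaled so that its sum of squares equals 1, which is why the feature matrix below contains small values around zero.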

In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

In [4]:
# Load the diabetes dataset as a (features, target) pair instead of a Bunch object.
X,y = load_diabetes(return_X_y = True)

In [5]:
# Display the feature matrix (NumPy elides the middle rows/columns in its repr).
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]], shape=(442, 10))

In [6]:
# Dimensions of the feature matrix as (n_samples, n_features).
X.shape

(442, 10)

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [19]:
# Report the (rows, columns) of the training and test feature matrices, in that order.
for split_features in (X_train, X_test):
    print(split_features.shape)

(353, 10)
(89, 10)


In [21]:
# Model Building
# Fit scikit-learn's LinearRegression on the training split. Its coefficients,
# intercept and score are the reference values the hand-written MyLR class
# further down is compared against.

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

In [22]:
# Predict the target for the held-out test rows with the scikit-learn model.
y_pred = lr.predict(X_test)

In [24]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the scikit-learn predictions on the test split.
r2_score(y_test, y_pred)

0.4399338661568969

In [25]:
# Fitted weights, one per feature column.
lr.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [26]:
# Fitted bias (intercept) term.
lr.intercept_

np.float64(151.88331005254167)

# Making Our Own Linear Regression Class

In [67]:
class MyLR:
    """Ordinary least-squares linear regression with an intercept term.

    Mirrors the small part of scikit-learn's ``LinearRegression`` interface used
    in this notebook: ``fit`` / ``predict`` plus the ``coef_`` and ``intercept_``
    attributes, which stay ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        # Per-feature weights and the bias term; populated by fit().
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X_train, y_train):
        """Estimate the intercept and coefficients from the training data.

        Parameters
        ----------
        X_train : array-like, one row per sample and one column per feature.
        y_train : array-like, one target value per row of ``X_train``.

        The least-squares problem is solved directly on the design matrix with
        ``np.linalg.lstsq`` rather than by inverting ``X.T @ X``. Explicit
        inversion fails (or silently returns garbage) when features are
        collinear and squares the condition number of the problem; ``lstsq``
        yields the same answer on well-conditioned data and the minimum-norm
        solution when the matrix is rank-deficient.
        """
        # Prepend a column of ones so that the first beta is the intercept.
        design = np.insert(X_train, 0, 1, axis=1)

        # rcond=None selects NumPy's machine-precision-based rank cutoff.
        betas, *_ = np.linalg.lstsq(design, y_train, rcond=None)
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]

    def predict(self, X_test):
        """Return the predicted targets ``X_test @ coef_ + intercept_``.

        Parameters
        ----------
        X_test : array-like with the same number of columns used in ``fit``.
        """
        y_pred = np.dot(X_test, self.coef_) + self.intercept_
        return y_pred

In [68]:
# Instantiate the hand-written regressor.
mlr = MyLR()

In [69]:
# Fit it on the same training split that was used for the scikit-learn model.
mlr.fit(X_train,y_train)

In [66]:
# Shape of the training features before any intercept column is added.
X_train.shape

(353, 10)

In [47]:
# Inserting the value 1 at column index 0 prepends a column of ones, so the
# column count grows by one; X_train itself is not modified.
np.insert(X_train, 0, 1, axis=1).shape

(353, 11)

In [71]:
# Predict with the custom model. Note: this rebinds y_pred, replacing the
# scikit-learn predictions computed earlier.
y_pred = mlr.predict(X_test)

In [72]:
# R^2 of the custom model's predictions on the test split.
r2_score(y_test, y_pred)

0.43993386615689756

In [73]:
# Weights learned by the custom model, one per feature column.
mlr.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [74]:
# Intercept learned by the custom model.
mlr.intercept_

np.float64(151.8833100525417)