In [12]:
import pandas as pd 
import numpy as np 
from sklearn.datasets import load_diabetes
import nbimporter
from ML_classes import LinearRegression
from sklearn.linear_model import LinearRegression as LR
import warnings 


warnings.filterwarnings(action="ignore")

In [4]:
# load_diabetes(as_frame=True) returns a Bunch whose .data and .target are pandas objects.
diabetes_df = load_diabetes(as_frame=True)
# Build a separate frame with the target appended; assign() returns a new
# DataFrame, so diabetes_df.data keeps only the feature columns.
diabetes = diabetes_df.data.assign(target=diabetes_df.target)
diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


### 1 Revise the Python scripts you wrote for simple linear regression so that they perform multiple linear regression (MLR).

In [5]:
class MLR(LinearRegression):
    """Multiple linear regression fitted with batch gradient descent.

    The prediction for a row is ``theta_vector . [bias, x_1, ..., x_n]``:
    ``bias`` is a constant placed in front of every feature row and
    ``theta_vector[0]`` is the weight that multiplies it.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns, plus a ``"target"`` column when ``target`` is true.
    bias : float, default 1
        Constant prepended to every feature row.
    target : bool, default True
        Whether ``data`` carries a ``"target"`` column that must be split off.
    alpha : float, default 0.01
        Learning rate used by ``update_T``.
    """

    def __init__(self,data,bias=1,target=True,alpha=0.01):
        self.alpha = alpha
        self.bias = bias
        self.target = target
        # With target=False there is nothing to fit against; the attribute is
        # still defined so the instance is usable for linear_combination().
        target_vector = None
        if target:
            target_vector = data["target"]
            data = data.drop(columns="target")
        self.data = data
        # One weight per feature plus one for the bias constant, all starting at 0.
        self.theta_vector = np.zeros(self.data.shape[1] + 1)
        self.target_vector = target_vector

    def _design_matrix(self):
        """Return the features as a float matrix with the bias constant as column 0."""
        features = np.asarray(self.data, dtype=float)
        return np.insert(features, 0, self.bias, axis=1)

    def _targets(self):
        """Return the targets as a float array in row order.

        Raises
        ------
        ValueError
            If the model was built with ``target=False``.
        """
        if self.target_vector is None:
            raise ValueError("MLR was created with target=False; no target values are available")
        return np.asarray(self.target_vector, dtype=float)

    def get_features(self,index):
        """Return row ``index`` (positional) as a float array with the bias constant prepended."""
        # Convert to a float ndarray first so a fractional bias is not cast to
        # an integer dtype and np.insert always works on a plain array.
        row = np.asarray(self.data.iloc[index, :], dtype=float)
        return np.insert(row, 0, self.bias)

    def linear_combination(self,index):
        """Return the model output ``theta . [bias, features]`` for row ``index`` (positional)."""
        return np.dot(self.theta_vector, self.get_features(index))

    def J(self):
        """Return the cost ``sum((prediction - target)**2) / (2 * n_rows)`` over all rows."""
        residuals = self._design_matrix() @ self.theta_vector - self._targets()
        return (1/(2*len(residuals))) * np.dot(residuals, residuals)

    def update_T(self):
        """Apply one batch gradient-descent step to ``theta_vector`` in place."""
        design = self._design_matrix()
        residuals = design @ self.theta_vector - self._targets()
        # Entry j is sum_i(residual_i * x_ij), i.e. n_rows times dJ/dtheta_j.
        gradient = design.T @ residuals
        self.theta_vector -= (self.alpha / len(design)) * gradient

    def iterations(self, n=100):
        """Run ``n`` gradient-descent steps.

        Returns
        -------
        dict
            Maps ``'<cost>_<iteration>'`` to a copy of ``theta_vector`` taken
            right after that step.
        """
        cost_dict = {}
        for iteration in range(1, n + 1):
            self.update_T()
            cost = self.J()
            cost_dict[f'{cost}_{iteration}'] = self.theta_vector.copy()
        return cost_dict

    def stopping_criteria(self,dJgT=True,threshold=10,max_iterations=1000):
        """Run gradient descent until the cost stops changing or the cap is hit.

        Parameters
        ----------
        dJgT : bool, default True
            Initial state of the "change in J is still greater than threshold"
            flag; passing ``False`` skips the loop entirely.
        threshold : float, default 10
            Stop once ``abs(cost - previous cost)`` falls below this value.
        max_iterations : int, default 1000
            Upper bound on the number of update steps.

        Returns
        -------
        numpy.ndarray
            Copy of the parameter vector that produced the lowest cost seen.
            If no step ran, a copy of the current parameters.

        Prints the number of steps taken and the lowest cost.
        """
        best_cost = None
        best_theta = None
        previous_cost = None
        iteration = 0
        while dJgT and (iteration < max_iterations):
            self.update_T()
            cost = self.J()
            # "<=" keeps the most recent parameters when a cost value repeats.
            if best_cost is None or cost <= best_cost:
                best_cost = cost
                best_theta = self.theta_vector.copy()
            # Compare against the immediately preceding cost. Tracking it in a
            # variable (instead of as dictionary keys) means a repeated cost
            # value is still seen as a zero change and ends the loop.
            if previous_cost is not None and abs(cost - previous_cost) < threshold:
                dJgT = False
            previous_cost = cost
            iteration += 1
        print(f'iteration_n:{iteration}')
        if best_theta is None:
            # No update ran (dJgT was False or max_iterations <= 0): report the
            # current state instead of failing on an empty history.
            best_cost = self.J()
            best_theta = self.theta_vector.copy()
        print(best_cost)
        return best_theta



    

        

### 2 Perform MLR on the diabetes dataset using your MLR scripts, predict the target (the last column named target), and compute the R2 value between the given target and your predicted target value.

In [6]:
# MLR.normalization is inherited from ML_classes.LinearRegression (defined outside this notebook).
diabetes_norm = MLR.normalization(diabetes)
# Gradient-descent model over the normalized frame; bias constant 1, learning rate 0.01.
x = MLR(diabetes_norm, alpha=0.01, bias=1)
# Mean of the normalized target, used later as the baseline for the total sum of squares.
y_bar = diabetes_norm["target"].mean()

In [7]:
# Train until the cost changes by less than 1e-12 between steps, or until the
# default cap of 1000 iterations is reached.
trained_parameters = x.stopping_criteria(threshold=1e-12)

iteration_n:1000
0.24272933145109427


### 3 Perform MLR on the diabetes dataset using sklearn, predict the target, and compute the R2 value between the given target and your predicted target value that sklearn has generated.

In [9]:
# Split the fitted vector without overwriting trained_parameters, so running
# this cell again gives the same result instead of dropping another coefficient.
bias = trained_parameters[0]
feature_weights = trained_parameters[1:]

# Score on the normalized frame: it is the data the model was trained on and
# the data y_bar was computed from. Columns are selected by name rather than
# by position, so the target does not have to be the last column.
features = diabetes_norm.drop(columns="target").to_numpy()
observed = diabetes_norm["target"].to_numpy()
predicted = features @ feature_weights + bias

SSE = np.sum((observed - predicted) ** 2)  # residual sum of squares
SST = np.sum((observed - y_bar) ** 2)      # total sum of squares around the target mean




In [10]:
# Coefficient of determination: 1 minus the ratio of the residual sum of
# squares (SSE) to the total sum of squares (SST).
R_2 = 1 - SSE/SST
R_2

0.5145413370978114

In [13]:
# return_X_y=True yields the feature matrix and target directly, so the
# DataFrame bound to `diabetes` earlier in the notebook is not replaced by a
# Bunch and the earlier cells stay re-runnable.
X, y = load_diabetes(return_X_y=True)
lm = LR()

# fit() returns the fitted estimator itself.
reg = lm.fit(X, y)

# For a regressor, score() is the R^2 of the predictions on (X, y).
reg.score(X, y)

0.5177484222203498

### 4 Compare the predicted target produced by your scripts with the predicted target produced by sklearn. Are they very similar or very different?
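- The R² scores are extremely similar: 0.5145 from my gradient-descent MLR versus 0.5177 from sklearn, so the two models explain nearly the same share of the variance in the target.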

In [14]:
# Line 1: my coefficient of determination; line 2: sklearn's coefficient of determination.
print(R_2, reg.score(X, y), sep="\n")

0.5145413370978114
0.5177484222203498


In [None]:
def initalize(data=None,n_features=None,bias=1,target=True):
    """Build the starting parameter vector ``[bias, 0, ..., 0]``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose feature columns determine the vector length. When given,
        it takes precedence over ``n_features``.
    n_features : int, optional
        Number of feature weights to create when ``data`` is not supplied.
    bias : float, default 1
        Value stored in position 0 of the returned vector.
    target : bool, default True
        Whether ``data`` contains a ``"target"`` column that is not a feature.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_features + 1``.

    Raises
    ------
    KeyError
        If ``target`` is true and ``data`` has no ``"target"`` column.
    ValueError
        If neither ``data`` nor ``n_features`` is provided.
    """
    if data is not None:
        if target:
            data = data.drop(columns="target")
        # Use the column count instead of indexing a row, so a one-row frame works.
        n_features = data.shape[1]
    elif n_features is None:
        raise ValueError("initalize needs either data or n_features")
    # Start from zeros: uninitialized memory would make training depend on
    # whatever values happened to be in the buffer.
    theta_vector = np.zeros(n_features)
    # Put the bias term in front of the feature weights.
    theta_vector = np.insert(theta_vector,0,bias)

    return theta_vector

def linear_combination(data,index,theta,bias):
    """Return ``theta . [bias, features of row index]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns; a ``"target"`` column, if present, is excluded.
    index : int
        Positional row number.
    theta : array-like
        Parameter vector with one entry for the bias followed by one per feature.
    bias : float
        Constant placed in front of the row's features.
    """
    # Select the row first, then drop the target from that single row, rather
    # than copying the whole frame on every call.
    row = data.iloc[index,:].drop(labels="target", errors="ignore")
    # Cast to float before inserting so a fractional bias is not truncated
    # when the feature columns are integers.
    x_features = np.insert(np.asarray(row, dtype=float),0,bias)

    return np.dot(theta,x_features)