In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets import california_housing

In [3]:
calihousing = california_housing.fetch_california_housing()

In [4]:
# Wrap the feature matrix in a DataFrame labelled with the dataset's feature names.
data = pd.DataFrame(data=calihousing.data, columns=calihousing.feature_names)
target = calihousing.target

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.25,
    random_state=42,
)

In [5]:
def addConstantFunc(X):
    """Return a copy of ``X`` with an intercept column named ``'constant'``.

    The column is filled with ones so that its coefficient plays the role of
    the intercept. The input is copied first, so the caller's object is never
    modified. If ``X`` already has a ``'constant'`` column it is simply
    overwritten in the copy, which makes repeated calls harmless.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``X`` plus ``'constant'``.
    """
    # Work on a copy: assigning the column on X itself would leak the extra
    # column into the caller's frame (e.g. X_train / X_test).
    X = X.copy()

    #add a constant
    X['constant'] = np.ones(len(X))

    return X


def calculate_cost_function(X,y, coefficients):
    """
    Half mean squared error of the linear model on (X, y).

    The intercept column is added to X via addConstantFunc, then the cost is
    sum((X . coefficients - y) ** 2) / (2 * len(y)).
    """
    #add a constant
    design = addConstantFunc(X)

    residuals = design.dot(coefficients) - y
    n_samples = len(y)

    return np.sum(residuals ** 2) / (2 * n_samples)


def LinearRegression(X,y, alpha, n_iterations, step_loss = True):
    """
    Fit a linear model with batch gradient descent.

    An intercept column is added to X, every coefficient starts at zero, and
    the coefficients are updated n_iterations times using the gradient of the
    half mean squared error.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample (the intercept column is added here).
    y : array-like
        Target values, one per row of X.
    alpha : float
        Learning rate, i.e. the step size applied to the gradient.
    n_iterations : int
        Number of gradient descent updates to perform.
    step_loss : bool, default True
        If true, the cost recorded after every update is returned as well.

    Returns
    -------
    (coefficients, cost_history) when step_loss is true, otherwise just
    coefficients. cost_history is a list with one cost value per iteration.
    """
    X = addConstantFunc(X)
    n_samples = len(y)

    #create base intercept: one zero per column, including the constant
    coefficients = np.zeros(X.shape[1])

    cost_history = []

    for _ in range(n_iterations):

        # residuals of the current fit
        loss = X.dot(coefficients) - y

        gradient = X.T.dot(loss) / n_samples

        coefficients = coefficients - alpha * gradient

        # cost is measured after the update, once per step
        cost_history.append(calculate_cost_function(X, y, coefficients))

    if step_loss:
        return (coefficients, cost_history)

    return coefficients


def predict(X, coefficients):
    """
    Predictions of the linear model for X: the intercept column is added via
    addConstantFunc and the result is multiplied by the coefficients.
    """
    return addConstantFunc(X).dot(coefficients)

In [6]:
def calculate_adjr_squared(data,tree):

    '''
    Adjusted R squared of a fitted model on ``data``.

    adjusted r square penalizes for adding independent variables that do not fit the model:

        1 - (1 - R^2) * (n - 1) / (n - k - 1)

    where n is the number of rows and k the number of predictors.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by the label as the LAST column.
    tree : array-like
        Fitted model parameters, passed straight to ``predict`` as its
        coefficients argument (the name is kept for existing callers).

    Returns
    -------
    float
        The adjusted R squared.
    '''

    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1]

    # predict works on a whole feature frame at once; the label column must
    # not be handed to it as if it were a feature.
    predictions = predict(features, tree)

    ss_res = ((labels - predictions) ** 2).sum()
    ss_tot = ((labels - labels.mean()) ** 2).sum()
    r_squared = 1 - (ss_res / ss_tot)

    n = data.shape[0]
    # k counts predictors only: neither the label column nor an intercept
    # column ('constant', as added by addConstantFunc) is a predictor.
    k = features.shape[1] - int('constant' in features.columns)

    adjusted_r_squared = 1 - ((1 - r_squared) * (n - 1) / (n - k - 1))
    return adjusted_r_squared
