# Chapter 14 - Simple Linear Regression

Understanding the nature of linear relationships in multidimensional data


In [10]:
# imports and ports of past functions
import math
import random

def mean(x):
    """Return the arithmetic mean of the values in x.

    Raises ZeroDivisionError when x is empty.
    """
    total = sum(x)
    count = len(x)
    return total / count

def standard_deviation(x):
    """Return the square root of variance(x)."""
    spread = variance(x)
    return math.sqrt(spread)

def variance(x):
    """Return the sum of squared deviations from the mean, divided by n - 1.

    Raises ZeroDivisionError when x has exactly one element.
    """
    degrees_of_freedom = len(x) - 1
    squared_total = sum_of_squares(de_mean(x))
    return squared_total / degrees_of_freedom

def de_mean(x):
    """Return a new list holding each element of x minus the mean of x."""
    center = mean(x)
    return [value - center for value in x]

def dot(v, w):
    """Return the sum of the pairwise products of v and w.

    Pairs are formed with zip, so extra elements of the longer input are
    ignored.
    """
    products = (a * b for a, b in zip(v, w))
    return sum(products)

def sum_of_squares(v):
    """Return the dot product of v with itself."""
    total = dot(v, v)
    return total

def vector_subtract(v, w):
    """Return the list of element-wise differences v[i] - w[i].

    Pairs are formed with zip, so the result is as long as the shorter input.
    """
    return [left - right for left, right in zip(v, w)]

def magnitude(v):
    """Return the square root of sum_of_squares(v)."""
    squared_length = sum_of_squares(v)
    return math.sqrt(squared_length)

def squared_dist(v, w):
    """Return the sum of squares of the element-wise difference v - w."""
    difference = vector_subtract(v, w)
    return sum_of_squares(difference)

def distance(v, w):
    """Return the magnitude of the element-wise difference v - w."""
    difference = vector_subtract(v, w)
    return magnitude(difference)

def difference_quotient(f, x, h):
    """Return (f(x + h) - f(x)) / h, the slope of f between x and x + h.

    Raises ZeroDivisionError when h is 0.
    """
    rise = f(x + h) - f(x)
    return rise / h

def square(x):
    """Return x multiplied by itself."""
    product = x * x
    return product

def derivative(x):
    """Return 2 * x, the derivative of x * x evaluated at x."""
    doubled = 2 * x
    return doubled

def in_random_order(data):
    """Yield every element of data exactly once, in a shuffled order.

    data is indexed by position, so it must support len() and data[i].
    The input itself is not modified; only a list of positions is shuffled.
    """
    positions = list(range(len(data)))
    random.shuffle(positions)
    for position in positions:
        yield data[position]

def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
    """Minimize the summed target_fn over (x, y) pairs by stochastic gradient descent.

    target_fn(x_i, y_i, theta) returns the value for one data point, and
    gradient_fn(x_i, y_i, theta) returns its gradient as a sequence with one
    entry per component of theta.  theta_0 is the starting parameter
    sequence and alpha_0 is the initial step size.

    Returns the theta that gave the smallest total target value when measured
    at the start of a pass.  The search stops after 100 consecutive passes
    without improvement.
    """
    # Materialize the pairs: zip() is a one-shot iterator in Python 3, but the
    # data is re-read on every pass.
    data = list(zip(x, y))
    theta = theta_0
    alpha = alpha_0

    min_theta, min_value = None, float('inf')
    iterations_with_no_improvement = 0

    # Keep going while we see improvement; when a pass does not improve,
    # shrink the step size and try again, giving up after 100 such passes.
    while iterations_with_no_improvement < 100:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)

        if value < min_value:
            # new best: remember it and restore the original step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            iterations_with_no_improvement += 1
            alpha *= 0.9

        # take one gradient step per data point, visiting points in a fresh
        # random order each pass (sample of full length == shuffled copy)
        for x_i, y_i in random.sample(data, len(data)):
            gradient_i = gradient_fn(x_i, y_i, theta)
            # theta <- theta - alpha * gradient, building a new list so that
            # min_theta is never mutated behind our back
            theta = [theta_j - alpha * gradient_j
                     for theta_j, gradient_j in zip(theta, gradient_i)]

    return min_theta


In [4]:
# linear model

def predict(alpha, beta, x_i):
    """Return the linear model's prediction beta * x_i + alpha."""
    slope_term = beta * x_i
    return slope_term + alpha

def error(alpha, beta, x_i, y_i):
    """Return the residual: the actual y_i minus predict(alpha, beta, x_i)."""
    predicted = predict(alpha, beta, x_i)
    return y_i - predicted

# error over the entire data set from squared errors
def least_squares_fit(x, y):
    """Return (alpha, beta) for the line y = beta * x + alpha fitted to x and y.

    beta is correlation(x, y) scaled by the ratio of the standard deviations
    of y and x; alpha is chosen so the line passes through (mean(x), mean(y)).
    """
    # NOTE(review): correlation is not defined in this section of the
    # notebook -- confirm it is provided before this function is called
    scaled = correlation(x, y) * standard_deviation(y)
    slope = scaled / standard_deviation(x)
    intercept = mean(y) - slope * mean(x)
    return intercept, slope

def sum_of_squared_errors(alpha, beta, x, y):
    """Return the total of the squared residuals over all (x_i, y_i) pairs."""
    squared_residuals = (
        error(alpha, beta, x_i, y_i) ** 2
        for x_i, y_i in zip(x, y)
    )
    return sum(squared_residuals)

# want to minimize error by choosing alpha and beta


In [6]:
# example code on outlierless data in chapter 5
# NOTE(review): num_friends_good and daily_minutes_good are not defined in this
# notebook (see the NameError output below); load them before running this cell

alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good)

NameError: name 'num_friends_good' is not defined

In [7]:
# R-squared from scratch

def total_sum_of_squares(y):
    """Return the sum of the squared deviations of y from its mean."""
    deviations = de_mean(y)
    return sum(deviation ** 2 for deviation in deviations)

def r_squared(alpha, beta, x, y):
    """Return 1 minus the ratio of squared model error to total squared variation in y.

    Raises ZeroDivisionError when every value in y is identical.
    """
    unexplained = sum_of_squared_errors(alpha, beta, x, y)
    total = total_sum_of_squares(y)
    return 1.0 - (unexplained / total)

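# for the least-squares alpha and beta, r squared will always be between 0 and 1
# (an arbitrary alpha and beta can fit worse than the mean and give a negative value)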

In [8]:
# example code on chapter 5 data
# NOTE(review): depends on alpha and beta from the least_squares_fit cell and on
# the chapter 5 data, none of which were defined when this ran (see the
# NameError output below)
r_squared(alpha, beta, num_friends_good, daily_minutes_good)

NameError: name 'alpha' is not defined

In [9]:
# can also solve this with gradient descent, which we built a function for!

# theta = [alpha, beta]
def squared_error(x_i, y_i, theta):
    """Return the squared residual at (x_i, y_i) for theta = [alpha, beta]."""
    intercept, slope = theta
    residual = error(intercept, slope, x_i, y_i)
    return residual ** 2

def squared_error_gradient(x_i, y_i, theta):
    """Return the gradient of the squared residual with respect to [alpha, beta]."""
    intercept, slope = theta
    residual = error(intercept, slope, x_i, y_i)
    alpha_partial = -2 * residual
    beta_partial = -2 * residual * x_i
    return [alpha_partial, beta_partial]


In [13]:
# example with chapter 5 data
# NOTE(review): num_friends_good and daily_minutes_good are not defined in this
# notebook (see the NameError output below); load them before running this cell

random.seed(0)  # fixed seed so the starting guess is the same on every run
theta = [random.random(), random.random()]  # random starting guess for [alpha, beta]
# the final argument (.0001) is alpha_0, the initial step size
alpha, beta = minimize_stochastic(squared_error, squared_error_gradient, num_friends_good, daily_minutes_good, theta, .0001)

print(alpha, beta)

NameError: name 'num_friends_good' is not defined

In [14]:
# minimizing the sum of squared errors is equivalent to maximizing the likelihood of the observed data

## This concludes chapter 14

For further exploration, move to the next chapter and learn about multiple regression!