# Regression Week 1: Simple Linear Regression

# Fire up imports

In [70]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

# Load house sales data

In [71]:
# `sales` is loaded for reference; the models below are fit and scored on the
# separate train/test CSV files.
sales = pd.read_csv('kc_house_data.csv')

# Use the pre-split files instead of a random split of `sales`
# (alternative: train_data, test_data = train_test_split(sales, test_size=0.2)).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

# Build a generic simple linear regression function

In [72]:
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by ordinary least squares.

    The closed-form solution is evaluated in mean-centred form,
    ``slope = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)``,
    which is algebraically the same as the raw-sum formula but avoids
    subtracting two very large, nearly equal sums.

    Parameters
    ----------
    input_feature : array-like of numbers
        Values of the single explanatory variable. Flattened to 1-D.
    output : array-like of numbers
        Observed targets, one per element of ``input_feature``. Flattened to 1-D.

    Returns
    -------
    tuple
        ``(intercept, slope)`` of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, have different numbers of elements, or the
        feature takes a single value (the slope is then undefined).
    """
    # Work in float64: integer arrays would otherwise wrap around silently
    # when products such as x*y are formed.
    x = np.asarray(input_feature, dtype=float).ravel()
    y = np.asarray(output, dtype=float).ravel()

    if x.size == 0:
        raise ValueError("simple_linear_regression requires at least one observation")
    if x.size != y.size:
        raise ValueError(
            "input_feature and output must have the same number of elements "
            "(got %d and %d)" % (x.size, y.size)
        )
    # Exact comparison of the extremes detects a constant feature without
    # being fooled by rounding in the mean.
    if x.min() == x.max():
        raise ValueError("input_feature must contain at least two distinct values")

    x_mean, y_mean = x.mean(), y.mean()
    x_centered = x - x_mean
    slope = np.dot(x_centered, y - y_mean) / np.dot(x_centered, x_centered)
    intercept = y_mean - slope * x_mean
    return (intercept, slope)

We can test that our function works by passing it something where we know the answer. In particular, we can generate a feature and then put the output exactly on a line: output = 3 + 2*input_feature. Then we know the intercept should be 3 and the slope should be 2.

In [73]:
# Synthetic data lying exactly on the line y = 3 + 2*x.
test_feature = np.array(range(5))
test_output = 3 + 2*test_feature
test_intercept, test_slope = simple_linear_regression(test_feature, test_output)
print ("Intercept: " + str(test_intercept))
print ("Slope: " + str(test_slope))

# Fail loudly if the fit does not recover the known coefficients, rather than
# relying on a visual check of the printed values.
np.testing.assert_allclose([test_intercept, test_slope], [3.0, 2.0])

Intercept: 3.0
Slope: 2.0


Now that we know it works, let's build a regression model for predicting price based on sqft_living. Remember that we train on train_data!

In [74]:
# Fit price against living area using the training split only.
train_sqft_living = train_data['sqft_living']
train_price = train_data['price']
(sqft_intercept, sqft_slope) = simple_linear_regression(train_sqft_living, train_price)

print("Intercept: " + str(sqft_intercept))
print("Slope: " + str(sqft_slope))

Intercept: -47116.0790729
Slope: 281.95883963


# Predicting Values

In [75]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + slope * input_feature``.

    ``input_feature`` may be a single number or anything that supports
    multiplication by ``slope`` and addition of ``intercept``; the result has
    whatever type those operations produce.
    """
    return intercept + slope * input_feature

#### Quiz Question: Using your Slope and Intercept from (4), What is the predicted price for a house with 2650 sqft?

In [76]:
# Predict the price of a 2650 sqft house with the sqft_living model.
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)
price_report = "The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price)
print(price_report)

The estimated price for a house with 2650 squarefeet is $700074.85


# Residual Sum of Squares

In [77]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a simple linear model.

    Computes ``sum((output - (intercept + slope * input_feature))**2)``.

    Parameters
    ----------
    input_feature : number or array-like of numbers
        Feature values the model is evaluated on.
    output : number or array-like of numbers
        Observed targets, matched to ``input_feature`` by position.
    intercept, slope : number
        Coefficients of the fitted line.

    Returns
    -------
    float
        The residual sum of squares (a NumPy float64 scalar).

    Notes
    -----
    Inputs are converted to float arrays, so plain lists and scalars are
    accepted and pairing is purely positional. The reduction is done by NumPy
    instead of the builtin ``sum``, which iterates element by element in
    Python and cannot handle a scalar argument.
    """
    features = np.asarray(input_feature, dtype=float)
    targets = np.asarray(output, dtype=float)
    # ravel() lets a 0-d (scalar) residual go through the same dot product.
    residuals = np.ravel(targets - (intercept + slope * features))
    return np.dot(residuals, residuals)

Let's test our get_residual_sum_of_squares function by applying it to the test model where the data lie exactly on a line. Since they lie exactly on a line the residual sum of squares should be zero

In [78]:
test_rss = get_residual_sum_of_squares(test_feature, test_output, test_intercept, test_slope)
print(test_rss) #should be 0.0

# Make the expectation executable: the synthetic data lie on the fitted line,
# so anything other than (numerically) zero is a bug.
np.testing.assert_allclose(test_rss, 0.0, atol=1e-9)

0.0


#### Quiz Question: According to this function and the slope and intercept from the squarefeet model, what is the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data?

In [79]:
# RSS of the sqft_living model on the data it was trained on.
train_sqft_living = train_data['sqft_living']
train_price = train_data['price']
rss_prices_on_sqft = get_residual_sum_of_squares(train_sqft_living, train_price, sqft_intercept, sqft_slope)
print('The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft))

The RSS of predicting Prices based on Square Feet is : 1.20191835418e+15


# Predict the squarefeet given price

In [80]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: return the input for which the model predicts ``output``.

    Solves ``output = intercept + slope * input`` for ``input``.
    """
    return (output - intercept) / slope

#### Quiz Question: According to this function and the regression slope and intercept from (3) what is the estimated square-feet for a house costing $800,000? 

In [81]:
# Estimate the living area of an $800,000 house by inverting the sqft_living model.
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)
sqft_report = "The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet)
print(sqft_report)

The estimated squarefeet for a house worth $800000.00 is 3004


# New Model: estimate prices from bedrooms

We have made one model for predicting house prices using squarefeet, but there are many other features in the sales DataFrame. Use your simple linear regression function to estimate the regression parameters for predicting Prices based on number of bedrooms. Use the training data!

In [82]:
# Fit price against number of bedrooms using the training split only.
train_bedrooms = train_data['bedrooms']
train_price = train_data['price']
(bedroom_intercept, bedroom_slope) = simple_linear_regression(train_bedrooms, train_price)

print("Intercept: " + str(bedroom_intercept))
print("Slope: " + str(bedroom_slope))

Intercept: 109473.177623
Slope: 127588.952934


# Test your Linear Regression Algorithm

Now we have two models for predicting the price of a house. How do we know which one is better? Calculate the RSS on the TEST data (remember this data wasn't involved in learning the model). Compute the RSS from predicting prices using bedrooms and from predicting prices using squarefeet.


#### Quiz Question: Which model (square feet or bedrooms) has lowest RSS on TEST data? Think about why this might be the case 

In [83]:
# Score both models on the test split.
# Note: rss_prices_on_sqft is reassigned here and now holds the TEST RSS.
test_price = test_data['price']
rss_prices_on_bedrooms = get_residual_sum_of_squares(
    test_data['bedrooms'], test_price, bedroom_intercept, bedroom_slope
)
rss_prices_on_sqft = get_residual_sum_of_squares(
    test_data['sqft_living'], test_price, sqft_intercept, sqft_slope
)

# Printed in the order: square-feet model, then bedrooms model.
print(rss_prices_on_sqft)
print(rss_prices_on_bedrooms)

2.75402933618e+14
4.9336458596e+14


In [87]:
# Scratch calculation. NOTE(review): looks like a square-metre -> square-foot
# conversion (1 sq ft is about 0.092903 m^2) - confirm what 280.76 refers to.
280.76/0.092903 

3022.076789769975