In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Explicit column dtypes handed to pandas.read_csv, grouped by target type.
dtype_dict = {
    # columns parsed as float
    'bathrooms': float,
    'sqft_living15': float,
    'price': float,
    'bedrooms': float,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'lat': float,
    # columns parsed as int
    'waterfront': int,
    'sqft_above': int,
    'grade': int,
    'yr_renovated': int,
    'condition': int,
    'sqft_basement': int,
    'yr_built': int,
    'sqft_lot': int,
    'view': int,
    # columns kept as strings
    'zipcode': str,
    'floors': str,
    'date': str,
    'id': str,
}

# Both CSV files are read with the same dtype mapping.
train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [50]:
def simple_linear_regression(input_col, label):
    """Fit ``label ~ intercept + slope * input_col`` by closed-form least squares.

    Parameters
    ----------
    input_col : array-like of numbers
        Feature values (pandas Series, NumPy array, list, ...).
    label : array-like of numbers
        Target values; must hold as many elements as ``input_col``.

    Returns
    -------
    dict
        ``{'intercept': <float>, 'slope': <float>}``.

    Raises
    ------
    ValueError
        If ``input_col`` and ``label`` do not contain the same number of
        elements.

    Notes
    -----
    The slope is computed from mean-centred sums,
    ``sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)``.
    This is algebraically the same as
    ``(E[xy] - E[x]E[y]) / (E[x^2] - E[x]^2)`` but does not subtract two
    large, nearly equal numbers when the data has a large mean.
    A constant or empty ``input_col`` gives a ``nan`` slope (0 / 0).
    """
    # np.asarray works on positions, so a pandas index is irrelevant here.
    x = np.asarray(input_col, dtype=float).ravel()
    y = np.asarray(label, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError(
            'input_col and label must have the same number of elements '
            '(got %d and %d)' % (x.size, y.size))

    x_mean = np.mean(x)
    y_mean = np.mean(y)
    x_centered = x - x_mean

    slope = np.sum(x_centered * (y - y_mean)) / np.sum(x_centered * x_centered)
    intercept = y_mean - slope * x_mean
    return {'intercept': intercept, 'slope': slope}

In [51]:
# Fit price against sqft_living on the training set.
model_sqft = simple_linear_regression(train['sqft_living'], train['price'])
# Call-style print with a single argument behaves the same on Python 2 and 3.
print(str(model_sqft['intercept']) + ' ' + str(model_sqft['slope']))

-47116.0790729 281.95883963


In [52]:
def get_regression_prediction(input_col, intercept, slope):
    """Evaluate the line ``intercept + slope * x`` for every input value.

    Parameters
    ----------
    input_col : array-like of numbers
        Feature values (pandas Series, NumPy array, list, ...). Values are
        taken in positional order, so a pandas Series with a non-default
        index is handled correctly.
    intercept : float
        Intercept of the fitted line.
    slope : float
        Slope of the fitted line.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 1)`` holding one prediction per input.
    """
    # np.asarray drops any pandas index; ``input_col[i]`` would be a label
    # lookup on a Series and fail (or pick other rows) after filtering.
    x = np.asarray(input_col, dtype=float).ravel()
    return (intercept + slope * x).reshape(x.size, 1)

## Q1: Predicted price for a house with sqft_living = 2650

In [53]:
# Call-style print with a single argument behaves the same on Python 2 and 3.
print(str(get_regression_prediction([2650], **model_sqft)))

[[ 700074.84594751]]


## Q2: RSS of the sqft_living model on the training data

In [59]:
def get_residual_sum_of_squares(input_col, output, intercept, slope):
    """Residual sum of squares of the line ``intercept + slope * x``.

    Parameters
    ----------
    input_col : array-like of numbers
        Feature values (pandas Series, NumPy array, list, ...).
    output : array-like of numbers
        Observed targets; must hold as many elements as ``input_col``.
    intercept : float
        Intercept of the fitted line.
    slope : float
        Slope of the fitted line.

    Returns
    -------
    numpy.ndarray or float
        For non-empty input, a float array of shape ``(1, 1)`` holding
        ``sum((prediction - output) ** 2)``. The (1, 1) shape is kept so
        existing callers and printed results stay unchanged. For empty
        input the plain float ``0.0`` is returned.

    Raises
    ------
    ValueError
        If ``input_col`` and ``output`` do not contain the same number of
        elements.
    """
    # Work on positions: ``series[i]`` is a label lookup in pandas and breaks
    # (or silently misaligns) when the index is not 0..n-1.
    x = np.asarray(input_col, dtype=float).ravel()
    y = np.asarray(output, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError(
            'input_col and output must have the same number of elements '
            '(got %d and %d)' % (x.size, y.size))
    if x.size == 0:
        return 0.0

    # The prediction is computed inline so the whole sum is one vectorised pass.
    residuals = (intercept + slope * x) - y
    return np.array([[np.sum(residuals * residuals)]])

In [60]:
# Call-style print with a single argument behaves the same on Python 2 and 3.
print(str(get_residual_sum_of_squares(train['sqft_living'], train['price'], **model_sqft)))

[[  1.20191835e+15]]


## Q3: Estimated sqft_living for a house priced at 800,000

In [61]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: return the x for which the line yields ``output``.

    Computes ``(output - intercept) / slope`` for every element.

    Parameters
    ----------
    output : array-like of numbers
        Target values (pandas Series, NumPy array, list, ...). Values are
        taken in positional order, so a pandas Series with a non-default
        index is handled correctly.
    intercept : float
        Intercept of the fitted line.
    slope : float
        Slope of the fitted line. A zero slope follows NumPy division
        semantics and produces ``inf``/``nan`` entries.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 1)`` with one estimated input per output.
    """
    # np.asarray drops any pandas index; ``output[i]`` would be a label
    # lookup on a Series rather than a positional one.
    y = np.asarray(output, dtype=float).ravel()
    return ((y - intercept) / slope).reshape(y.size, 1)

In [62]:
# Call-style print with a single argument behaves the same on Python 2 and 3.
print(str(inverse_regression_predictions([800000.0], **model_sqft)))

[[ 3004.39624515]]


## Q4: Test-set RSS of the sqft_living model vs. the bedrooms model

In [63]:
# Fit price against bedrooms on the training set, then compare both
# single-feature models by their RSS on the test set.
model_br = simple_linear_regression(train['bedrooms'], train['price'])
# Call-style print with a single argument behaves the same on Python 2 and 3.
print('RSS of sqft model = ' + str(get_residual_sum_of_squares(test['sqft_living'], test['price'], **model_sqft)))
print('RSS of br model = ' + str(get_residual_sum_of_squares(test['bedrooms'], test['price'], **model_br)))

RSS of sqft model = [[  2.75402934e+14]]
RSS of br model = [[  4.93364586e+14]]
