# Importing

In [1]:
import pandas as pd

# Reading data

In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [8]:
train_data = pd.read_csv('kc_house_train_data.csv',dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [9]:
train_data.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [10]:
test_data.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780.0,12697.0
1,9297300055,20150124T000000,650000.0,4.0,3.0,2950.0,5000,2.0,0,3,...,9,1980,970,1979,0,98126,47.5714,-122.375,2140.0,4000.0
2,1202000200,20141103T000000,233000.0,3.0,2.0,1710.0,4697,1.5,0,0,...,6,1710,0,1941,0,98002,47.3048,-122.218,1030.0,4705.0
3,8562750320,20141110T000000,580500.0,3.0,2.5,2320.0,3980,2.0,0,0,...,8,2320,0,2003,0,98027,47.5391,-122.07,2580.0,3980.0
4,7589200193,20141110T000000,535000.0,3.0,1.0,1090.0,3000,1.5,0,0,...,8,1090,0,1929,0,98117,47.6889,-122.375,1570.0,5080.0


# Slope, intercept function (closed form)

$ Y = mX +b$

where

$ m = \frac{\sum{XY}-\frac{\sum{X}\sum{Y}}{N}}{\sum{(X^2)}-\frac{{(\sum{X})}^2}{N}} $

and

$ b = \bar{Y} - m \bar{X}$

or

$ b = \frac{\sum{Y}}{N}-m \frac{\sum{X}}{N} $

In [36]:
def simple_linear_regression(input_feature, output):
    N = len(input_feature)
    sum_y = sum(output)
    sum_x = sum(input_feature)
    sum_xx = sum(input_feature**2)
    sum_xy = sum(input_feature*output)
    slope = (sum_xy-(sum_y*sum_x/N))/(sum_xx-(sum_x*sum_x/N))
    intercept = (sum_y/N)-slope*(sum_x/N)
    return intercept, slope

# Finding the intercept and slope

In [79]:
input_feature = train_data['sqft_living']
output = train_data['price']

In [80]:
intercept,slope = simple_linear_regression(input_feature,output)
intercept,slope

(-47116.07907289418, 281.9588396303426)

# Regression prediction

In [53]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output=intercept+slope*input_feature
    return predicted_output

In [55]:
predicted_output = get_regression_predictions(input_feature, intercept, slope)
predicted_output

0        285595.351691
1        677518.138777
2        169992.227442
3        505523.246603
4        426574.771506
             ...      
17379    942559.448030
17380    322250.000843
17381    384280.945562
17382    404018.064336
17383    240481.937350
Name: sqft_living, Length: 17384, dtype: float64

# Quiz Question: 
Using your Slope and Intercept from (4), What is the predicted price for a house with 2650 sqft?





In [70]:
price = get_regression_predictions(2650, intercept, slope)
price 

700074.8459475137

# RSS

In [62]:
def get_residual_sum_of_squares(input_feature, output, intercept,slope):
    RSS = sum((output-(intercept+slope*input_feature))**2)
    return(RSS)

# Quiz question
According to this function and the slope and intercept from (4) What is the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data?

In [81]:
RSS_sqft_training = get_residual_sum_of_squares(input_feature, output, intercept,slope)
RSS_sqft_training

1201918354177286.2

# Inverse regression predictions

In [65]:
def inverse_regression_predictions(output, intercept, slope):
    estimated_input =  (output - intercept)/ slope
    return(estimated_input)

# Quiz Question
According to this function and the regression slope and intercept from (3) what is the estimated square-feet for a house costing $800,000?

In [76]:
area = inverse_regression_predictions(800000, intercept, slope)
area

3004.3962451522766

# Using bedroom as the input feature

In [85]:
input_feature_bedroom = train_data['bedrooms']


In [93]:
bedroom_slope, bedroom_intercept = simple_linear_regression(input_feature_bedroom,output)
bedroom_slope, bedroom_intercept

(109473.1776229596, 127588.95293398784)

# RSS from BOTH models on TEST data

In [90]:
output_test = test_data['price']
input_test_bedrooms = test_data ['bedrooms']
input_test_sqft = test_data ['sqft_living']
RSS_sqft_test = get_residual_sum_of_squares(input_test_sqft, output_test, intercept,slope)
RSS_bedrooms_test = get_residual_sum_of_squares(input_test_bedrooms, output_test, bedroom_intercept,bedroom_slope)
print('RSS of sqft model=',RSS_sqft_test)
print('RSS of bedroom model=',RSS_bedrooms_test)

RSS of sqft model= 275402933617813.1
RSS of bedroom model= 499664725951644.0


# Quiz Question
Which model (square feet or bedrooms) has lowest RSS on TEST data? Think about why this might be the case.


In [89]:
RSS_sqft_test < RSS_bedrooms_test

True