In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column name -> dtype used when parsing the house-sales CSV files, so that
# every file is read with the same column types.
dtype_dict = dict(
    bathrooms=float,
    waterfront=int,
    sqft_above=int,
    sqft_living15=float,
    grade=int,
    yr_renovated=int,
    price=float,
    bedrooms=float,
    zipcode=str,
    long=float,
    sqft_lot15=float,
    sqft_living=float,
    floors=str,
    condition=int,
    lat=float,
    date=str,
    sqft_basement=int,
    yr_built=int,
    id=str,
    sqft_lot=int,
    view=int,
)

In [3]:
# Read the three CSV files with the shared dtype mapping; the results are
# bound, in order, to sales, train_data and test_data.
sales, train_data, test_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        'kc_house_data.csv',
        'kc_house_train_data.csv',
        'kc_house_test_data.csv',
    )
)

In [4]:
# Preview the first five rows of the full dataset.
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [7]:
# Compute the mean price two ways: by hand (sum / count) and with .mean().
price = sales['price']

num_houses = len(price)
sum_price = price.sum()

avg_price2 = price.mean()
avg_price1 = sum_price / num_houses

# print() applies str() to each argument, so the emitted text is unchanged.
print("avarage pricr via method 1: ", avg_price1, sep="")
print("avarage pricr via method 2: ", avg_price2, sep="")

avarage pricr via method 1: 540088.1417665294
avarage pricr via method 2: 540088.1417665294


In [10]:
# Element-wise arithmetic on the price column: halve it, square it, and
# report the sum of the squares.
half_price = price * 0.5

price_squared = price.mul(price)
sum_price_squared = price_squared.sum()

print("The sum of price squared is: ", sum_price_squared, sep="")


The sum of price squared is: 9217325138472070.0


## Build a generic Simple Linear Regression function

In [17]:
def simple_linear_regression(input_features,output):
    """Fit ``output ~ intercept + slope * input_features`` by least squares.

    Uses the closed-form solution for a single feature:

        slope     = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)
        intercept = mean(y) - slope * mean(x)

    Parameters
    ----------
    input_features : 1-D array-like of numbers
        Feature values (x). Lists, NumPy arrays and pandas Series are accepted;
        values are paired with ``output`` by position.
    output : 1-D array-like of numbers
        Target values (y), same length as ``input_features``.

    Returns
    -------
    (intercept, slope) : tuple of float

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in length, are empty, or if every
        feature value is identical (the slope is undefined in that case).
    """
    # Work in float64: with integer input, x**2 would be computed in the
    # integer dtype and can silently wrap around (e.g. int32 columns).
    x = np.asarray(input_features, dtype=float)
    y = np.asarray(output, dtype=float)

    if x.ndim != 1 or y.ndim != 1:
        raise ValueError("input_features and output must be one-dimensional")
    if x.shape != y.shape:
        raise ValueError("input_features and output must have the same length")
    if x.size == 0:
        raise ValueError("cannot fit a regression on empty input")

    x_mean = x.mean()
    y_mean = y.mean()

    # Centre the data before forming the sums. The algebraically equivalent
    # form sum(x*y) - sum(x)*sum(y)/n subtracts two huge, nearly equal numbers
    # and loses precision when values are large relative to their spread.
    x_centered = x - x_mean
    y_centered = y - y_mean

    sum_xx = (x_centered * x_centered).sum()
    if sum_xx == 0:
        raise ValueError("input_features has zero variance; slope is undefined")
    sum_xy = (x_centered * y_centered).sum()

    slope = sum_xy / sum_xx
    intercept = y_mean - (slope * x_mean)
    return (intercept, slope)

In [18]:
# Fit price against sqft_living using train_data.
sqft_intercept, sqft_slope = simple_linear_regression(
    input_features=train_data['sqft_living'].values,
    output=train_data['price'].values,
)

print("Intercept: ", sqft_intercept, sep="")
print("Slope: ", sqft_slope, sep="")

Intercept: -47116.07907289418
Slope: 281.9588396303426


In [19]:
def get_regression_predictions(input_features,slope,intercept):
    """Evaluate the fitted line ``intercept + slope * input_features``.

    Note the argument order: ``slope`` comes before ``intercept`` here.
    Whatever ``input_features`` is (a scalar or an array-like supporting
    arithmetic), the result of the arithmetic is returned as-is.
    """
    return intercept + (slope*input_features)

In [23]:
# Predict the price of a single house from its living area, using the
# coefficients currently held in sqft_slope / sqft_intercept.
my_house_sqft = 2650
estimated_price = get_regression_predictions(
    input_features=my_house_sqft,
    slope=sqft_slope,
    intercept=sqft_intercept,
)

print("The estimated price:$ ", estimated_price, " with ", my_house_sqft, "sqft", sep="")


The estimated price:$ 700074.8459475137 with 2650sqft


In [24]:
def get_residual_sum_of_squares(input_features,output,intercept,slope):
    """Return the residual sum of squares (RSS) of a fitted line.

    Predictions are ``intercept + slope * input_features``; the result is the
    sum of the squared differences between ``output`` and those predictions.
    The inputs must support element-wise arithmetic and ``.sum()``.
    """
    errors = output - (intercept + (slope*input_features))
    return (errors*errors).sum()

In [27]:
# RSS of the square-feet model, evaluated on train_data.
rss_prices_on_sqft = get_residual_sum_of_squares(
    input_features=train_data['sqft_living'],
    output=train_data['price'],
    intercept=sqft_intercept,
    slope=sqft_slope,
)
print("The RSS of predicting prices based on sqaure feet :", rss_prices_on_sqft, sep="")

The RSS of predicting prices based on sqaure feet :1201918354177283.0


In [29]:
# Fit price against the number of bedrooms using train_data.
# The coefficients get their own names: storing a bedrooms model in
# sqft_intercept / sqft_slope would silently replace the square-feet model,
# and any square-feet cell re-run afterwards would use the wrong coefficients.
bedrooms_intercept, bedrooms_slope = simple_linear_regression(train_data['bedrooms'].values, train_data['price'].values)

print ("Intercept: " + str(bedrooms_intercept))
print ("Slope: " + str(bedrooms_slope))

Intercept: 109473.1776229596
Slope: 127588.95293398784


In [32]:
# Fit the bedrooms model on train_data, then measure its RSS on test_data.
# The coefficients are kept in bedrooms_* names so the square-feet model in
# sqft_intercept / sqft_slope is not overwritten by a different model.
bedrooms_intercept, bedrooms_slope = simple_linear_regression(train_data['bedrooms'].values,
                                                              train_data['price'].values)

rss_prices_on_bedrooms = get_residual_sum_of_squares(test_data['bedrooms'].values,
                                                     test_data['price'].values,
                                                     bedrooms_intercept, bedrooms_slope)

print("The RSS of predicting prices based on bedrooms : "+ str(rss_prices_on_bedrooms))

The RSS of predicting prices based on bedrooms : 493364585960300.9


In [33]:
# Refit the square-feet model on train_data, then measure its RSS on test_data.
sqft_intercept, sqft_slope = simple_linear_regression(
    input_features=train_data['sqft_living'].values,
    output=train_data['price'].values,
)
rss_prices_on_sqft = get_residual_sum_of_squares(
    input_features=test_data['sqft_living'].values,
    output=test_data['price'].values,
    intercept=sqft_intercept,
    slope=sqft_slope,
)

print("The RSS of predicting prices based on square feet : ", rss_prices_on_sqft, sep="")

The RSS of predicting prices based on square feet : 275402933617812.12
