In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Read the King County house-sales training and test splits (in that order).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kc_house_train_data.csv/kc_house_train_data.csv',
        'kc_house_test_data.csv/kc_house_test_data.csv',
    )
)

In [3]:
# Inspect which columns the raw training data provides.
train_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

# process data

In [4]:
def add_features(data):
    """Return a copy of ``data`` with four engineered feature columns added.

    The input frame is left untouched: the new columns are attached to a new
    DataFrame, so calling this function never changes state that an earlier
    notebook cell has already displayed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'bedrooms', 'bathrooms', 'sqft_living',
        'lat' and 'long'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``data`` followed by
        'bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living' and
        'lat_plus_long' (in that order).
    """
    # NOTE: np.log gives -inf for a zero 'sqft_living' and NaN for negative
    # values; no clipping is applied here.
    return data.assign(
        bedrooms_squared=data['bedrooms'] ** 2,
        bed_bath_rooms=data['bedrooms'] * data['bathrooms'],
        log_sqft_living=np.log(data['sqft_living']),
        lat_plus_long=data['lat'] + data['long'],
    )

In [5]:
# Attach the engineered columns to both splits (training first, then test).
train_data, test_data = add_features(train_data), add_features(test_data)

In [6]:
# Check that the training frame now carries the columns added by add_features.
train_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms_squared',
       'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'],
      dtype='object')

In [7]:
# Check that the test frame now carries the columns added by add_features.
test_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms_squared',
       'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'],
      dtype='object')

In [8]:
# Mean of each engineered feature on the test split, rounded to two decimals.
print('%.2f %.2f %.2f %.2f' % tuple(
    test_data[column].mean()
    for column in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long')
))

12.45 7.50 7.55 -74.65


# fit three models

In [9]:
# Model 1: baseline linear regression on the five raw features.
x1 = train_data.loc[:, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']]
model1 = LinearRegression().fit(x1, train_data['price'])
print(model1.intercept_, model1.coef_)

-69075726.79256982 [ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05]


In [10]:
# Model 2: model 1's features plus the bedrooms x bathrooms interaction term.
x2 = train_data.loc[:, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']]
model2 = LinearRegression().fit(x2, train_data['price'])
print(model2.intercept_, model2.coef_)

-66867968.87107886 [ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
 -2.94298969e+05  2.55796520e+04]


In [11]:
# Model 3: model 2's features plus the remaining three engineered columns.
# NOTE: 'lat_plus_long' is exactly lat + long, so 'lat', 'long' and
# 'lat_plus_long' are linearly dependent; read their individual coefficients
# with care.
x3 = train_data.loc[:, [
    'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
    'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
]]
model3 = LinearRegression().fit(x3, train_data['price'])
print(model3.intercept_, model3.coef_)

-62036084.98609824 [ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
 -4.06750711e+05 -8.57050439e+03 -6.78858667e+03 -5.61831484e+05
  1.27334900e+05]


# compute residual sum of squares

In [12]:
def compute_RSS(model, input_features, output):
    """Return the residual sum of squares (RSS) of ``model`` on the given data.

    RSS is computed directly as ``sum((output - prediction) ** 2)``. Deriving
    it from ``(1 - R^2) * total_sum_of_squares`` instead breaks down when
    ``output`` is constant (the total sum of squares is zero, so the product
    cannot reflect the real prediction error) and adds an avoidable
    round-trip through a ratio.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(input_features)``.
    input_features : pandas.DataFrame or array-like
        Feature matrix passed straight to ``model.predict``.
    output : pandas.Series or array-like
        Observed target values, one per row of ``input_features``.

    Returns
    -------
    float
        Sum of squared differences between ``output`` and the predictions.
    """
    # Convert both sides to plain arrays so the subtraction is positional and
    # never re-aligned on a pandas index.
    residuals = np.asarray(output) - np.asarray(model.predict(input_features))
    return (residuals ** 2).sum()

In [13]:
# Residual sum of squares of each model on the training split.
rss1, rss2, rss3 = (
    compute_RSS(fitted_model, train_features, train_data['price'])
    for fitted_model, train_features in ((model1, x1), (model2, x2), (model3, x3))
)
print('%.2e %.2e %.2e' % (rss1, rss2, rss3))

9.68e+14 9.58e+14 9.03e+14


In [14]:
# Residual sum of squares of each model on the held-out test split.
# NOTE: this rebinds rss1..rss3, replacing the training values computed earlier.
rss1, rss2, rss3 = (
    compute_RSS(fitted_model, test_data[feature_names], test_data['price'])
    for fitted_model, feature_names in (
        (model1, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']),
        (model2, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']),
        (model3, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
                  'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']),
    )
)
print('%.2e  %.2e  %.2e' % (rss1, rss2, rss3))

2.26e+14  2.23e+14  2.59e+14
