In [11]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline 

# Part 1

In [12]:
# Explicit column dtypes handed to read_csv for both CSV files, grouped by
# type so the schema can be read at a glance.
dtype_dict = {
    # columns kept as strings
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # floating-point columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [13]:
def add_engineered_features(frame):
    """Add the four derived feature columns to ``frame`` in place.

    Columns added, in this order:
      * ``bedrooms_squared`` -- ``bedrooms * bedrooms``
      * ``bed_bath_rooms``   -- ``bedrooms * bathrooms``
      * ``log_sqft_living``  -- natural log of ``sqft_living``
      * ``lat_plus_long``    -- ``lat + long``

    Parameters
    ----------
    frame : pandas.DataFrame
        Must already contain the ``bedrooms``, ``bathrooms``,
        ``sqft_living``, ``lat`` and ``long`` columns.
    """
    # Arithmetic on Series already yields an index-aligned Series, so no
    # extra pd.Series(...) wrapper is needed before assigning the column.
    frame['bedrooms_squared'] = frame['bedrooms'] * frame['bedrooms']
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    frame['lat_plus_long'] = frame['lat'] + frame['long']


# Both splits go through the same helper so their derived columns cannot
# drift apart.
add_engineered_features(train)
add_engineered_features(test)


## Q1: Mean of each engineered feature on the test data

In [14]:
# Mean of every engineered feature over the test set.
# Single-argument, parenthesised print works on both Python 2 and Python 3.
# The label text is left exactly as it was so the recorded output still matches.
print('bedroom_squared = ' + str(np.mean(test['bedrooms_squared'])))
print('bed_bath_roomw = ' + str(np.mean(test['bed_bath_rooms'])))
print('log_sqft_living = ' + str(np.mean(test['log_sqft_living'])))
print('lat_plus_long = ' + str(np.mean(test['lat_plus_long'])))

bedroom_squared = 12.4466777016
bed_bath_roomw = 7.50390163159
log_sqft_living = 7.55027467965
lat_plus_long = -74.653333554


## Q2: Linear models on increasing feature sets (coefficients, training and testing MSE)

In [15]:
# Number of rows (observations) in the training frame.
len(train)

17384

In [16]:
# Model 1: ordinary least squares on the five base features.
# The feature list is defined once so fitting and both predictions are
# guaranteed to use the same columns in the same order.
model1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

regr = linear_model.LinearRegression()
model1 = regr.fit(train[model1_features], train['price'])
print('model 1 = ' + str(model1.coef_))

# mean_squared_error is documented as (y_true, y_pred); the value is the
# same either way for MSE, but the call now follows the documented order.
y_hat = model1.predict(train[model1_features])
print('training MSE = ' + str(mean_squared_error(train['price'], y_hat)))
y_hat = model1.predict(test[model1_features])
print('testing MSE = ' + str(mean_squared_error(test['price'], y_hat)))

model 1 = [  3.12258646e+02  -5.95865332e+04   1.57067421e+04   6.58619264e+05
  -3.09374351e+05]
training MSE = 55676481997.8
testing MSE = 53322409504.7


In [17]:
# Model 2: the five base features plus the bedrooms*bathrooms interaction.
# The feature list is defined once so fitting and both predictions are
# guaranteed to use the same columns in the same order.
model2_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
                   'bed_bath_rooms']

regr = linear_model.LinearRegression()
model2 = regr.fit(train[model2_features], train['price'])
print('model 2 = ' + str(model2.coef_))

# mean_squared_error is documented as (y_true, y_pred); the value is the
# same either way for MSE, but the call now follows the documented order.
y_hat = model2.predict(train[model2_features])
print('training MSE = ' + str(mean_squared_error(train['price'], y_hat)))
y_hat = model2.predict(test[model2_features])
print('testing MSE = ' + str(mean_squared_error(test['price'], y_hat)))

model 2 = [  3.06610053e+02  -1.13446368e+05  -7.14613083e+04   6.54844630e+05
  -2.94298969e+05   2.55796520e+04]
training MSE = 55132284576.3
testing MSE = 52820397960.9


In [18]:
# Model 3: the base features plus all four engineered features.
# The feature list is defined once so fitting and both predictions are
# guaranteed to use the same columns in the same order.
model3_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
                   'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living',
                   'lat_plus_long']

regr = linear_model.LinearRegression()
model3 = regr.fit(train[model3_features], train['price'])
print('model 3 = ' + str(model3.coef_))

# mean_squared_error is documented as (y_true, y_pred); the value is the
# same either way for MSE, but the call now follows the documented order.
y_hat = model3.predict(train[model3_features])
print('training MSE = ' + str(mean_squared_error(train['price'], y_hat)))
y_hat = model3.predict(test[model3_features])
print('testing MSE = ' + str(mean_squared_error(test['price'], y_hat)))

model 3 = [  5.29422820e+02   3.45142296e+04   6.70607813e+04   5.34085611e+05
  -4.06750711e+05  -8.57050439e+03  -6.78858667e+03  -5.61831484e+05
   1.27334900e+05]
training MSE = 51969423323.2
testing MSE = 61299673494.2


# Part 2

In [62]:
# Re-read the training CSV, replacing the `train` frame that Part 1
# extended with engineered columns.
train = pd.read_csv(
    'kc_house_train_data.csv',
    dtype=dtype_dict,
)

In [69]:
def get_numpy_data(data, features, output):
    """Build the design matrix and target vector for a regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the output column.
    features : list of str
        Names of the columns to use as features, in the desired order.
    output : str
        Name of the column to predict.

    Returns
    -------
    (data_np, output_np) : tuple of numpy.ndarray
        ``data_np`` is 2-D: its first column is all ones (the intercept
        term) followed by the requested feature columns.
        ``output_np`` is the 1-D array of values from the ``output`` column.

    Notes
    -----
    ``data`` is left untouched: the intercept column is built on the numpy
    side instead of being written back into the caller's frame.
    """
    # `.values` is used instead of the `as_matrix` method, which was
    # deprecated and later removed from pandas.
    feature_values = data[list(features)].values
    # Integer ones, so all-integer feature columns keep an integer matrix;
    # float features promote the whole matrix to float.
    intercept = np.ones(len(data), dtype=int)
    data_np = np.column_stack((intercept, feature_values))
    output_np = data[output].values
    return (data_np, output_np)

In [70]:
def predict_outcome(data_np, weights):
    """Return the linear-model predictions for every row of ``data_np``.

    The result is the dot product of the feature matrix ``data_np`` with
    the weight vector ``weights``.
    """
    return np.dot(data_np, weights)

In [130]:
def feature_derivative(errors, feature):
    """Return ``-2 * dot(errors, feature)``.

    With ``errors = output - predictions`` this is the partial derivative
    of the residual sum of squares with respect to the weight attached to
    ``feature``.
    """
    return -2.0 * np.dot(errors, feature)

In [131]:
def regression_gradient_descent(data_np, output_np, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Relies on ``predict_outcome`` and ``feature_derivative`` defined
    earlier in this notebook.

    Parameters
    ----------
    data_np : numpy.ndarray
        2-D feature matrix, one column per weight.
    output_np : numpy.ndarray
        1-D array of observed outputs.
    initial_weights : sequence of numbers
        Starting weights; copied, never modified.
    step_size : float
        Multiplier applied to each partial derivative when updating.
    tolerance : float
        Iteration stops once the gradient magnitude drops below this value.
    max_iterations : int, optional
        Upper bound on the number of passes. ``None`` (the default) keeps
        the original behaviour of iterating until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The weights after the final update.

    Raises
    ------
    FloatingPointError
        If the gradient becomes NaN, from which the iteration can never
        recover (previously this looped forever).
    """
    # Always work on a float copy: with integer initial weights numpy would
    # otherwise build an integer array and silently truncate every update.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    converged = False
    while not converged:
        predictions = predict_outcome(data_np, weights)
        errors = output_np - predictions
        gradient_sum_square = 0.0

        # `errors` was computed before this loop, so every weight is updated
        # from the same snapshot of the residuals.
        for i in range(len(weights)):
            derivative = feature_derivative(errors, data_np[:, i])
            gradient_sum_square = gradient_sum_square + derivative**2
            weights[i] = weights[i] - derivative * step_size

        if np.isnan(gradient_sum_square):
            raise FloatingPointError(
                'gradient descent diverged (NaN gradient); try a smaller step_size')

        iterations += 1
        gradient_mag = np.sqrt(gradient_sum_square)
        if gradient_mag < tolerance:
            converged = True
        elif max_iterations is not None and iterations >= max_iterations:
            break

    return weights

In [132]:
# Simple regression: price as a function of sqft_living only.
output = 'price'
features = ['sqft_living']

# Gradient-descent settings; the weights are ordered
# [constant, sqft_living] to match the columns get_numpy_data returns.
initial_weights = [-47000., 1.]
step_size, tolerance = 7e-12, 2.5e7

(data_np, output_np) = get_numpy_data(train, features, output)

## Simple model coefficients

In [134]:
# Fit the single-feature model and show the learned [constant, sqft_living] weights.
simple_weights = regression_gradient_descent(data_np, output_np, initial_weights, step_size, tolerance)
# Single-argument, parenthesised print works on both Python 2 and Python 3.
print(str(simple_weights))

[-46999.88716555    281.91211918]


## Predict via simple model

In [135]:
# Reload the test CSV and build its feature matrix with the same
# `features` / `output` settings used for training.
test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
test_data_np, test_output_np = get_numpy_data(test, features, output)

# Predicted price for the first house in the test set.
predicted_output_np = predict_outcome(test_data_np, simple_weights)
# Single-argument, parenthesised print works on both Python 2 and Python 3.
print(str(predicted_output_np[0]))

356134.443255


## RSS of simple model on test data

In [137]:
# Residual sum of squares of the simple model on the test data.
test_residuals = predicted_output_np - test_output_np
test_rss = np.sum(test_residuals**2)
# Single-argument, parenthesised print works on both Python 2 and Python 3.
print(test_rss)

2.75400044902e+14


## Multiple Regression Coefficients

In [140]:
# Two-feature regression: price from sqft_living and sqft_living15.
output = 'price'
features = ['sqft_living', 'sqft_living15']

# Gradient-descent settings; the weights are ordered
# [constant, sqft_living, sqft_living15].
initial_weights = np.array([-100000., 1., 1.])
step_size, tolerance = 4e-12, 1e9

(new_data_np, new_output_np) = get_numpy_data(train, features, output)

In [141]:
# Fit the two-feature model with the settings defined in the previous cell.
new_weights = regression_gradient_descent(
    new_data_np,
    new_output_np,
    initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

In [142]:
# Rebuild the test matrices for the current two-feature `features` list.
test_data_np, test_output_np = get_numpy_data(test, features, output)

# Predicted price for the first house in the test set.
predicted_output_np = predict_outcome(test_data_np, new_weights)
# Single-argument, parenthesised print works on both Python 2 and Python 3.
print(str(predicted_output_np[0]))

366651.411629


In [143]:
# Observed price of the first test house, for comparison with the predictions.
# Single-argument, parenthesised print works on both Python 2 and Python 3.
print('actual price = ' + str(test_output_np[0]))

actual price = 310000.0


## RSS of multiple regression model on test data

In [144]:
# Residual sum of squares of the two-feature model on the test data.
test_residuals = predicted_output_np - test_output_np
test_rss = np.sum(test_residuals**2)
# Single-argument, parenthesised print works on both Python 2 and Python 3.
print(test_rss)

2.7026344363e+14
