In [1]:
# Import third party libraries

# Numerical library
import numpy as np

# Machine learning library
from sklearn import linear_model
from sklearn import metrics

# Used for matrix inversion
from numpy.linalg import inv

# Plotting library
import matplotlib.pyplot as plt

# Allows for printing inline for jupyter notebook
%matplotlib inline 

In [26]:
# Load datasets and store in ndarray
training_data = open('C:\Users\whitlock\Downloads\housing_train.txt','r')
X_train = np.loadtxt(training_data)

testing_data = open('C:\Users\whitlock\Downloads\housing_test.txt', 'r')
X_test = np.loadtxt(testing_data)

In [27]:
# Split off known target values
y_train = X_train[:,13]
y_test = X_test[:,13]

# Transpose row vector to columnar
y_train = y_train[np.newaxis].T
y_test = y_test[np.newaxis].T

In [28]:
# Remove column 13 from X
X_train = np.delete(X_train, 13, axis=1)
X_test = np.delete(X_test, 13, axis=1)

# Function to create array of dummy ones and returned 
# columnar vector
def make_dummy_vector(target):
    temp = np.ones(len(target))
    return temp[np.newaxis].T

# Create dummy 1 values
dummy_train = make_dummy_vector(X_train)
dummy_test = make_dummy_vector(X_test)

# Add dummy data to feature matrices
X_train = np.concatenate((dummy_train, X_train), axis=1)
X_test = np.concatenate((dummy_test, X_test), axis=1)

# Transpose X for further calculations
#X_train = X_train.T
#X_test = X_test.T

In [30]:
## PART 2
# Compute optimal weight vector w -- (X^T * X)^-1 (X^T * Y)
def calc_w_vector(X, y):
    return np.dot(inv(np.dot(X.T,X)), np.dot(X.T,y))

def alt_calc(X,y):
    return np.dot(np.dot(inv(X), inv(X.T), np.dot(X.T,y)))
    
# Limit printout to 3 decimal places
np.set_printoptions(precision=3)

# Caculate w vectors
w_train = calc_w_vector(X_train,y_train)
w_test = calc_w_vector(X_test,y_test)

# Print both weight vectors to console
print 'w_train vector:'
print('\n'.join('{}: {}'.format(*k) for k in enumerate(w_train)))

print ' \r\nw_test vector:'
print('\n'.join('{}: {}'.format(*k) for k in enumerate(w_test)))

w_train vector:
0: [ 39.584]
1: [-0.101]
2: [ 0.046]
3: [-0.003]
4: [ 3.072]
5: [-17.225]
6: [ 3.711]
7: [ 0.007]
8: [-1.599]
9: [ 0.374]
10: [-0.016]
11: [-1.024]
12: [ 0.01]
13: [-0.586]
 
w_test vector:
0: [ 16.494]
1: [-0.03]
2: [ 0.01]
3: [-0.16]
4: [ 1.129]
5: [-6.583]
6: [ 4.438]
7: [-0.077]
8: [-0.845]
9: [-0.025]
10: [ 0.005]
11: [-0.7]
12: [ 0.01]
13: [-0.037]


In [43]:
print np.dot(np.subtract(y_train.T,np.dot(X_train,w_train).T),np.subtract(y_train,np.dot(X_train, w_train)))


[[ 9561.191]]


In [48]:
## PART 3
# Functions
def calc_sse(X, y, w):
    return np.dot(np.subtract(y, np.dot(X, w)).T, np.subtract(y,np.dot(X, w)))

def calc_mse(X, y, regr):
    return np.mean((regr.predict(X) - y) ** 2)

# Apply learned weight vectors
target_func_train = np.dot(X_train, w_train)
target_func_test = np.dot(X_test, w_test)

# Create linear regression object
#training_model = linear_model.LinearRegression()
#testing_model = linear_model.LinearRegression()

# Train the model using the training sets
#training_model.fit(X_train,target_func_train)
#testing_model.fit(X_test,target_func_test)

# Print error output, not sure about the 0 values

print 'Training Model: \r\nSSE: %.2f' % calc_sse(X_train, y_train, w_train)
#print 'MSE: %.2f \r\n' % calc_mse(X_train, target_func_train)

print 'Testing Model: \r\nSSE: %.2f' % calc_sse(X_test, y_test, w_test)
#print 'MSE: %.2f' % calc_mse(X_test, target_func_test)

#metrics.mean_squared_error(y_train,target_func_train)
#metrics.mean_squared_error(y_test,target_func_test)


Training Model: 
SSE: 9561.19
Testing Model: 
SSE: 852.51


In [49]:
## PART 4
# Repeating part 2 and 3 without a dummy features of 1's in X

# Remove dummy column from both tables
X_train_no_dummy = X_train[:, (1,2,3,4,5,6,7,8,9,10,11,12,13)]
X_test_no_dummy = X_test[:, (1,2,3,4,5,6,7,8,9,10,11,12,13)]

# Caculate w vectors
w_train_no_dummy = calc_w_vector(X_train_no_dummy,y_train)
w_test_no_dummy = calc_w_vector(X_test_no_dummy,y_test)

# Print both weight vectors to console
print 'w_train_no_dummy vector:'
print('\n'.join('{}: {}'.format(*k) for k in enumerate(w_train_no_dummy)))

print ' \r\nw_test_no_dummy vector:'
print('\n'.join('{}: {}'.format(*k) for k in enumerate(w_test_no_dummy)))

w_train_no_dummy vector:
0: [-0.098]
1: [ 0.049]
2: [-0.025]
3: [ 3.451]
4: [-0.355]
5: [ 5.817]
6: [-0.003]
7: [-1.021]
8: [ 0.227]
9: [-0.012]
10: [-0.388]
11: [ 0.017]
12: [-0.485]
 
w_test_no_dummy vector:
0: [ 0.011]
1: [ 0.01]
2: [-0.19]
3: [ 1.126]
4: [-1.137]
5: [ 5.801]
6: [-0.081]
7: [-0.649]
8: [-0.129]
9: [ 0.008]
10: [-0.572]
11: [ 0.011]
12: [ 0.072]


<h3>Thoughts about results</h3>
The above results make it seems like our model will be centered around the orgin beacuse we did not calcuate a true b value in the w vector.

In [51]:
## PART 4 cont.
# Apply learned weight vectors
target_func_train_no_dummy = np.dot(X_train_no_dummy, w_train_no_dummy)
target_func_test_no_dummy = np.dot(X_test_no_dummy, w_test_no_dummy)

# Create linear regression object
#training_model_no_dummy = linear_model.LinearRegression()
#testing_model_no_dummy = linear_model.LinearRegression()

# Train the model using the training sets
#training_model_no_dummy.fit(X_train_no_dummy,target_func_train_no_dummy)
#testing_model_no_dummy.fit(X_test_no_dummy,target_func_test_no_dummy)

# Print error output, not sure about the 0 values
print 'Training Model without Dummy: \r\nSSE: %.2f' % calc_sse(X_train_no_dummy, y_train, w_train_no_dummy)
#print 'MSE: %.2f \r\n' % calc_mse(X_train_no_dummy, target_func_train, training_model_no_dummy)

print 'Testing Model without dummy: \r\nSSE: %.2f' % calc_sse(X_test_no_dummy, y_test, w_test_no_dummy)
#print 'MSE: %.2f' % calc_mse(X_test_no_dummy, target_func_test_no_dummy, training_model_no_dummy)

Training Model without Dummy: 
SSE: 10598.06
Testing Model without dummy: 
SSE: 883.85


In [60]:
# Generate artificial random features
feature_one = np.random.uniform(0,10,433)
feature_two = np.random.uniform(0,100,433)
feature_three = np.random.uniform(0,200,433)
feature_four = np.random.uniform(0,400,433)
feature_five = np.random.uniform(0,600,433)
feature_six = np.random.uniform(0,800,433)
feature_seven = np.random.uniform(0,1000,433)
feature_eight = np.random.uniform(0,1200,433)
feature_nine = np.random.uniform(0,1400,433)
feature_ten = np.random.uniform(0,1600,433)


In [59]:
print feature_one

[ 40.405  97.378  53.28   37.881  78.661  95.919  19.879  76.221   6.594
  67.607  56.063  47.277   1.678  34.094  14.36   19.545  14.089  60.37
  87.986  83.655  90.341  37.575  50.653  30.14   33.247  61.553  30.901
   4.556  73.344  39.386  34.636   9.628  42.807  72.186  52.76   48.793
  28.463  71.113  15.677  23.036  91.2    19.465  40.929  41.804   2.536
  89.094  99.139  60.074  40.997  53.743  67.213  11.499  34.518  53.427
  93.652  64.953  69.658  90.974  22.39   82.13   15.963  81.942   6.271
  85.805  85.081   9.225  45.172  13.063  45.98   34.24   66.047  71.682
  94.824  31.622   3.224  57.93   53.668  22.952  74.273  12.312  75.107
  75.641  62.008  94.114  32.375  81.131  23.556   2.886  53.953  26.115
  11.347  36.322   4.691  12.712  58.334  11.55   77.611  46.335  41.488
  91.114  82.205  54.193  52.847  36.12   23.005  57.785   5.572  37.728
  86.849  47.532  32.147  19.071  89.515  95.553  60.944  77.001  71.579
  53.525   4.299   6.482  54.765  56.166  50.31   45

In [None]:
# Coefficients
print('Coefficients: \n' , regr.coef_)

# Mean squared error
print("Mean squared error: %.2f" 
      % np.mean((regr.predict(X_test) - y_test) ** 2))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))

In [None]:
print (regr.predict(X_test[:1,]) - y_test[:1,])

In [None]:
print X_train[:,2]

In [None]:
print X_train.shape
print w_train.shape

In [None]:
print X_train.T

In [None]:
# EXTRA MATERIAL BELOW

In [None]:
# Don't show scientific notation
np.set_printoptions(suppress=True)

print "Printing X_train:"
print X_train

In [None]:
print "Printing y_train:"
print y_train

In [None]:
# Plot feature 1: Crime rate by town
plt.scatter(X_train[:, 0],y_train)

In [None]:
# Plot feature 2: Residential land zoned for lots over 25,0000 sq. ft
plt.scatter(X_train[:, 1],y_train)

In [None]:
# Multiplotting feature 1 & 2
plt.scatter(X_train[:, 0],y_train)
plt.scatter(X_train[:, 1],y_train)

In [None]:
# Plot feature 3: Proportion of non-retail business acres per town
plt.scatter(X_train[:, 2],y_train)

In [None]:
# Plot feature 4: Charles River dummy variable (= 1 if tract bounds river, 0 otherwise)
plt.scatter(X_train[:, 3],y_train)

In [None]:
# Plot feature 5: Nitric oxides concentration (parts per 10 million)
plt.scatter(X_train[:, 4],y_train)

In [None]:
# Plot feature 6: Average number fo rooms per dwelling
plt.scatter(X_train[:, 5],y_train)

In [None]:
# Plot feature 7: Porportion of owner-occupied units built prior to 1940
plt.scatter(X_train[:, 6],y_train)

In [None]:
# Plot feature 8: Weighted distances to five Boston employment centers
plt.scatter(X_train[:, 7],y_train)

In [None]:
# Plot feature 9: Index of accessability to radial highways
plt.scatter(X_train[:, 8],y_train)

In [None]:
# Plot feature 10: Full-value property-tax rate per $10,000
plt.scatter(X_train[:, 9],y_train)

In [None]:
# Plot feature 11: Pupil-teacher ratio by town
plt.scatter(X_train[:, 10],y_train)

In [None]:
# Plot feature 12: 1000(Bk - 0.63)^2 where Bk is the population fo blacks by town
plt.scatter(X_train[:, 11],y_train)

In [None]:
# Plot feature 13: % lower status of the population
plt.scatter(X_train[:, 12],y_train)