In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd # for a better table visualization only
from helper import display_data
from sklearn import linear_model
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
# Read the comma-separated housing file into a 2-D float array
# (the later cells label the columns Area, # of Bedroom, Price).
data = np.loadtxt(
    '../data/house_price.txt',
    delimiter=',',
)
# display_data comes from the local `helper` module; its output below
# shows only the leading rows.
display_data(data)

array([[  2.10400000e+03,   3.00000000e+00,   3.99900000e+05],
       [  1.60000000e+03,   3.00000000e+00,   3.29900000e+05],
       [  2.40000000e+03,   3.00000000e+00,   3.69000000e+05],
       [  1.41600000e+03,   2.00000000e+00,   2.32000000e+05],
       [  3.00000000e+03,   4.00000000e+00,   5.39900000e+05]])

In [3]:
# Wrap the raw array in a labelled DataFrame purely for a readable preview;
# the modelling cells keep working on the NumPy array.
df = pd.DataFrame(
    data=data,
    columns=['Area', '# of Bedroom', 'Price'],
)
df.head(n=5)

Unnamed: 0,Area,# of Bedroom,Price
0,2104.0,3.0,399900.0
1,1600.0,3.0,329900.0
2,2400.0,3.0,369000.0
3,1416.0,2.0,232000.0
4,3000.0,4.0,539900.0


In [4]:
# Feature matrix: the first two columns of `data` (every row).
X = data[:, :2]
display_data(X)

array([[  2.10400000e+03,   3.00000000e+00],
       [  1.60000000e+03,   3.00000000e+00],
       [  2.40000000e+03,   3.00000000e+00],
       [  1.41600000e+03,   2.00000000e+00],
       [  3.00000000e+03,   4.00000000e+00]])

In [5]:
def normalize(X):
    """Standardize every feature column of ``X`` to zero mean and unit scale.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Raw feature matrix, one row per sample and one column per feature.

    Returns
    -------
    X_norm : numpy.ndarray of shape (n_samples, n_features)
        ``(X - mu) / sigma`` evaluated column by column.
    mu : numpy.ndarray of shape (n_features,)
        Mean of each column.
    sigma : numpy.ndarray of shape (n_features,)
        Population standard deviation (``ddof=0``) of each column. Columns
        whose standard deviation is exactly 0 (constant features) report 1.0
        instead, so that ``(x - mu) / sigma`` stays finite both here and when
        the caller reuses ``mu``/``sigma`` to scale new samples.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(
            "normalize expects a 2-D array of shape (n_samples, n_features); "
            "got an array with %d dimension(s)" % X.ndim
        )

    # Column-wise statistics in one vectorized pass instead of a Python loop.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)

    # A constant column has zero spread; dividing by it would yield NaN.
    # Use a unit scale so such a column simply maps to all zeros.
    sigma[sigma == 0] = 1.0

    X_norm = (X - mu) / sigma
    return X_norm, mu, sigma

In [6]:
# Standardize the features; mu and sigma are kept so that new inputs can be
# scaled with the same statistics before prediction.
X_norm, mu, sigma = normalize(X)
display_data(X_norm)

array([[ 0.13141542, -0.22609337],
       [-0.5096407 , -0.22609337],
       [ 0.5079087 , -0.22609337],
       [-0.74367706, -1.5543919 ],
       [ 1.27107075,  1.10220517]])

In [7]:
# Target vector: the third column of `data` (labelled 'Price' in the DataFrame preview).
y = data[:,2]
display_data(y)

array([ 399900.,  329900.,  369000.,  232000.,  539900.])

In [8]:
# Baseline model: ordinary least squares on the raw, unscaled features.
# fit() returns the estimator itself, so construction and fitting chain.
lr1 = linear_model.LinearRegression().fit(X, y)
lr1.coef_

array([  139.21067402, -8738.01911233])

In [9]:
# One sample as a (1, 2) row matrix: area 1650, 3 bedrooms.
to_pred = np.array([[1650, 3]])

In [10]:
# Predicted price for the raw-feature sample, from the model fit on unscaled X.
lr1.predict(to_pred)

array([ 293081.4643349])

In [11]:
# Second model: same estimator, but trained on the hand-standardized features.
# The former `normalize=False` argument is omitted: that parameter was
# deprecated in scikit-learn 1.0 and removed in 1.2 (passing it now raises
# TypeError), and False was its default, so the fit is unchanged.
lr2 = linear_model.LinearRegression()
lr2.fit(X_norm, y)
lr2.coef_

array([ 109447.79646964,   -6578.35485416])

In [12]:
# Scale the same sample (area 1650, 3 bedrooms) with the training mu/sigma
# so it lives in the feature space lr2 was fit on; result has shape (1, 2).
area_scaled = (1650 - mu[0]) / sigma[0]
bedrooms_scaled = (3 - mu[1]) / sigma[1]
to_pred = np.array([[area_scaled, bedrooms_scaled]])

In [13]:
# Predicted price from the model fit on standardized features; the recorded
# output matches the lr1 prediction for the same house.
lr2.predict(to_pred)

array([ 293081.4643349])