In [245]:
%matplotlib notebook
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import Imputer, StandardScaler
import numpy as np

import matplotlib.pyplot as plt
import pandas

In [246]:
training_data = pandas.read_csv('train.csv')

In [247]:
training_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Preprocessing

In [248]:
imputer = Imputer(strategy='median')

In [249]:
X = training_data[['LotFrontage', 'LotArea', "WoodDeckSF", 'PoolArea', 'OpenPorchSF', 'EnclosedPorch', 'GarageArea']]
#X = training_data[['LotArea', 'GarageArea']]
X = imputer.fit_transform(X)
X

array([[    65.,   8450.,      0., ...,     61.,      0.,    548.],
       [    80.,   9600.,    298., ...,      0.,      0.,    460.],
       [    68.,  11250.,      0., ...,     42.,      0.,    608.],
       ..., 
       [    66.,   9042.,      0., ...,     60.,      0.,    252.],
       [    68.,   9717.,    366., ...,      0.,    112.,    240.],
       [    75.,   9937.,    736., ...,     68.,      0.,    276.]])

In [250]:
y = training_data[['SalePrice']]
y.values

array([[208500],
       [181500],
       [223500],
       ..., 
       [266500],
       [142125],
       [147500]])

In [251]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [252]:
X_scaler = StandardScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
y_scaler = StandardScaler().fit(y_train)
y_train = y_scaler.transform(y_train)
regr = linear_model.Ridge(alpha=0)
for iterations in range(50):
    regr.fit(X_train, y_train)

X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)
y_pred = regr.predict(X_test)
print("RMSLE: %.5f" % np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(y_pred))))
print("R2: %.2f" % r2_score(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(y_pred)))

plt.scatter(X_test[:,0], y_test,color='black', marker='.', s=1)
plt.scatter(X_test[:,0], y_pred, color='blue', marker='o', s=10, facecolors='none')
plt.xticks()
plt.yticks()
plt.show()

RMSLE: 60256.99037
R2: 0.37


<IPython.core.display.Javascript object>

# Testset Data

In [253]:
# test = pandas.read_csv('test.csv')
# testing_X = test[['LotFrontage', 'LotArea', "WoodDeckSF", 'PoolArea', 'OpenPorchSF', 'EnclosedPorch', 'GarageArea', 'GarageCars']]
# testing_X = imputer.transform(X)
# testing_y = test[['SalePrice']]
# X_test = X_scaler.transform(testing_X)
# y_test = y_scaler.transform(testing_y)
# y_pred = regr.predict(testing_y)

In [254]:
plt.scatter(X_scaler.inverse_transform(X_test)[:,0], y_scaler.inverse_transform(y_test), color='black', marker='.', s=1)
plt.scatter(X_scaler.inverse_transform(X_test)[:,0], y_scaler.inverse_transform(y_pred), color='blue', marker='o', s=10, facecolors='none')
plt.xticks()
plt.yticks()
plt.show()