In [205]:
# import libraries and data
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [206]:
training_df = pd.read_csv('train_trimmed.csv',index_col=0)
testing_df = pd.read_csv('test_trimmed.csv', index_col=0)

In [207]:
label = np.log(training_df['SalePrice']) # log price -- convention 
training_features = training_df.drop(['SalePrice'], axis = 1)

In [208]:
# correlation Part
# observe there is quite strong correlation between variables 
# so we apply PCA to handle this 
# However we will apply PCA after one-hot encoding
correlation_matrix = training_df.corr().abs()
s = correlation_matrix.unstack()
so = s.sort_values(kind="quicksort")
so = so[:2756]
print (training_df.shape)
print (so[2700:])

(1456, 78)
GarageYrBlt   YearRemodAdd    0.615560
YearRemodAdd  GarageYrBlt     0.615560
SalePrice     1stFlrSF        0.625235
1stFlrSF      SalePrice       0.625235
KitchenQual   YearRemodAdd    0.625260
YearRemodAdd  KitchenQual     0.625260
BsmtQual      OverallQual     0.625421
OverallQual   BsmtQual        0.625421
GrLivArea     FullBath        0.635161
FullBath      GrLivArea       0.635161
BsmtQual      BsmtCond        0.635769
BsmtCond      BsmtQual        0.635769
SalePrice     GarageArea      0.636964
GarageArea    SalePrice       0.636964
SalePrice     TotalBsmtSF     0.646584
TotalBsmtSF   SalePrice       0.646584
GarageCars    SalePrice       0.649256
SalePrice     GarageCars      0.649256
BsmtFinSF1    BsmtFullBath    0.661933
BsmtFullBath  BsmtFinSF1      0.661933
KitchenQual   SalePrice       0.666217
SalePrice     KitchenQual     0.666217
KitchenQual   OverallQual     0.667869
OverallQual   KitchenQual     0.667869
BedroomAbvGr  TotRmsAbvGrd    0.679346
TotRmsAbvGrd  

In [209]:
# split the dataset into training and validation
valid_df = training_features.iloc[:100]
training_df = training_features.iloc[100:]
valid_label = label.iloc[:100]
training_label = label.iloc[100:]

In [210]:
# categorical variables --> one hot encoding

# find out which columns is categorical
cat_variables = []
num_variables = []
for name in training_df.columns:
    # print (training_features[name].dtype.name)
    if training_df[name].dtype.name == 'object':
        cat_variables.append(name)
    else:
        num_variables.append(name)
print (cat_variables)

cat_dict = training_df[ cat_variables ].to_dict( orient = 'records' )
cat_dict_valid = valid_df[ cat_variables ].to_dict( orient = 'records' )
cat_dict_test = testing_df[cat_variables].to_dict(orient = 'records')

training_numerical = training_df[ num_variables ].as_matrix()
valid_numerical = valid_df[ num_variables].as_matrix()
testing_numerical = testing_df[num_variables]
#max_train = np.amax( training_numerical, 0 )

#x_num_train = training_numerical / max_train
#x_num_valid = valid_numerical/ max_train
#x_num_test = testing_numerical / max_train		# scale test by max_train
from sklearn.feature_extraction import DictVectorizer as DV
# vectorize

vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform( cat_dict )
vec_x_cat_test = vectorizer.transform( cat_dict_test)
vec_x_cat_valid = vectorizer.transform (cat_dict_valid)
training_complete = np.hstack((training_numerical,vec_x_cat_train))
valid_complete = np.hstack((valid_numerical,vec_x_cat_valid))
testing_complete = np.hstack((testing_numerical,vec_x_cat_test))

print (training_complete.shape)
print (valid_complete.shape)
print (testing_complete.shape)
# # now we have one-hot encoding and normalized numerical variable 
# # next step we apply BernoulliRBM 
# from sklearn.neural_network import BernoulliRBM
# RBM = BernoulliRBM(n_components=80)
# training_complete_RBM_transform = RBM.fit_transform(training_complete)
# testing_complete_RBM_transform = RBM.fit_transform(test_complete)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType', 'PavedDrive', 'MiscFeature', 'SaleType', 'SaleCondition']
(1356, 228)
(100, 228)
(1459, 228)


In [211]:
# Apply PCA here
pca = PCA(n_components=178) 
training_complete_1 = pca.fit_transform(training_complete)
valid_complete_1 = pca.transform(valid_complete)
testing_complete_1 = pca.transform(testing_complete)
print (training_complete_1.shape)
print (valid_complete_1.shape)
print (testing_complete_1.shape)
print ('Explained variation per principal component (PCA): {}'.format(np.sum(pca.explained_variance_ratio_)))

(1356, 178)
(100, 178)
(1459, 178)
Explained variation per principal component (PCA): 0.999999999862411


In [216]:
error = []
# Linear Regression -- used as benchmark
linear_regression = LinearRegression()
linear_regression.fit(training_complete_1, training_label)
pred_labels = linear_regression.predict(testing_complete_1)
pred_valid = linear_regression.predict(valid_complete_1)
pred_train = linear_regression.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))
d = {'id': testing_numerical.index.values, 'SalePrice':np.exp(pred_labels) }
submission = pd.DataFrame(data = d)
submission.set_index(keys = 'id', inplace = True)
submission.to_csv('submission.csv')

0.009002728817546235
0.015253004720264824


In [213]:
# still linear model - LASSO regression
linear_regression_lasso = Lasso(alpha = 0.001, max_iter=50000)

linear_regression_lasso.fit(training_complete_1, training_label)
pred_labels_1 = linear_regression_lasso.predict(testing_complete_1)
pred_valid = linear_regression_lasso.predict(valid_complete_1)
pred_train = linear_regression_lasso.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))

0.011312655615282923
0.01788820864207479


In [214]:
# still linear model - RIDGE regression
linear_regression_ridge = Ridge(alpha = 0.39, max_iter=50000)
linear_regression_ridge.fit(training_complete_1, training_label)
pred_labels_2 = linear_regression_ridge.predict(testing_complete_1)
pred_valid = linear_regression_ridge.predict(valid_complete_1)
pred_train = linear_regression_ridge.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))

0.009011160430043609
0.0151436415424942


In [215]:
# still linear model - ElasticNet regression
linear_regression_ElasticNet = ElasticNet(alpha = 0.0006, max_iter=50000, l1_ratio = 0.53)

linear_regression_ElasticNet.fit(training_complete_1, training_label)
pred_labels_3 = linear_regression_ElasticNet.predict(testing_complete_1)
pred_valid = linear_regression_ElasticNet.predict(valid_complete_1)
pred_train = linear_regression_ElasticNet.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))

0.009849219431337014
0.01627253882371876


In [217]:
print (error[-1], error[-2])

IndexError: list index out of range

In [218]:
# adding up three results
results = (pred_labels_1 + pred_labels_2 + pred_labels_3)/3
d = {'id': testing_numerical.index.values, 'SalePrice':np.exp(results) }
submission = pd.DataFrame(data = d)
submission.set_index(keys = 'id', inplace = True)
submission.to_csv('submission.csv')

In [126]:
# Gradient Boosting 
# Cross validation (n_estimators)
from sklearn.ensemble import GradientBoostingRegressor
regre = GradientBoostingRegressor(learning_rate=0.05,
                                 max_depth=6, n_estimators = 100)
regre.fit(training_complete, training_label)
pred_labels = regre.predict(testing_complete)
pred_valid = regre.predict(valid_complete)
pred_train = regre.predict(training_complete)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid, valid_label))

40916972.99833862
582864629.9843149


In [None]:
# Binarize the categorical variables left 
# create summarized variables 
# Tree models 