In [6]:
# import libraries and data
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

In [40]:
training_df = pd.read_csv('train_trimmed.csv',index_col=0)
testing_df = pd.read_csv('test_trimmed.csv', index_col=0)

In [41]:
label = np.log(training_df['SalePrice']) # log price -- convention 
training_features = training_df.drop(['SalePrice'], axis = 1)

In [51]:
# correlation Part
# observe there is quite strong correlation between variables 
# so we apply PCA to handle this 
# However we will apply PCA after one-hot encoding
correlation_matrix = training_df.corr().abs()
s = correlation_matrix.unstack()
so = s.sort_values(kind="quicksort")
so = so[:3192]
print (training_df.shape)
print (so[3150:])

(1456, 78)
TotalBsmtSF   SalePrice       0.646584
SalePrice     TotalBsmtSF     0.646584
              GarageCars      0.649256
GarageCars    SalePrice       0.649256
BsmtFullBath  BsmtFinSF1      0.661933
BsmtFinSF1    BsmtFullBath    0.661933
KitchenQual   SalePrice       0.666217
SalePrice     KitchenQual     0.666217
OverallQual   KitchenQual     0.667869
KitchenQual   OverallQual     0.667869
BedroomAbvGr  TotRmsAbvGrd    0.679346
TotRmsAbvGrd  BedroomAbvGr    0.679346
GrLivArea     2ndFlrSF        0.687430
2ndFlrSF      GrLivArea       0.687430
SalePrice     ExterQual       0.694628
ExterQual     SalePrice       0.694628
              KitchenQual     0.712513
KitchenQual   ExterQual       0.712513
GrLivArea     SalePrice       0.720516
SalePrice     GrLivArea       0.720516
BsmtFinType1  BsmtFinSF1      0.721871
BsmtFinSF1    BsmtFinType1    0.721871
OverallQual   ExterQual       0.721973
ExterQual     OverallQual     0.721973
YearBuilt     GarageYrBlt     0.776578
GarageYrBlt   

In [43]:
so.shape

(3249,)

In [10]:
# split the dataset into training and validation
valid_df = training_features.iloc[:100]
training_df = training_features.iloc[100:]
valid_label = label.iloc[:100]
training_label = label.iloc[100:]

In [11]:
# categorical variables --> one hot encoding

# find out which columns is categorical
cat_variables = []
num_variables = []
for name in training_df.columns:
    # print (training_features[name].dtype.name)
    if training_df[name].dtype.name == 'object':
        cat_variables.append(name)
    else:
        num_variables.append(name)
print (cat_variables)

cat_dict = training_df[ cat_variables ].to_dict( orient = 'records' )
cat_dict_valid = valid_df[ cat_variables ].to_dict( orient = 'records' )
cat_dict_test = testing_df[cat_variables].to_dict(orient = 'records')

training_numerical = training_df[ num_variables ].as_matrix()
valid_numerical = valid_df[ num_variables].as_matrix()
testing_numerical = testing_df[num_variables]
#max_train = np.amax( training_numerical, 0 )

#x_num_train = training_numerical / max_train
#x_num_valid = valid_numerical/ max_train
#x_num_test = testing_numerical / max_train		# scale test by max_train
from sklearn.feature_extraction import DictVectorizer as DV
# vectorize

vectorizer = DV( sparse = False )
vec_x_cat_train = vectorizer.fit_transform( cat_dict )
vec_x_cat_test = vectorizer.transform( cat_dict_test)
vec_x_cat_valid = vectorizer.transform (cat_dict_valid)
training_complete = np.hstack((training_numerical,vec_x_cat_train))
valid_complete = np.hstack((valid_numerical,vec_x_cat_valid))
testing_complete = np.hstack((testing_numerical,vec_x_cat_test))

print (training_complete.shape)
print (valid_complete.shape)
print (testing_complete.shape)
# # now we have one-hot encoding and normalized numerical variable 
# # next step we apply BernoulliRBM 
# from sklearn.neural_network import BernoulliRBM
# RBM = BernoulliRBM(n_components=80)
# training_complete_RBM_transform = RBM.fit_transform(training_complete)
# testing_complete_RBM_transform = RBM.fit_transform(test_complete)

['MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType', 'PavedDrive', 'MiscFeature', 'SaleType', 'SaleCondition']
(1356, 213)
(100, 213)
(1459, 213)


In [12]:
# Apply PCA here
pca = PCA(n_components=178) 
training_complete_1 = pca.fit_transform(training_complete)
valid_complete_1 = pca.transform(valid_complete)
testing_complete_1 = pca.transform(testing_complete)
print (training_complete_1.shape)
print (valid_complete_1.shape)
print (testing_complete_1.shape)
print ('Explained variation per principal component (PCA): {}'.format(np.sum(pca.explained_variance_ratio_)))

(1356, 178)
(100, 178)
(1459, 178)
Explained variation per principal component (PCA): 0.9999999999518072


In [13]:
error = []
# Linear Regression -- used as benchmark
linear_regression = LinearRegression()
linear_regression.fit(training_complete_1, training_label)
pred_labels = linear_regression.predict(testing_complete_1)
pred_valid = linear_regression.predict(valid_complete_1)
pred_train = linear_regression.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))
d = {'id': testing_numerical.index.values, 'SalePrice':np.exp(pred_labels) }
submission = pd.DataFrame(data = d)
submission.set_index(keys = 'id', inplace = True)
submission.to_csv('submission.csv')

0.0090608381743799
0.015558549674764523




In [14]:
# still linear model - LASSO regression
linear_regression_lasso = Lasso(alpha = 0.001, max_iter=50000)

linear_regression_lasso.fit(training_complete_1, training_label)
pred_labels_1 = linear_regression_lasso.predict(testing_complete_1)
pred_valid_1 = linear_regression_lasso.predict(valid_complete_1)
pred_train = linear_regression_lasso.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid_1, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))

0.01124998857170423
0.017575346280409694


In [15]:
# still linear model - RIDGE regression
linear_regression_ridge = Ridge(alpha = 0.39, max_iter=50000)
linear_regression_ridge.fit(training_complete_1, training_label)
pred_labels_2 = linear_regression_ridge.predict(testing_complete_1)
pred_valid_2 = linear_regression_ridge.predict(valid_complete_1)
pred_train = linear_regression_ridge.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid_2, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))

0.009074532156673969
0.015259256549053716


In [16]:
# still linear model - ElasticNet regression
linear_regression_ElasticNet = ElasticNet(alpha = 0.0006, max_iter=50000, l1_ratio = 0.53)

linear_regression_ElasticNet.fit(training_complete_1, training_label)
pred_labels_3 = linear_regression_ElasticNet.predict(testing_complete_1)
pred_valid_3 = linear_regression_ElasticNet.predict(valid_complete_1)
pred_train = linear_regression_ElasticNet.predict(training_complete_1)
print (mean_squared_error(pred_train, training_label))
print (mean_squared_error(pred_valid_3, valid_label))
error.append(mean_squared_error(pred_valid, valid_label))

0.009923515812207379
0.01583180033888915


In [17]:
# adding up three results
results = (pred_labels_1 + pred_labels_2 + pred_labels_3)/3
valid_results = (pred_valid_1 + pred_valid_2 + pred_valid_3)/3
print (mean_squared_error(valid_results, valid_label))
d = {'id': testing_numerical.index.values, 'SalePrice':np.exp(results) }
submission = pd.DataFrame(data = d)
submission.set_index(keys = 'id', inplace = True)
submission.to_csv('submission.csv')

0.015159257703269043


In [18]:
# Gradient Boosting 
# Cross validation (n_estimators)

n_est = [130, 140, 150]
max_dep = [6, 7, 8, 9]

er = 1111
for n_e in n_est:
    for m_d in max_dep:
        regre = GradientBoostingRegressor(learning_rate=0.05,
                                 max_depth=6, n_estimators = 100)
        regre.fit(training_complete, training_label)
        pred_labels = regre.predict(testing_complete)
        pred_valid = regre.predict(valid_complete)
        pred_train = regre.predict(training_complete)
        print (mean_squared_error(pred_train, training_label))
        print (mean_squared_error(pred_valid, valid_label))
        if mean_squared_error(pred_valid, valid_label) < er: 
            best_n_e = n_e
            best_m_d = m_d
            result_XGB = pred_labels
            results_valid = pred_valid
            er = mean_squared_error(pred_valid, valid_label)

0.0013861750265252019
0.019954972002147117
0.0013861750265252019
0.020536626799532453
0.0013861750265252019
0.020365237586836442
0.0013861750265252019
0.02023325925327373
0.0013861750265252019
0.020649111651991216
0.0013861750265252019
0.020678320680378857
0.0013861750265252019
0.02058807460140826
0.0013861750265252019
0.020707597281343647
0.0013861750265252019
0.02076638728716542
0.0013861750265252019
0.020137360071141877
0.0013861750265252019
0.020595358663568257
0.0013861750265252019
0.02043057319672905


In [19]:
print (best_n_e, best_m_d)

130 6


In [37]:
valid_results_1 = (3* valid_results + results_valid)/4
print (mean_squared_error(valid_results_1, valid_label))
results_1 = (3 * results + result_XGB)/4
d = {'id': testing_numerical.index.values, 'SalePrice':np.exp(results_1) }
submission = pd.DataFrame(data = d)
submission.set_index(keys = 'id', inplace = True)
submission.to_csv('submission.csv')

0.014614020529309909


In [None]:
# Random Forest
n_est = [700, 800, 900, 1000, 1100, 1200, 1300]
ee = 111
for n_e in n_est:
    random_forest = RandomForestRegressor(n_estimators=1000, verbose=1)
    random_forest.fit(training_complete, training_label)
    pred_labels = random_forest.predict(testing_complete)
    pred_valid = random_forest.predict(valid_complete)
    pred_train = random_forest.predict(training_complete)
    print (mean_squared_error(pred_train, training_label))
    print (mean_squared_error(pred_valid, valid_label))
    if mean_squared_error(pred_valid, valid_label) < ee: 
            best_n_e_2 = n_e
            best_m_d_2 = m_d
            result_RF = pred_labels
            results_valid_RF = pred_valid
            ee = mean_squared_error(pred_valid, valid_label)

In [78]:
# valid_results_2 = (4*valid_results_1+results_valid_RF)/5
# print (mean_squared_error(valid_results_2, valid_label))


0.014871675175768189


In [None]:
# Binarize the categorical variables left 
# create summarized variables 
# Tree models 
