## Predicting House Prices
- Lasso
- Ridge
- Random Forest
- Ensemble using linear regression

(KNN does not perform well on this dataset.)

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge as Ridge_Reg
from sklearn.linear_model import Lasso as Lasso_Reg
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler as Standardize
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
# read the data
x_train = np.loadtxt('data/x_train_cleaned.txt', delimiter=',')
x_test = np.loadtxt('data/x_test_cleaned.txt', delimiter=',')
y_train_log = np.loadtxt('data/y_train_log.txt', delimiter=',')

In [4]:
# due to the high correlation among features
# apply PCA to the features
pca = PCA()
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [5]:
# Standardize predictors for lasso and ridge
std = Standardize(with_mean=False)
x_train_std = std.fit_transform(x_train_pca)
x_test_std = std.transform(x_test_pca)

In [7]:
n = x_train_std.shape[0]

### Lasso regression
Tune regularization parameter (alpha)

In [8]:
## Lasso regression
# parameters
param = np.power(10.0, range(-4, 5, 1))
# tune lasso regression
model = Lasso_Reg()
grid_model = GridSearchCV(model, param_grid = {'alpha': param}, cv  = 5)
grid_model.fit(x_train_std, y_train_log)
# best model
lasso = grid_model.best_estimator_
print "The R-squared of the best lasso model is ", grid_model.best_score_
print "The best alpha is ", lasso.get_params()['alpha']

## Prediction
# training
y_pred_train_lasso = cross_val_predict(lasso, x_train_std, y_train_log, cv = 5)
rmse = np.sqrt(np.sum((y_pred_train_lasso - y_train_log)**2)/n)
print "The model yield of RMSLE of ", rmse
# testing
lasso.fit(x_train_std, y_train_log)
y_pred_test_lasso = lasso.predict(x_test_std)

The R-squared of the best lasso model is  0.872303079928
The best alpha is  0.001
The model yield of RMSLE of  0.142752098516


### Ridge regression
Tune regularization parameter (alpha)

In [9]:
## Ridge regression
# tune lasso regression using the same parameter list
model = Ridge_Reg()
grid_model = GridSearchCV(model, param_grid = {'alpha': param}, cv  = 5)
grid_model.fit(x_train_std, y_train_log)
# best model
ridge = grid_model.best_estimator_
print "The R-squared of the best ridge model is ", grid_model.best_score_
print "The best alpha is ", ridge.get_params()['alpha']

## Prediction
# training
y_pred_train_ridge = cross_val_predict(ridge, x_train_std, y_train_log, cv = 5)
rmse = np.sqrt(np.sum((y_pred_train_ridge - y_train_log)**2)/n)
print "The model yield of RMSLE of ", rmse
# testing
ridge.fit(x_train_std, y_train_log)
y_pred_test_ridge = ridge.predict(x_test_std)

The R-squared of the best ridge model is  0.855140293836
The best alpha is  100.0
The model yield of RMSLE of  0.152647477778


### Random Forest Regressor
Tune max_depth, n_estimators, min_samples_split and min_samples_leaf

In [10]:
## Random Forest Regressor
# parameters
max_depth = range(18, 23, 1)
n_estimators = range(25, 46, 2)
min_samples_split = range(2, 7, 1)
min_samples_leaf = range(1, 4, 1)

# tune RF regressor
model = RandomForestRegressor()
grid_model = GridSearchCV(model, param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators, 
                                               'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf},
                          cv  = 5)
grid_model.fit(x_train, y_train_log)
# best model
RF = grid_model.best_estimator_
print "The R-squared of the best RF model is ", grid_model.best_score_
print "The best parameters: "
RF.get_params()

The R-squared of the best RF model is  0.872730048928
The best parameters: 


{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 22,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 45,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
## Prediction
# training
y_pred_train_RF = cross_val_predict(RF, x_train, y_train_log, cv = 5)
rmse = np.sqrt(np.sum((y_pred_train_RF - y_train_log)**2)/n)
print "The model yield of RMSLE of ", rmse
# testing
RF.fit(x_train, y_train_log)
y_pred_test_RF = RF.predict(x_test)

The model yield of RMSLE of  0.143510279105


### Save the results

In [12]:
# store the results into dataframes
train_results = pd.DataFrame ({
        'lasso': y_pred_train_lasso,
        'ridge': y_pred_train_ridge,
        'RF': y_pred_train_RF
})

test_results = pd.DataFrame ({
        'lasso': y_pred_test_lasso,
        'ridge': y_pred_test_ridge,
        'RF': y_pred_test_RF
})

In [13]:
# save results in log-scale
train_results.to_csv('results/train_results_pca.csv', sep = ',', index=False)
test_results.to_csv('results/test_results_pca.csv', sep = ',', index=False)