In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score, cross_val_predict
import sklearn.model_selection as ms
import warnings
import matplotlib.ticker as ticker
import seaborn as sns
warnings.filterwarnings('ignore')


## Stacked model testing

### XGBoost

In [None]:
def get_error(X_train, y_train, X_test, y_test, model, show = True):
    """Fit ``model`` on the training split and report its error on both splits.

    The error of a split is ``1 - model.score(features, target)``; what that
    means depends on the estimator's own ``score`` method.

    Parameters
    ----------
    X_train, y_train : training features and target, passed to ``model.fit``.
    X_test, y_test : hold-out features and target, only used for scoring.
    model : object exposing ``fit(X, y)`` and ``score(X, y)``; fitted in place.
    show : when true, print both errors with five decimals.

    Returns
    -------
    list
        ``[train_error, test_error]``.
    """
    model.fit(X_train, y_train)
    # Score the training split first, then the hold-out split.
    errors = [
        1 - model.score(features, target)
        for features, target in ((X_train, y_train), (X_test, y_test))
    ]
    if show:
        templates = ("The training error is: %.5f", "The test     error is: %.5f")
        for template, value in zip(templates, errors):
            print(template % value)
    return errors

In [5]:
# Engineered feature matrix plus the raw training file, which supplies SalePrice.
train_data_cleaned = pd.read_csv("data/cleaned_houseprice.csv")
HousePrices = pd.read_csv('data/train.csv')
#HousePrices_adj = pd.read_csv('./collab/data/SalePriceAdj.csv')[["Id","SalePrice_Adj"]]

X = train_data_cleaned.drop(['Id'], axis = 1)
y = np.log(HousePrices['SalePrice'])  # the models are trained on log price

# Exclude price outliers - old
# train_exclude = HousePrices.loc[[462,30,1432,812,142,1182,1270,3,691]]

train_exclude = HousePrices.loc[[313,335,249,706,451,1298,523,1182,691,496]]

# Drop the outlier rows with order-preserving boolean masks. Going through
# set(...) gives no ordering guarantee, and X and y are matched purely by
# position once they reach train_test_split.
X = X.loc[~X.index.isin(train_exclude.index)]
y = y.loc[~y.index.isin(train_exclude.index)]
assert X.index.equals(y.index), "features and target are no longer row-aligned"

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 1)

#### Hyperparameter Tuning

In [2]:
# Default-parameter XGBoost regressor; the tuning cells below mutate it via set_params.
xg_cl = xgb.XGBRegressor()

In [None]:
# Need to tune max_depth - 3 is the best
# Sweep max_depth over 1..9 on the shared regressor and record the train/test
# error (as returned by get_error) for each setting.
depth  = range(1, 10)
xg_cl_train, xg_cl_test = [], []
for max_depth in depth:
    xg_cl.set_params(max_depth = max_depth)
    train_err, test_err = get_error(X_train, y_train, X_test, y_test, xg_cl, show=False)
    xg_cl_train.append(train_err)
    xg_cl_test.append(test_err)

# One curve per split, drawn in the same order as before (training, then test).
curves = (
    (xg_cl_train, 'blue', 'training error'),
    (xg_cl_test, 'red', 'test error'),
)
for values, colour, name in curves:
    plt.plot(depth, values, c = colour, label=name)
plt.legend(loc=7)
plt.xlabel('depth')

In [None]:
# Need to tune n_estimator - 100
# The max_depth sweep leaves the shared regressor at the last depth it tried,
# so pin max_depth to the value chosen there (3) before sweeping the tree count.
# Otherwise this curve describes a different model from the one trained later.
xg_cl.set_params(max_depth = 3)

n_estimators_grid = range(50, 650, 50)
xg_cl_train = []
xg_cl_test = []
for n_trees in n_estimators_grid:
    xg_cl.set_params(n_estimators = n_trees)
    error = get_error(X_train, y_train, X_test, y_test, xg_cl, show=False)
    xg_cl_train.append(error[0])
    xg_cl_test.append(error[1])

plt.plot(n_estimators_grid, xg_cl_train, c = 'blue', label='training error')
plt.plot(n_estimators_grid, xg_cl_test, c = 'red', label='test error')
plt.legend(loc=7)
plt.xlabel('n_estimator')
plt.ylabel('error (1 - score)')

#### Running the model and inspecting results

In [None]:
# Outcomes
# Fit the tuned XGBoost model on the training split and report hold-out metrics.
# random_state is the scikit-learn wrapper's documented seed parameter; the
# `seed` spelling is its deprecated alias.
xg_cl = xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth= 3, random_state = 1, subsample = 0.9)
xg_cl.fit(X_train,y_train)
preds_train = xg_cl.predict(X_train)
preds_test = xg_cl.predict(X_test)

# RMSE is computed on the hold-out split, in log-price units.
RMSE = np.sqrt(mean_squared_error(y_test, preds_test))
print("RMSE:", RMSE)
print("R^2 Train:", xg_cl.score(X_train, y_train))
print("R^2 Test:", xg_cl.score(X_test, y_test))

# Predicted vs. actual log price; labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.scatter(preds_train, y_train, c="green", label="train")
ax.scatter(preds_test, y_test, c ="red", label="test")
ax.set_xlabel("Predicted log(SalePrice)")
ax.set_ylabel("Actual log(SalePrice)")
ax.set_title("XGB predictions vs. actuals")
ax.legend()

In [None]:
# 10 fold Cross validation
# cross_val_score gives the per-fold score of the estimator; cross_val_predict
# gives one out-of-fold prediction per row, used for the pooled RMSE below.
scores = cross_val_score(xg_cl, X, y, cv=10)
predictions = cross_val_predict(xg_cl, X, y, cv=10)

# Cross Validation results - XGB
# mean_squared_error takes (y_true, y_pred); the label is spelled RMSE.
print("RMSE of CV:", np.sqrt(mean_squared_error(y, predictions)))
print("R^2 of CV:", scores.mean())

In [None]:
# Feature importance
# Pair each training column with the fitted model's importance, then rank once.
feature_selection = pd.DataFrame({"feature":X_train.columns, "importance":xg_cl.feature_importances_})
ranked_features = feature_selection.sort_values("importance", ascending = False)

# pandas creates its own figure for the bar chart, so no separate plt.figure()
# call is needed (it would only render an empty canvas).
ranked_features.head(60).plot.bar(x='feature', y='importance', figsize = (20,5))

# Names of the 70 highest-ranked features.
important_features = ranked_features.head(70)["feature"]

#### Residuals

In [None]:
# price vs. residual plot - full train data set
# Residual = prediction - actual, both in log space, over every row of X.
full_log_preds = xg_cl.predict(X)
residual_full = full_log_preds - y
full_corr = np.corrcoef(y, residual_full)
print(full_corr[0,1])
plt.scatter(y, residual_full, color = "red")


In [None]:
# price vs. residual plot - full train data set actual
# Residual in dollars: exponentiate the log-space prediction and target first.
residual_full_act = np.exp(xg_cl.predict(X)) - np.exp(y)
print(np.corrcoef(np.exp(y), residual_full_act)[0,1])

# seaborn requires x/y as keyword arguments in current releases.
g = sns.scatterplot(x=np.exp(y), y=residual_full_act, color='green')
plt.xlabel('Actual Price')
plt.ylabel('XGB Residual (Predicted - Actual)')  # the y-axis holds residuals, not predictions
plt.title("Residual Plot Excluding Outliers - XGB")

# Format ticks as thousands of dollars with a formatter, not fixed label lists,
# so labels stay attached to the right tick positions if the ticks move.
thousands_of_dollars = ticker.FuncFormatter(
    lambda value, _pos: '${:,.0f}'.format(value / 1000) + 'K'
)
g.xaxis.set_major_formatter(thousands_of_dollars)
g.yaxis.set_major_formatter(thousands_of_dollars)
plt.savefig('XGB_Residual_Plot_Excluding_Outliers.pdf')

In [None]:
# price vs. residual plot - Train
# Training-split residuals (prediction - actual, log space) against log price.
residual_train = preds_train - y_train
train_corr = np.corrcoef(y_train, residual_train)[0,1]
print(train_corr)
plt.scatter(y_train, residual_train, color = "red")


In [None]:
# price vs. residual plot - Test
# Hold-out residuals (prediction - actual, log space) against log price.
residual_test = preds_test - y_test
test_corr = np.corrcoef(y_test, residual_test)[0,1]
print(test_corr)
plt.scatter(y_test, residual_test, color = "red")

In [None]:
# price vs. residual plot - Test in actual space
# Same hold-out residuals, converted from log price back to price.
actual_test_price = np.exp(y_test)
residual_test_act = np.exp(preds_test) - np.exp(y_test)
test_act_corr = np.corrcoef(actual_test_price, residual_test_act)[0,1]
print(test_act_corr)
plt.scatter(actual_test_price, residual_test_act, color = "red")

---

### Random Forest

In [9]:
# Create the single stacking feature for the random forest: the XGBoost
# log-price prediction for every row of X.
#
# NOTE(review): these are in-sample predictions (the model predicts the rows
# it was fitted on), so the meta-learner sees optimistic inputs. Out-of-fold
# predictions from cross_val_predict would avoid that; confirm intent before
# changing, since every downstream number would move.

# fit() returns the fitted estimator, so preds_stack is the model itself rather
# than a prediction array. A single fit suffices; refitting with the same
# parameters and data only repeats the work.
preds_stack = xg_cl.fit(X,y)
print(preds_stack.score(X, y))

preds_train_all_stack = xg_cl.predict(X)




0.9474900914662394


In [None]:
# Hold out 20% of the (XGB prediction, log price) pairs for the meta-learner.
X_train_stack, X_test_stack, y_train_stack, y_test_stack = train_test_split(
    preds_train_all_stack,
    y,
    test_size = 0.2,
    random_state = 0,
)


#### Tuning Parameters

In [7]:
# Default-parameter random forest regressor; later cells configure it via set_params.
randomForest = ensemble.RandomForestRegressor()

In [None]:
# Parameter grid for the stacked random forest: tree depth and number of trees.
grid_para_forest = {
    #'min_samples_leaf': range(1,15),
    'max_depth': range(1, 15),
    'n_estimators': range(10, 200, 10)
}

# Run the random forest regressor through a 5-fold grid search on the single
# stacking feature (the XGBoost predictions reshaped to one column).
# %time reports how long the search takes.
grid_search_forest = ms.GridSearchCV(randomForest, grid_para_forest, cv=5, n_jobs=-1)
%time grid_search_forest.fit(np.array(preds_train_all_stack).reshape(-1,1), y)


In [None]:
# Show the parameter combination the grid search selected.
grid_search_forest.best_params_

In [None]:
# Tuning max_depth - 6
# The forest is the meta-learner of the stack: every other cell fits it on the
# single XGB-prediction feature. Sweep its depth on that same stacked split,
# not on the raw feature matrix, so the curve describes the model actually used.
stack_train_2d = np.array(X_train_stack).reshape(-1,1)
stack_test_2d = np.array(X_test_stack).reshape(-1,1)

depth  = range(1, 30)
randomForest_train = []
randomForest_test = []
for i in depth:
    # Fixed seed (same value the training cells use) so the curve is reproducible.
    randomForest.set_params(max_depth = i, random_state = 42)
    error = get_error(stack_train_2d, y_train_stack, stack_test_2d, y_test_stack, randomForest, show=False)
    randomForest_train.append(error[0])
    randomForest_test.append(error[1])

plt.plot(depth, randomForest_train, c = 'blue', label='training error')
plt.plot(depth, randomForest_test, c = 'red', label='test error')
plt.legend(loc=7)
plt.xlabel('depth')


In [None]:
# Sweep min_samples_leaf over 1..49 for the stacked forest and record, per value:
# train/test RMSE (log space) and the summed absolute hold-out residual in both
# log space and actual price.
tuning_RSME_test = []
residual_actual = []
residual_log = []
tuning_RSME_train = []

# The one-column feature matrices do not change between iterations.
stack_train_2d = np.array(X_train_stack).reshape(-1,1)
stack_test_2d = np.array(X_test_stack).reshape(-1,1)

for n in range(1,50,1):
    randomForest.set_params(n_estimators=90, random_state=42, max_features=1, max_depth = 6, min_samples_leaf= n)
    randomForest.fit(stack_train_2d, y_train_stack)

    preds_train_stack = randomForest.predict(stack_train_2d)
    preds_test_stack = randomForest.predict(stack_test_2d)

    tuning_RSME_train.append(np.sqrt(mean_squared_error(y_train_stack, preds_train_stack)))
    tuning_RSME_test.append(np.sqrt(mean_squared_error(y_test_stack, preds_test_stack)))

    residual_test_stack = preds_test_stack - y_test_stack
    residual_log.append(np.abs(residual_test_stack).sum())

    # Reuse the hold-out predictions computed above instead of predicting again.
    residual_actual.append(np.abs(np.exp(preds_test_stack) - np.exp(y_test_stack)).sum())

In [None]:
# One row per min_samples_leaf value tried, with the metrics collected in the sweep.
leaf_size_report = pd.DataFrame({
    "leaf_size": range(1,50,1),
    "tuning_RSME_test": tuning_RSME_test,
    "tuning_RSME_train": tuning_RSME_train,
    "residual_actual": residual_actual,
    "residual_log": residual_log,
})
leaf_size_report.sort_values("leaf_size")

#### Train random forest model

In [None]:

# Fit the stacked forest with the chosen hyper-parameters and report its error
# (1 - score) and RMSE on the stacked train and hold-out splits.
stack_train_2d = np.array(X_train_stack).reshape(-1,1)
stack_test_2d = np.array(X_test_stack).reshape(-1,1)

randomForest.set_params(n_estimators=180, random_state=42, max_features=1, max_depth = 5, min_samples_leaf=6)
randomForest.fit(stack_train_2d, y_train_stack)

train_score = randomForest.score(stack_train_2d, y_train_stack)
test_score = randomForest.score(stack_test_2d, y_test_stack)
print("The training error of random forest is: %.5f" %(1 - train_score))
print("The test error of random forest is: %.5f" %(1 - test_score))

preds_train_stack = randomForest.predict(stack_train_2d)
RMSE_train_stack = np.sqrt(mean_squared_error(y_train_stack, preds_train_stack))
print("Train RMSE:", RMSE_train_stack)

preds_test_stack = randomForest.predict(stack_test_2d)
RMSE_test_stack = np.sqrt(mean_squared_error(y_test_stack, preds_test_stack))
print("Test RMSE:", RMSE_test_stack)


In [None]:
# 10 fold Cross validation
# cross_val_score / cross_val_predict are already imported in the notebook's
# import cell, so no second import is needed here.
stack_all_2d = np.array(preds_train_all_stack).reshape(-1,1)
scores_rf = cross_val_score(randomForest, stack_all_2d, y, cv=10)
predictions_rf = cross_val_predict(randomForest, stack_all_2d, y, cv=10)

# mean_squared_error takes (y_true, y_pred); the label is spelled RMSE.
print("RMSE of CV:", np.sqrt(mean_squared_error(y, predictions_rf)))
print("R^2 of CV:", scores_rf.mean())



#### Residual Checks - Random Forest

In [None]:
# price vs. residual plot - full train set
# Refit the stacked forest on every row, then plot its log-space residuals
# (prediction - actual) against log price.
stack_all_2d = np.array(preds_train_all_stack).reshape(-1,1)
randomForest.fit(stack_all_2d, y)
residual_stack = randomForest.predict(stack_all_2d) - y
stack_corr = np.corrcoef(y, residual_stack)[0,1]
print(stack_corr)
plt.scatter(y, residual_stack, color = "red")


In [None]:
# price vs. residual plot - full train set in actual space
# Same full-data fit, with predictions and targets converted back to price.
stack_all_2d = np.array(preds_train_all_stack).reshape(-1,1)
randomForest.fit(stack_all_2d, y)
stack_price_preds = np.exp(randomForest.predict(stack_all_2d))
residual_stack_act = stack_price_preds - np.exp(y)
stack_act_corr = np.corrcoef(np.exp(y), residual_stack_act)[0,1]
print(stack_act_corr)
plt.scatter(np.exp(y), residual_stack_act, color = "red")


In [None]:
# Recompute the hold-out residuals from the current hold-out predictions, so
# this cell does not silently reuse whatever an earlier loop last left in
# residual_test_stack.
residual_test_stack = preds_test_stack - y_test_stack

# Total absolute log-space residual, then the residuals from most negative up.
print(np.abs(residual_test_stack).sum())
print(residual_test_stack.sort_values())

residual_test_stack

In [None]:
# price vs. residual plot - Train Stack in log space
# Stacked-model residuals on its training split (prediction - actual).
residual_train_stack = preds_train_stack - y_train_stack
train_stack_corr = np.corrcoef(y_train_stack, residual_train_stack)[0,1]
print(train_stack_corr)
plt.scatter(y_train_stack, residual_train_stack, color = "red")

In [None]:
# price vs. residual plot - Train Stack in actual space
residual_train_stack_act = np.exp(preds_train_stack) - np.exp(y_train_stack)
# Correlate actual price with the actual-space residual that is plotted below,
# not with the log-space residual from another cell.
print(np.corrcoef(np.exp(y_train_stack), residual_train_stack_act)[0,1])
plt.scatter(np.exp(y_train_stack),residual_train_stack_act, color = "red")

In [None]:
# Column means of: absolute log-space residual, predicted price, actual price
# (training split of the stack).
train_stack_summary = pd.DataFrame({
    "residual": np.abs(preds_train_stack - y_train_stack),
    "preds": np.exp(preds_train_stack),
    "test": np.exp(y_train_stack),
})
train_stack_summary.mean()

In [None]:
# The 15 training rows with the largest signed log-space residual
# (prediction - actual), alongside predicted and actual price.
train_stack_residuals = pd.DataFrame({
    "residual": (preds_train_stack - y_train_stack),
    "preds": np.exp(preds_train_stack),
    "test": np.exp(y_train_stack),
})
train_stack_residuals.sort_values("residual", ascending = False).head(15)

In [None]:
# price vs. residual plot - Test Stack in log space
# Stacked-model residuals on its hold-out split (prediction - actual).
residual_test_stack = preds_test_stack - y_test_stack
test_stack_corr = np.corrcoef(y_test_stack, residual_test_stack)[0,1]
print(test_stack_corr)
plt.scatter(y_test_stack, residual_test_stack, color = "red")

In [None]:
# price vs. residual plot - Test Stack in actual price
residual_test_stack_act = np.exp(preds_test_stack) - np.exp(y_test_stack)
# Correlate actual price with the actual-space residual that is plotted below,
# not with the log-space residual from another cell.
print(np.corrcoef(np.exp(y_test_stack), residual_test_stack_act)[0,1])
plt.scatter(np.exp(y_test_stack), residual_test_stack_act, color = "red")

In [None]:
# Hold-out residual totals for the stacked model.
# Log space: total absolute residual, then the residuals sorted ascending.
print(np.abs(residual_test_stack).sum())
print(residual_test_stack.sort_values())

# Actual price: total absolute residual, then the per-row residuals as the
# cell's displayed value. (A bare sort_values() call whose result was thrown
# away has been dropped; it displayed nothing.)
print(np.abs(residual_test_stack_act).sum())
residual_test_stack_act

## Submission

In [10]:
# Load the engineered features of the test set. The Id column stays available
# on train_data_cleaned_test; test_submission holds everything except Id.
train_data_cleaned_test = pd.read_csv("data/cleaned_houseprice_test.csv")
test_submission = train_data_cleaned_test.drop(columns = ['Id'])
test_submission.head(5)

Unnamed: 0,LotArea,Street,Alley,LotShape,LandSlope,YearRemodAdd,RoofMatl,MasVnrType,TotalBsmtSF,Heating,...,Zone_RH,Zone_RM,Found_BrkTil,Found_CBlock,Found_Slab,Found_Stone,Found_Wood,Contour_Bnk,Contour_HLS,Contour_Low
0,11622,1,0.0,0,0,1961,1,0,882.0,1,...,1,0,0,1,0,0,0,0,0,0
1,14267,1,0.0,1,0,1958,1,1,1329.0,1,...,0,0,0,1,0,0,0,0,0,0
2,13830,1,0.0,1,0,1998,1,0,928.0,1,...,0,0,0,0,0,0,0,0,0,0
3,9978,1,0.0,1,0,1998,1,1,926.0,1,...,0,0,0,0,0,0,0,0,0,0
4,5005,1,0.0,1,0,1992,1,0,1280.0,1,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Rerun the model with full train data
# Stage 1: XGBoost on every training row, then predict log price for the test set.
# random_state is the scikit-learn wrapper's documented seed parameter; the
# `seed` spelling is its deprecated alias.
xg_cl = xgb.XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth= 3, random_state = 1, subsample = 0.9)
xg_cl.fit(X,y)
predict_test = xg_cl.predict(test_submission)

# Stage 2: the random forest maps the XGB prediction to the final log price.
# NOTE(review): min_samples_leaf is 4 here, but the model evaluated in the
# "Train random forest model" section used 6. Confirm which one is intended.
randomForest.set_params(n_estimators=180, random_state=42, max_features=1, max_depth = 5, min_samples_leaf=4)
randomForest.fit(np.array(preds_train_all_stack).reshape(-1,1), y)
predict_test_stack = randomForest.predict(np.array(predict_test).reshape(-1,1))



In [None]:
# Side-by-side prices: stacked prediction vs. the plain XGBoost baseline.
test = pd.DataFrame({
    "stack": np.exp(predict_test_stack),
    "baseline": np.exp(predict_test),
})

In [None]:
# How far the stack moves each price away from the baseline, in dollars.
test["diff"] = test["stack"] - test['baseline']
total_shift = sum(test["diff"])
print(total_shift)
ranked_by_baseline = test.sort_values("baseline")
print(ranked_by_baseline)
plt.scatter(test["baseline"], test["diff"])

In [13]:
# Stacked predictions for the test set, still in log-price space.
predict_test_stack

array([11.75140822, 11.9827758 , 12.06197336, ..., 11.91494254,
       11.70127596, 12.23595374])

In [None]:

# Convert the stacked log-price predictions back to prices, pair them with the
# test Ids, and write the submission file.
submission_xgb = pd.DataFrame({"Id":train_data_cleaned_test["Id"], "SalePrice":np.exp(predict_test_stack)})
submission_xgb.to_csv("data/submission_xgb_stack_drop.csv", index = False)

# Kept as the cell's last expression so the summary is actually rendered.
submission_xgb.describe()

In [None]:
# Preview the first rows of the submission frame.
submission_xgb.head()