In [603]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from itertools import product
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [668]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [669]:
# train = pd.read_csv("../cleanedData/data.train.matrix.csv")
# test = pd.read_csv("../cleanedData/data.test.matrix.csv")

In [670]:
# y = train['SalePrice']
# cols = [col for col in train.columns if col not in ['SalePrice']]
# X_train = train[cols]

In [671]:
train = pd.read_csv("../rawData/train.csv")
test = pd.read_csv("../rawData/test.csv")

In [672]:
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price":train["SalePrice"], "log(price + 1)":np.log1p(train["SalePrice"])})
# prices.hist()
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())


X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice


In [673]:
def rmse_cv(model, X_train, y):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 10))
    return(rmse)

In [674]:
# more features
all_data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.110874,4.189655,9.04204,7,5,2003,2003,5.283204,6.561031,0.0,...,0,0,0,1,0,0,0,0,1,0
1,3.044522,4.394449,9.169623,6,8,1976,1976,0.0,6.886532,0.0,...,0,0,0,1,0,0,0,0,1,0
2,4.110874,4.234107,9.328212,7,5,2001,2002,5.09375,6.188264,0.0,...,0,0,0,1,0,0,0,0,1,0
3,4.26268,4.110874,9.164401,7,5,1915,1970,0.0,5.379897,0.0,...,0,0,0,1,1,0,0,0,0,0
4,4.110874,4.442651,9.565284,8,5,2000,2000,5.860786,6.486161,0.0,...,0,0,0,1,0,0,0,0,1,0


In [675]:
model_ridge = Ridge()

In [676]:
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train, y).mean() 
            for alpha in alphas]

In [677]:
cv_ridge = pd.Series(cv_ridge, index = alphas)

# cv_ridge.plot(title = "Validation - Just Do It")
# plt.xlabel("alpha")
# plt.ylabel("rmse")

In [678]:
cv_ridge.mean()

0.13065920469395489

In [679]:
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)

In [680]:
# list(rmse_cv(model_lasso))

In [681]:
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
# cv_ridge.plot()
# plt.xlabel("alpha")
# plt.ylabel("rmse")

In [682]:
cv_lasso = rmse_cv(model_lasso,X_train, y)
cv_lasso.mean()

0.12118535046904196

In [726]:
X_train_reduced = X_train[X_train.columns[model_lasso.coef_ > 0]]
X_test_reduced = X_test[X_test.columns[model_lasso.coef_ > 0]]

In [721]:
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso, X_train, y)))
cv_lasso = rmse_cv(model_lasso, X_train, y)
cv_lasso.mean()

0.12118535046904196

In [722]:
model_enet = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet, X_train, y).mean()

0.12096858294376119

In [723]:
model_lasso_reduced = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train_reduced, y)
cv_lasso = pd.Series(list(rmse_cv(model_lasso_reduced, X_train_reduced, y)))
cv_lasso = rmse_cv(model_lasso_reduced, X_train_reduced, y)
cv_lasso.mean()

0.12355136973347081

In [725]:
model_enet_reduced = ElasticNet(alpha=0.0005, l1_ratio=0.8)
rmse_cv(model_enet_reduced, X_train_reduced, y).mean()

0.12326889785537645

In [664]:
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha = alpha), X_train_reduced, y).mean() 
            for alpha in alphas]

In [665]:
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.mean()

0.12463070050069092

In [None]:
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2, X_train_reduced, y).mean()

In [635]:

#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
rmse_cv(clf2).mean()

0.12108641434225838

In [590]:
#LASSO MODEL
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))

#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))


In [730]:
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))


clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))

In [731]:
clf3 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf3.fit(X_train, y)
lasso_preds_reduced = np.expm1(clf3.predict(X_test))


clf4 = ElasticNet(alpha=0.0005, l1_ratio=0.8)
clf4.fit(X_train, y)
elas_preds_reduced = np.expm1(clf4.predict(X_test))

In [732]:
final_result = 0.5 * elas_preds + 0.3 * lasso_preds + 0.1 * elas_preds_reduced + 0.1

In [598]:
final_result

array([ 119984.47469785,  151529.79740559,  180222.28236197, ...,
        169053.45745982,  118822.89211279,  227997.79456096])

In [458]:
model_lasso = LassoCV(alphas = [1, 0.5, 0.1, 0.005, 0.001, 0.0005]).fit(X_train, y)



In [298]:
# model_elastic = ElasticNetCV(l1_ratio = [num / 100 for num in range(1, 100)], cv=5).fit(X_train, y)

In [300]:
param_bundle = {
    
    'randomForest': {
        "max_depth": [None], 
        "n_estimators": [2000],
        'n_jobs':[-1]
    },
    'gbm': {
        "max_depth": [None], 
        "n_estimators": [500],
        "learning_rate": [0.1, 0.05]
    }
    
}

In [301]:
model_bundle = {
    'randomForest': RandomForestRegressor,
    'gbm': GradientBoostingRegressor
    
}

In [302]:
# for key, val in param_bunble.items():
# param_grid = param_bundle['gbm']
# model = model_bundle['gbm']()
# #     clf = model(param)
# grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs = -1, cv= 8)
# grid_search.fit(X_train, y)
# report(grid_search.cv_results_)
cv_result = rmse_cv(GradientBoostingRegressor(n_estimators=500))
print(cv_result.mean())

KeyboardInterrupt: 

In [366]:
clf = GradientBoostingRegressor(n_estimators=500)
clf.fit(X_train, y)
print(cv_result)

[ 0.13839848  0.09917672  0.10317121  0.16675129  0.15574974  0.10727297
  0.13311965  0.10372274  0.11709455  0.13873188]


In [368]:
X_test['PriceRange'] =  clf.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [373]:
X_test['PriceRange'] =  clf.predict(X_test)
X_test = pd.get_dummies(X_test)

In [378]:
X_test.shape

(1459, 292)

In [127]:
clf = GradientBoostingRegressor(n_estimators=500)

param_grid = {"max_depth": [3, 5, None], "learning_rate": [0.1, 0.05, 0.001]}    
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)

grid_search.fit(X_train, y)

GridSearchCV(cv=8, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.05], 'max_depth': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [123]:
report(grid_search.cv_results_)

Model with rank: 1
Mean validation score: 0.906 (std: 0.016)
Parameters: {'learning_rate': 0.1, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.902 (std: 0.015)
Parameters: {'learning_rate': 0.05, 'max_depth': 3}

Model with rank: 3
Mean validation score: 0.884 (std: 0.018)
Parameters: {'learning_rate': 0.1, 'max_depth': 1}



In [124]:
cv_rf = rmse_cv(grid_search)

In [54]:
cv_rf.mean()

NameError: name 'cv_rf' is not defined

In [98]:
learning_rates = [0.05, 0.1]
cv_gbm = [rmse_cv(GradientBoostingRegressor(n_estimators= 500, learning_rate = learning_rate)).mean() 
            for learning_rate in learning_rates]
cv_gbm = pd.Series(cv_gbm, index = learning_rates)

In [99]:
cv_gbm.min()

0.12565536164521188

In [53]:
rmse_cv(clf1).mean()

NameError: name 'clf1' is not defined

In [733]:
sample = pd.read_csv("../rawData/sample_submission.csv")

In [734]:
sample.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [735]:
sample['SalePrice'] = final_result

In [736]:
sample.to_csv("../submission/14th.csv", index = False)

In [75]:
# X_train.to_csv("../cleanedData/data.train.matrix2.csv")