Some methods come up in almost every machine learning project, such as cross-validation, grid search, and stacking.

So I wrapped them into a common module, helpers.py.

In this script, I will call those functions directly. Feel free to check my GitHub for the source code.



In [1]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

import numpy as np
import pandas as pd




In [2]:
import helpers
from importlib import reload
reload(helpers)
from helpers import *

In [3]:
# Load the three saved NumPy arrays in one pass. X and y are passed together to
# the model-selection helpers below; scaled_test is stacked with the test-side
# predictions later on.
X, y, scaled_test = (
    np.load(npy_path) for npy_path in ("X.npy", "y.npy", "scaled_test.npy")
)

In [4]:
# Disabled baseline comparison. When uncommented, this cell builds a list of
# candidate regressors, scores each one with rmse_cv(model, X, y) (presumably
# provided by helpers), and prints the mean and standard deviation of the scores.
# models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),
#           ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(max_iter=1000,tol=1e-3),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
#           ExtraTreesRegressor(),XGBRegressor()]

# names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]
# for name, model in zip(names, models):
#     score = rmse_cv(model, X, y)
#     print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))



In [5]:
# Grid-search Lasso over a narrow band of alpha values, with max_iter pinned at
# 10000. The result is reused further down as one of the stacked base models.
lasso_grid = {
    'alpha': [0.0004, 0.0005, 0.0007, 0.0006, 0.0009, 0.0008],
    'max_iter': [10000],
}
lasso_best = grid(Lasso()).grid_get(X, y, lasso_grid)

{'alpha': 0.0005, 'max_iter': 10000} 0.107202347338
                                 params  mean_test_score  std_test_score
0  {'alpha': 0.0004, 'max_iter': 10000}         0.107326        0.003200
1  {'alpha': 0.0005, 'max_iter': 10000}         0.107202        0.003151
2  {'alpha': 0.0007, 'max_iter': 10000}         0.107412        0.003104
3  {'alpha': 0.0006, 'max_iter': 10000}         0.107268        0.003121
4  {'alpha': 0.0009, 'max_iter': 10000}         0.107871        0.003099
5  {'alpha': 0.0008, 'max_iter': 10000}         0.107619        0.003103


In [6]:
# Grid-search Ridge over its alpha parameter only.
# NOTE(review): in the recorded output the best alpha (35) is the smallest value
# tried, so the optimum may lie below this grid -- worth confirming.
ridge_alphas = [35, 40, 45, 50, 55, 60, 65, 70, 80, 90]
ridge_best = grid(Ridge()).grid_get(X, y, {'alpha': ridge_alphas})

{'alpha': 35} 0.108637386149
          params  mean_test_score  std_test_score
0  {'alpha': 35}         0.108637        0.003011
1  {'alpha': 40}         0.108655        0.002995
2  {'alpha': 45}         0.108684        0.002982
3  {'alpha': 50}         0.108719        0.002972
4  {'alpha': 55}         0.108761        0.002964
5  {'alpha': 60}         0.108806        0.002958
6  {'alpha': 65}         0.108855        0.002953
7  {'alpha': 70}         0.108906        0.002950
8  {'alpha': 80}         0.109014        0.002945
9  {'alpha': 90}         0.109128        0.002943


In [7]:
# Grid-search an RBF-kernel SVR over C, gamma and epsilon (5 x 2 x 2 = 20
# combinations; the kernel itself is fixed).
svr_grid = {
    'C': [11, 12, 13, 14, 15],
    'kernel': ["rbf"],
    "gamma": [0.0003, 0.0004],
    "epsilon": [0.008, 0.009],
}
svr_search = grid(SVR())
svr_best = svr_search.grid_get(X, y, svr_grid)


{'C': 15, 'gamma': 0.0004, 'epsilon': 0.008, 'kernel': 'rbf'} 0.106137364476
                                               params  mean_test_score  \
0   {'C': 11, 'gamma': 0.0003, 'epsilon': 0.008, '...         0.106940   
1   {'C': 11, 'gamma': 0.0004, 'epsilon': 0.008, '...         0.106369   
2   {'C': 11, 'gamma': 0.0003, 'epsilon': 0.009, '...         0.106881   
3   {'C': 11, 'gamma': 0.0004, 'epsilon': 0.009, '...         0.106355   
4   {'C': 12, 'gamma': 0.0003, 'epsilon': 0.008, '...         0.106825   
5   {'C': 12, 'gamma': 0.0004, 'epsilon': 0.008, '...         0.106260   
6   {'C': 12, 'gamma': 0.0003, 'epsilon': 0.009, '...         0.106781   
7   {'C': 12, 'gamma': 0.0004, 'epsilon': 0.009, '...         0.106246   
8   {'C': 13, 'gamma': 0.0003, 'epsilon': 0.008, '...         0.106715   
9   {'C': 13, 'gamma': 0.0004, 'epsilon': 0.008, '...         0.106181   
10  {'C': 13, 'gamma': 0.0003, 'epsilon': 0.009, '...         0.106698   
11  {'C': 13, 'gamma': 0.0004, 'eps

In [None]:
# Grid-search a degree-3 polynomial KernelRidge over alpha and coef0.
# ker_best is used below both as a base model and as the stack's meta model.
param_grid = {
    'alpha': [0.2, 0.3, 0.4, 0.5],
    'kernel': ["polynomial"],
    'degree': [3],
    'coef0': [0.8, 1, 1.2],
}
ker_search = grid(KernelRidge())
ker_best = ker_search.grid_get(X, y, param_grid)



{'alpha': 0.3, 'degree': 3, 'kernel': 'polynomial', 'coef0': 1.2} 0.10715301547
                                               params  mean_test_score  \
0   {'alpha': 0.2, 'degree': 3, 'kernel': 'polynom...         0.108579   
1   {'alpha': 0.2, 'degree': 3, 'kernel': 'polynom...         0.107362   
2   {'alpha': 0.2, 'degree': 3, 'kernel': 'polynom...         0.107167   
3   {'alpha': 0.3, 'degree': 3, 'kernel': 'polynom...         0.110112   
4   {'alpha': 0.3, 'degree': 3, 'kernel': 'polynom...         0.107802   
5   {'alpha': 0.3, 'degree': 3, 'kernel': 'polynom...         0.107153   
6   {'alpha': 0.4, 'degree': 3, 'kernel': 'polynom...         0.111889   
7   {'alpha': 0.4, 'degree': 3, 'kernel': 'polynom...         0.108480   
8   {'alpha': 0.4, 'degree': 3, 'kernel': 'polynom...         0.107397   
9   {'alpha': 0.5, 'degree': 3, 'kernel': 'polynom...         0.113770   
10  {'alpha': 0.5, 'degree': 3, 'kernel': 'polynom...         0.109262   
11  {'alpha': 0.5, 'degree': 3, 

In [None]:
# Grid-search ElasticNet over alpha and l1_ratio, with max_iter pinned at 10000.
ela_grid = {
    'alpha': [0.0005, 0.0008, 0.004, 0.005],
    'l1_ratio': [0.08, 0.1, 0.3, 0.5, 0.7],
    'max_iter': [10000],
}
ela_best = grid(ElasticNet()).grid_get(X, y, ela_grid)

In [None]:
# BayesianRidge is created with its default settings; unlike the models above,
# no grid search is run for it. It is passed to the stack as a base model below.
bay = BayesianRidge()

## Stacking

In [None]:
# Combine the six models into one stacked model (stacking is presumably provided
# by helpers); ker_best appears both in the base list and as the meta model.
# The printed value is whatever rmse_cv returns for the stacked model on (X, y).
base_learners = [lasso_best, ridge_best, svr_best, ker_best, ela_best, bay]
stack_model = stacking(mod=base_learners, meta_model=ker_best)
stack_scores = rmse_cv(stack_model, X, y)
print(stack_scores)


In [None]:
# Build the stacked feature columns and append them to the original features.
# get_oof is given the same arrays loaded at the top of the notebook: the earlier
# call used `a`, `b` and `test_X_scaled`, which are never defined here and would
# raise NameError. Presumably it returns out-of-fold predictions for the training
# rows and matching predictions for the test rows -- confirm against helpers.
X_train_stack, X_test_stack = stack_model.get_oof(X, y, scaled_test)

# Place the stacked columns next to the original feature columns (row counts
# must match: X with X_train_stack, scaled_test with X_test_stack).
X_train_add = np.hstack((X, X_train_stack))
X_test_add = np.hstack((scaled_test, X_test_stack))

# Re-score the stacked model on the augmented training matrix.
print(rmse_cv(stack_model, X_train_add, y))



In [None]:
# Fit the stacked model on the augmented training matrix (original features plus
# the stacked columns produced by get_oof) before predicting on the test set.
stack_model.fit(X_train_add,y)

In [None]:
# Predict on the augmented test matrix, then exponentiate the result.
# NOTE(review): np.exp implies y was stored as log(target); if it was log1p,
# np.expm1 would be the matching inverse -- confirm against the preprocessing.
raw_pred = stack_model.predict(X_test_add)
pred = np.exp(raw_pred)

In [None]:
from time import gmtime, strftime

# Start from the sample submission file so the existing columns and row order
# are kept, then overwrite the SalePrice column with the predictions.
sub = pd.read_csv("sample_submission.csv")

# A UTC timestamp prefix gives each run its own output file name.
timestamp = strftime("%Y%m%d%H%M%S", gmtime())
submName = timestamp + '_submission.csv'

sub['SalePrice'] = pred
sub.to_csv(submName, index=False)