In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from scipy.stats import skew
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV,ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsRegressor
import xgboost as XGB
# from sklearn.model_selection import RepeatedKFold #why it doesn't work???
# from feature_selection import VarianceReduction

## Model
### Q1. More or fewer features?
### Q2. Lasso vs. Boosted Regression Trees
### Q3. Outliers matters?
### Q4. Even simpler models?
### Q5. Refleciton on the specification for the target variable, log or not

In [37]:
idsTotal=1460

In [38]:
# data version
train = pd.read_csv('dataset2_addfeatures.csv')

In [39]:
# test_final set
test_Kaggle=train.loc[train.Id>idsTotal,:].drop(['SalePrice','Id'],axis=1)
train=train.loc[train.Id<=idsTotal,:]
y=np.log1p(train.SalePrice)
train=train.drop(['SalePrice','Id'],axis=1)

In [40]:
# non-missing dataset
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))

Numerical features : 70
Categorical features : 45
NAs for numerical features in train : 0
Remaining NAs for numerical features in train : 0


In [41]:
# make sure target variable is not in the Xs
cor=train.apply(lambda x: x.corr(y,method='spearman'))
cor[abs(cor)>0.95]



Series([], dtype: float64)

In [42]:
categorical_features = train.select_dtypes(include = ["object"]).columns
train_cat = train.loc[:,categorical_features]
train_cat_dum = pd.get_dummies(train_cat)
numerical_features = train.select_dtypes(exclude = ["object"]).columns


train=pd.concat([train[numerical_features],train_cat_dum],axis=1)



In [43]:
#Standardization
stdSc = StandardScaler()
X=stdSc.fit_transform(train)
X=pd.DataFrame(X,columns=train.columns)

In [44]:
lassocv = LassoCV(n_alphas=100,cv=5,max_iter=10000)
lassocv.fit(X,y)
print('Best lambda for LASSO = {}'.format(lassocv.alpha_))
print('Score for LASSO = {}'.format(lassocv.score(X,y)))
best_lambda=lassocv.alpha_

Best lambda for LASSO = 0.0035361256739633133
Score for LASSO = 0.9426169929266768


In [45]:
# CV for the best hyperparameter
def lasy():
    lasso=Lasso(max_iter=10000)
    tuned_parameters = [{'alpha': best_lambda*np.append(np.logspace(-2,2,30),1)}]
    lassocv = GridSearchCV(lasso, tuned_parameters, cv=5, scoring=None)
    lassocv.fit(X, y)
    print('Best pars for LASSO = {} Default Scoring'.format(lassocv.best_params_))
    print('Best rmse for LASSO = {} Default Scoring'.format(lassocv.best_score_))
    lassocv = GridSearchCV(lasso, tuned_parameters, cv=5, scoring='neg_mean_squared_error')
    lassocv.fit(X, y)
    print('Best pars for LASSO = {}'.format(lassocv.best_params_))
    print('Best rmse for LASSO = {} '.format(np.sqrt(-lassocv.best_score_)))
    return lassocv

In [46]:
lassocv=lasy()

Best pars for LASSO = {'alpha': 0.0035361256739633133} Default Scoring
Best rmse for LASSO = 0.9256753291879508 Default Scoring
Best pars for LASSO = {'alpha': 0.0035361256739633133}
Best rmse for LASSO = 0.10879839112591393 


## benchmark models

In [15]:
lassocv = LassoCV(n_alphas=100,cv=5,max_iter=10000)
lassocv.fit(X,y)
print('Best lambda for LASSO = {}'.format(lassocv.alpha_))
print('Score for LASSO = {}'.format(lassocv.score(X,y)))
best_lambda=lassocv.alpha_
    lasso=Lasso(max_iter=10000)
    tuned_parameters = [{'alpha': best_lambda*np.append(np.logspace(-2,2,30),1)}]
    lassocv = GridSearchCV(lasso, tuned_parameters, cv=5, scoring=None)

Best lambda for LASSO = 3.2826499761714096
Score for LASSO = 0.8546526987673924


In [13]:
# CV for the best hyperparameter
def lasy():
    lasso=Lasso(max_iter=10000)
    tuned_parameters = [{'alpha': best_lambda*np.append(np.logspace(-2,2,30),1)}]
    lassocv = GridSearchCV(lasso, tuned_parameters, cv=5, scoring=None)
    lassocv.fit(X, y)
    print('Best pars for LASSO = {} Default Scoring'.format(lassocv.best_params_))
    print('Best rmse for LASSO = {} Default Scoring'.format(lassocv.best_score_))
    lassocv = GridSearchCV(lasso, tuned_parameters, cv=5, scoring='neg_mean_squared_error')
    lassocv.fit(X, y)
    print('Best pars for LASSO = {}'.format(lassocv.best_params_))
    print('Best rmse for LASSO = {} '.format(np.sqrt(-lassocv.best_score_)))

    knn=KNeighborsRegressor()
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(1,20,2)}]
    knncv = GridSearchCV(knn, tuned_parameters, cv=5, scoring='neg_mean_squared_error')
    knncv.fit(X, y)
    print('Best pars for KNN = {}'.format(knncv.best_params_))
    print('Best rmse for KNN = {}'.format(np.sqrt(-knncv.best_score_)))
    
    net=ElasticNet(max_iter=5000)
    tuned_parameters = [{'alpha': best_lambda*np.append(np.logspace(-1,1,5),1), "l1_ratio": [.1, .5, .7, .9, .95, .99, 1]}]
    netcv = GridSearchCV(net, tuned_parameters, cv=5, scoring='neg_mean_squared_error')
    netcv.fit(X, y)
    print('Best pars for ElasticNet = {}'.format(netcv.best_params_))
    print('Best rmse for ElasticNet = {}'.format(np.sqrt(-netcv.best_score_)))
    
    xgb=XGB.XGBRegressor(learning_rate=0.1,n_estimators=1000,nthread=-1,early_stopping_rounds=20,objective='reg:squarederror') #nthread=-1 use all gpu
    tuned_parameters = [{'max_depth':[3], 'gamma':[0.01,0.15,0.5] ,'subsample':[0.5]}]
    xgbcv = GridSearchCV(xgb, tuned_parameters, cv=5, scoring='neg_mean_squared_error')

    xgbcv.fit(X,y)
    print('Best pars for XGB = {}'.format(xgbcv.best_params_))
    print('Best rmse for XGB = {}'.format(np.sqrt(-xgbcv.best_score_)))
    return lassocv , knncv, netcv , xgbcv

In [14]:
lassocv = lasy()

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


KeyboardInterrupt: 

In [None]:
lassocv, knncv, xgbcv = lasy()
 

In [17]:
# preliminary verification

knn_bst=KNeighborsRegressor(**knncv.best_params_)
lasso_bst = Lasso(**lassocv.best_params_)
net_bst = ElasticNet(**netcv.best_params_)
# xgb_bst = XGB.XGBRegressor(**xgbcv.best_params_)

AttributeError: 'LassoCV' object has no attribute 'best_params_'

In [None]:
ys={}
for best_model in [knn_bst,lasso_bst,net_bst,xgb_bst]:
    best_model.fit(X,y)
    ys[best_model]=np.exp(best_model.predict(X_T))-1

In [None]:
ys=pd.DataFrame(ys)
ys.corr()

In [None]:
submission['SalePrice']=ys.iloc[:,1]*0.3+ys.iloc[:,2]*0.3+ys.iloc[:,3]*0.4
submission.to_csv("submission_combine_noKNN_full1.csv", index=False)

