## Load Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from scipy.stats import skew
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV,ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error

## Read in Data Function

In [2]:
def read_process_data(filename, n_train=1460):
    """Load a combined train/test CSV and return model-ready pieces.

    The file must contain an ``Id`` column (used as the index) and a
    ``SalePrice`` column. Rows whose ``Id`` is ``<= n_train`` are the
    training set; every other row is treated as test data.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read.
    n_train : int, default 1460
        Largest ``Id`` that still belongs to the training set.

    Returns
    -------
    X : pandas.DataFrame
        Training features, standardized to zero mean and unit (sample)
        standard deviation.
    y : pandas.Series
        ``log1p`` of the training ``SalePrice``.
    test : pandas.DataFrame
        Test features with exactly the columns of ``X``, scaled with the
        *training* mean and standard deviation so that a model fitted on
        ``X`` can be applied to it directly.
    """
    # read data from csv
    data = pd.read_csv(filename, index_col='Id')

    # one-hot encode the categorical columns
    data = pd.get_dummies(data)

    # kept from the original code: this dummy is unconditionally removed
    if 'MSSubClass_SC150' in data.columns:
        data = data.drop('MSSubClass_SC150', axis=1)

    # split on Id rather than on position, so files with deleted rows still work
    is_train = data.index <= n_train
    train = data.loc[is_train, :]
    test = data.loc[~is_train, :].drop(['SalePrice'], axis=1)

    # do log transform on target variable
    y = np.log1p(train.SalePrice)
    X = train.drop(['SalePrice'], axis=1)

    # dummies may come back as bool/uint8; do the arithmetic in float
    X = X.astype(float)
    test = test.astype(float)

    # scaling statistics come from the training rows only
    mean = X.mean()
    std = X.std()

    # a column that is constant in the training rows has std == 0 and would
    # turn into NaN when divided; it carries no information, so drop it
    usable = std[std > 0].index

    X = (X[usable] - mean[usable]) / std[usable]
    test = (test[usable] - mean[usable]) / std[usable]

    return (X, y, test)




## Find Best Model Function

In [3]:
def return_best_model(model, param_grid, X, y):
    """Grid-search ``param_grid`` for ``model`` and report the best CV score.

    Runs a 10-fold ``GridSearchCV`` scored with negative mean squared error,
    prints the square root of the best (negated) score as ``RMSE`` and
    returns the winning estimator with its parameters.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X, y
        Training features and target passed to ``fit``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` of the fitted search.
    """
    import inspect

    search_kwargs = dict(estimator=model,
                         param_grid=param_grid,
                         n_jobs=-1,
                         cv=10,
                         verbose=False,
                         scoring='neg_mean_squared_error')

    # ``iid`` no longer exists on recent scikit-learn releases, where passing
    # it raises TypeError. Keep ``iid=False`` only where the constructor still
    # accepts it, so older installations behave exactly as before.
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        search_kwargs['iid'] = False

    CV_object = GridSearchCV(**search_kwargs)
    CV_object.fit(X, y)

    best_model = CV_object.best_estimator_
    best_pars = CV_object.best_params_

    # best_score_ is a negated MSE, hence the sign flip before the root
    print('RMSE', np.sqrt(-CV_object.best_score_))

    return (best_model, best_pars)
    
 
    
    

## List of Datasets to Build Models

In [None]:
# Base names (without the ".csv" extension) of every prepared dataset,
# one per line, in the order they should be processed.
data_set_names = """
dataset_1_0
dataset1_1_delete_outliers
dataset2_log
dataset2_numeric_dummy
dataset2_addfeatures
dataset3_1
dataset3_2
dataset4_nodup
dataset4_nodup_nomulcol
""".split()

## Lasso

In [None]:
# Narrow the run to a single dataset. This rebinds `data_set_names`, so the
# loops further down iterate over this one entry instead of the full list
# defined in the previous cell.
# NOTE(review): the stored Lasso output lists other datasets (dataset_1_0, ...),
# so it appears to predate this cell -- confirm and re-run before relying on it.
data_set_names = ["dataset2_addfeatures"]

In [None]:
# Tune a Lasso for every dataset in two passes: LassoCV proposes an alpha,
# then a grid search refines it on a log-spaced grid around that value.
# Results are stored per dataset as (best_model, best_params).
model_dct = {}
for name in data_set_names:
    X, y, test = read_process_data('{}.csv'.format(name))
    print(name)
    print(X.shape)

    # coarse pass: let LassoCV choose among 100 automatically generated alphas
    lassocv = LassoCV(n_alphas=100, cv=5, max_iter=10000).fit(X, y)
    best_lambda = lassocv.alpha_

    # fine pass: 30 multipliers from 1e-2 to 1e2, plus 1 so that the
    # LassoCV choice itself is one of the candidates
    multipliers = np.concatenate([np.logspace(-2, 2, 30), [1]])
    param_grid = {'alpha': best_lambda * multipliers}

    # estimator whose alpha is searched over
    lasso_model = Lasso(max_iter=10000)
    model_dct[name] = return_best_model(lasso_model, param_grid, X, y)

dataset_1_0
(1460, 326)
RMSE 0.14172604668323158
dataset1_1_delete_outliers
(1458, 325)


## XGBoost

In [None]:
# Base XGBoost regressor; every tuned hyperparameter comes from the grid below.
# NOTE(review): `XGB` is not imported in the import cell at the top of this
# notebook -- presumably `import xgboost as XGB`; confirm and add it there,
# otherwise this cell raises NameError on a fresh kernel.
xgb_model = XGB.XGBRegressor()

# Hyperparameter grid: 3 * 3 * 2 * 3 * 2 * 3 = 324 combinations.
param_grid = dict(
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.03, 0.1],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8, 1.0],
    learning_rate=[0.1, 0.07],
    max_depth=[1, 3, 5],
)


In [None]:
# Grid-search the XGBoost regressor on every selected dataset.
# The (best_model, best_params) pair of each search is kept in
# `xgb_model_dct`, keyed by dataset name, so the expensive search does not
# have to be repeated to get at the tuned model. A separate dict is used so
# the Lasso results in `model_dct` are not overwritten.
xgb_model_dct = {}
for name in data_set_names:
    X, y, test = read_process_data(name + '.csv')
    print(name)
    print(np.shape(X))
    xgb_model_dct[name] = return_best_model(xgb_model, param_grid, X, y)