## Load Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from scipy.stats import skew
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV,ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error
from sklearn.model_selection import RepeatedKFold
import xgboost as XGB

## Read in Data Function

In [None]:
def read_process_data(filename):
    """Load a combined train/test CSV and return model-ready pieces.

    The file is read with ``Id`` as the index and passed through
    ``pd.get_dummies``.  Rows with ``Id <= 1460`` are treated as training
    data; rows with a larger ``Id`` are treated as test data.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pd.read_csv``.  The data must contain an
        ``Id`` column and a ``SalePrice`` column.

    Returns
    -------
    X : pandas.DataFrame
        Training predictors, standardized column-wise to zero mean and
        unit (sample) standard deviation.
    y : pandas.Series
        ``log1p`` of the training ``SalePrice``.
    test : pandas.DataFrame
        Test predictors, standardized with the *training* mean and standard
        deviation so they live on the same scale as ``X``.
    """
    # read data from csv
    data = pd.read_csv(filename, index_col='Id')

    # one-hot encode via get_dummies
    data = pd.get_dummies(data)

    # rows with Id up to and including this value are training data
    last_train_id = 1460

    # this dummy is all zeros, which messes with standardization
    if 'MSSubClass_SC150' in data.columns:
        data = data.drop('MSSubClass_SC150', axis=1)

    # test rows: everything after the training range, without the target
    test = data.loc[data.index > last_train_id, :].drop(['SalePrice'], axis=1)

    # training rows, split into target and predictors
    train = data.loc[data.index <= last_train_id, :]

    # log transform on the target variable
    y = np.log1p(train.SalePrice)
    X = train.drop(['SalePrice'], axis=1)

    # Standardize using statistics from the training rows only.  A column
    # that is constant in the training rows has std == 0; dividing by it
    # would turn the whole column into NaN, so use 1 instead (the centered
    # column is then simply all zeros).
    train_mean = X.mean()
    train_std = X.std().replace(0, 1)
    X = (X - train_mean) / train_std

    # Put the test predictors on the same scale as the training predictors.
    test = (test - train_mean) / train_std

    return (X, y, test)




## Model Validation Module

In [None]:
def model_validation_score(X,y,model=None,n_split=10,n_repeats=10,msg=True):
    """Estimate train/test RMSE of ``model`` with repeated K-fold CV.

    For every split produced by ``RepeatedKFold`` the model is refit on the
    training part and used to predict both parts.  Targets and predictions
    from all splits are pooled, and one RMSE is computed over each pool.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; rows are selected positionally with ``.iloc``.
    y : pandas.Series or numpy.ndarray
        Targets, in the same row order as ``X``.
    model : estimator, optional
        Object with ``fit`` and ``predict``.  Defaults to
        ``LassoCV(cv=10, n_jobs=-1)``.  It is fitted in place.
    n_split : int
        Number of folds per repetition.
    n_repeats : int
        Number of K-fold repetitions.
    msg : bool
        When true, print progress and the final scores.

    Returns
    -------
    (train_score, test_score) : tuple of float
        Pooled RMSE on the training and held-out parts.  The printed label
        says "log error" because callers are expected to pass an already
        log-transformed target.
    """
    if model is None:
        model = LassoCV(cv=10,n_jobs=-1)
    rkf = RepeatedKFold(n_splits=n_split, n_repeats=n_repeats, random_state=2652124)

    # The split indices are positions, not labels.  Index y positionally so
    # a Series whose index is not 0..n-1 is handled the same way as X.
    y_rows = y.iloc if hasattr(y, 'iloc') else y

    y_train_lst = []
    y_train_estimated_lst = []
    y_test_lst = []
    y_test_estimated_lst = []
    for iteration, (train_index, test_index) in enumerate(rkf.split(y), start=1):
        if msg:
            print('iteration {}'.format(iteration))
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_rows[train_index], y_rows[test_index]
        model.fit(X_train,y_train)
        y_train_lst.extend(y_train.tolist())
        y_test_lst.extend(y_test.tolist())
        y_train_estimated_lst.extend(model.predict(X_train).tolist())
        y_test_estimated_lst.extend(model.predict(X_test).tolist())

    train_score = np.sqrt(mean_squared_error(y_train_lst,y_train_estimated_lst))
    test_score = np.sqrt(mean_squared_error(y_test_lst,y_test_estimated_lst))
    if msg:
        print('train root_mean_squared_log_error: {}'.format(train_score))
        print('test root_mean_squared_log_error: {}'.format(test_score))
    return(train_score,test_score)

## List of Datasets to Build Models

In [None]:
# Base names of the datasets to model, one entry per dataset
data_set_names = ["dataset2_addfeatures", "dataset3_2"]

## Lasso

In [None]:
# For each dataset: tune a Lasso with a two-stage alpha search and report
# its repeated-K-fold train/test error.
for name in data_set_names:
    X,y,test = read_process_data(name+'.csv')
    print(name)
    print(np.shape(X))

    #Define model
    lasso_model=Lasso(max_iter=10000)

    #Define hyperparameter tune grid
    #coarser grid: let LassoCV pick a starting alpha from 100 candidates
    lassocv = LassoCV(n_alphas=100,cv=10,max_iter=10000)
    lassocv.fit(X,y)
    best_lambda=lassocv.alpha_
    #finer grid: 30 log-spaced multiples (1e-2 .. 1e2) of the coarse optimum,
    #plus the coarse optimum itself (multiplier 1)
    param_grid = {
    'alpha': best_lambda*np.append(np.logspace(-2,2,30),1)
    }

    # NOTE: the former `iid=False` argument is intentionally not passed.
    # scikit-learn deprecated it in 0.22 and removed it in 0.24, where it
    # raises TypeError; the default behaviour since 0.22 matches iid=False.
    CV_object = GridSearchCV(estimator = lasso_model,
                param_grid = param_grid,
                n_jobs=-1,
                cv = 10,
                verbose=False)
    # Positional 0..n-1 indices are passed to the validation routine.
    model_validation_score(X.reset_index(drop=True),y.reset_index(drop=True),model=CV_object,msg=True)
    

## XGBoost

In [None]:
# Base XGBoost regressor with library defaults; the grid below is what
# gets tuned.
xgb_model = XGB.XGBRegressor()

# Hyperparameter grid searched for the XGBoost regressor
param_grid = dict(
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.03, 0.1],
    subsample=[0.6, 0.8],
    learning_rate=[0.1, 0.07],
    max_depth=[1, 3, 5],
)


In [None]:
# For each dataset: grid-search the XGBoost regressor and report its
# repeated-K-fold train/test error.
for name in data_set_names:
    X,y,test = read_process_data(name+'.csv')
    print(name)
    print(np.shape(X))
    # NOTE: the former `iid=False` argument is intentionally not passed.
    # scikit-learn deprecated it in 0.22 and removed it in 0.24, where it
    # raises TypeError; the default behaviour since 0.22 matches iid=False.
    CV_object = GridSearchCV(estimator = xgb_model,
                param_grid = param_grid,
                n_jobs=-1,
                cv = 10,
                verbose=False)
    # Positional 0..n-1 indices are passed to the validation routine.
    model_validation_score(X.reset_index(drop=True),y.reset_index(drop=True),model=CV_object,msg=True)