In [1]:
# Move into the project directory so that relative paths such as 'main.R'
# resolve. Fail early with a clear message when R_SOURCES is not set:
# otherwise the path would silently collapse to '/house_prices'.
if (!nzchar(Sys.getenv('R_SOURCES'))) {
    stop('Environment variable R_SOURCES is not set', call. = FALSE)
}
setwd(paste0(Sys.getenv('R_SOURCES'), '/house_prices'))
getwd()

In [6]:
# Load the project code. NOTE(review): 'main.R' presumably defines the
# `house_prices` object used below -- confirm.
source('main.R')
# Project helper; presumably attaches the packages (dplyr, purrr, ...) used in
# this notebook -- confirm against its implementation.
house_prices$helpers$import_libs()

# Build the cleaned dataset. The steps run in pipe order; each parenthesised
# `(house_prices$...)` is a project function applied to the running data frame.
combined_dataset <-
    # Load the raw data. It carries a `dataSource` column (filtered on below);
    # presumably train and test rows are combined -- TODO confirm.
    house_prices$helpers$load_data() %>%
    # Disabled: dropping of pre-selected categorical / numeric variables.
    #select(-one_of(house_prices$attributes_selection$discared_cat_vars)) %>%
    #select(-one_of(house_prices$attributes_selection$discared_num_vars)) %>%
    # Remove outliers (project helper).
    (house_prices$outliers$remove_outliers) %>%
    # Fix NA values (project helpers; behaviour inferred from their names only).
    (house_prices$missing$categ$replace_with_most_common) %>%
    (house_prices$missing$categ$fix_valid) %>%
    (house_prices$missing$numeric$replace_with_zero) %>%
    # Model the natural log of the sale price instead of the raw price.
    mutate(
        price_log = log(SalePrice)
    ) %>%
    # The raw target and the row identifier are not used as predictors.
    select(-SalePrice, -Id)

#####################
# transform numeric #
#####################

# Compute the per-variable transformation config from the data. The printed
# config lists, for each variable, a chosen transform (e.g. log, sqrt) and a
# score.
transformation_config <- house_prices$trans$numeric$get_transformation_config(combined_dataset)

# Display the config in the notebook.
transformation_config

# Apply the configured transforms, overwriting `combined_dataset`.
combined_dataset <-
    house_prices$trans$numeric$apply_transform(combined_dataset, transformation_config)


# Keep only the rows marked as training data and drop the marker column.
dataset <- 
    combined_dataset %>% 
    filter(dataSource == 'train') %>% 
    select(-dataSource)

# Quick look at the result: row count and first rows.
dataset %>% nrow
dataset %>% head

var,predictor,score
BsmtUnfSF,sqrt,79.97603
GrLivArea,log,91.15794
LotArea,log,73.38993
TotRmsAbvGrd,log,30.68502
X1stFlrSF,log,90.77166


MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,⋯,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,price_log
60,RL,65,9.04204,Pave,_none_,Reg,Lvl,AllPub,Inside,⋯,0,_none_,_none_,_none_,0,2,2008,WD,Normal,12.24769
20,RL,80,9.169623,Pave,_none_,Reg,Lvl,AllPub,FR2,⋯,0,_none_,_none_,_none_,0,5,2007,WD,Normal,12.10901
60,RL,68,9.328212,Pave,_none_,IR1,Lvl,AllPub,Inside,⋯,0,_none_,_none_,_none_,0,9,2008,WD,Normal,12.31717
70,RL,60,9.164401,Pave,_none_,IR1,Lvl,AllPub,Corner,⋯,0,_none_,_none_,_none_,0,2,2006,WD,Abnorml,11.8494
60,RL,84,9.565284,Pave,_none_,IR1,Lvl,AllPub,FR2,⋯,0,_none_,_none_,_none_,0,12,2008,WD,Normal,12.42922
50,RL,85,9.555064,Pave,_none_,IR1,Lvl,AllPub,Inside,⋯,0,_none_,MnPrv,Shed,700,10,2009,WD,Normal,11.8706


In [61]:
# Mean of the squared elements of `vec` (mean squared error when `vec` holds
# residuals).
#
# Args:
#   vec: numeric vector, typically prediction errors.
#
# Returns:
#   A single number: sum of squares divided by the number of elements
#   (NaN for an empty vector).
L2Loss <- function(vec) {
    squared <- vec^2
    sum(squared) / length(squared)
}

In [60]:
# Candidate model formulas to validate, from simplest to richest: each entry
# adds one predictor to the previous one. Stored as a one-column tibble so
# that per-formula results can be added as further columns.
# `tibble::tibble()` replaces the deprecated `data_frame()`.
formulas_for_validation <- tibble::tibble(
    formula = c(
        'price_log ~ GrLivArea + OverallQual',
        'price_log ~ GrLivArea + OverallQual + BsmtFinSF1',
        'price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning',
        'price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition',
        'price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition + CentralAir',
        'price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition + CentralAir + LotArea'
    )
)
formulas_for_validation

formula
price_log ~ GrLivArea + OverallQual
price_log ~ GrLivArea + OverallQual + BsmtFinSF1
price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning
price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition
price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition + CentralAir
price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition + CentralAir + LotArea


In [75]:
# Fit and score every candidate formula on a single train/test split.
#
# Args:
#   sample_index: integer row positions of `dataset` that form the training
#     part of the split; all remaining rows are held out for testing.
#
# Returns:
#   `formulas_for_validation` with three columns added per formula:
#     r2       - R-squared of the fit on the training rows,
#     L2_train - L2Loss of (observed - fitted) on the training rows,
#     L2_test  - L2Loss of (predicted - observed) on the held-out rows.
#
# Reads the globals `dataset`, `formulas_for_validation`, `L2Loss` and
# `house_prices`; they must be defined before this function is called.
iterate <- function(sample_index) {

    # Remember the true targets of the held-out rows before masking them.
    test_y <- dataset[-sample_index, 'price_log'][[1]]

    # Blank out the held-out targets; NA in `price_log` marks a test row from
    # here on.
    totalset <-
        dataset %>%
        mutate(price_log = replace(price_log, -sample_index, NA))

    # NOTE(review): presumably the masking above keeps the held-out targets
    # out of this target-based categorical transform -- confirm against
    # rating_transform.
    totalset <-
        house_prices$trans$categ$rating_transform(totalset, price_log)

    trainset <- totalset %>% filter(!is.na(price_log))
    testset <- totalset %>% filter(is.na(price_log))

    # Test predictions are compared with `test_y` by position, so both must
    # describe the same rows; a length mismatch would otherwise be recycled
    # silently. Assumes rating_transform keeps the row order -- TODO confirm.
    stopifnot(nrow(testset) == length(test_y))

    formulas_for_validation %>%
    mutate(
        # One linear model per formula, fitted on the training rows only.
        model = map(formula, ~lm(as.formula(.), data=trainset)),

        r2 = map_dbl(model, function (mod) {
            summary(mod)$r.squared
        }),

        L2_train = map_dbl(model, function (mod) {
            fitted_rows <- broom::augment(mod)
            L2Loss(fitted_rows[['price_log']] - fitted_rows$.fitted)
        }),

        L2_test = map_dbl(model, function (mod) {
            test_predicted <- predict(mod, testset)
            L2Loss(test_predicted - test_y)
        })
    ) %>%
    # The fitted model objects are only needed to compute the metrics.
    select(-model)
}

In [88]:
# Draw 10 random 50/50 train/test partitions of the training data and evaluate
# every candidate formula on each; `x` is a list with one result table per
# resample. No seed is set in this cell, so the partitions may differ between
# runs.
x <- caret::createDataPartition(
    y = dataset[['price_log']],
    p = 0.5,
    list = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
    times = 10
) %>%
    map(iterate)

In [90]:
# Stack the per-resample result tables into one; the list names of `x` become
# the `sample` column.
y <- x %>% bind_rows(.id = 'sample')

In [91]:
# Display the combined per-resample metrics table.
y

sample,formula,r2,L2_train,L2_test
Resample01,price_log ~ GrLivArea + OverallQual,0.7731434,0.0367995,0.03866039
Resample01,price_log ~ GrLivArea + OverallQual + BsmtFinSF1,0.8171506,0.02966089,0.03075648
Resample01,price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning,0.8401242,0.02593422,0.02642863
Resample01,price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition,0.8558766,0.02337895,0.02467857
Resample01,price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition + CentralAir,0.8643475,0.02200484,0.02334799
Resample01,price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition + CentralAir + LotArea,0.873706,0.02048676,0.02168708
Resample02,price_log ~ GrLivArea + OverallQual,0.7777078,0.03620621,0.03924898
Resample02,price_log ~ GrLivArea + OverallQual + BsmtFinSF1,0.8223841,0.02892947,0.03152975
Resample02,price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning,0.8406599,0.02595278,0.02645392
Resample02,price_log ~ GrLivArea + OverallQual + BsmtFinSF1 + MSZoning + SaleCondition,0.8524252,0.02403648,0.02419308
