In [3]:
# Move into the house_prices directory under $R_SOURCES so the relative path
# to main.R resolves, then evaluate main.R and keep the value of its last
# expression as `house_prices`.
setwd(file.path(Sys.getenv("R_SOURCES"), "house_prices"))
house_prices <- source("main.R", local = TRUE)$value

# Step 1: load the data, run it through the project's outlier-removal and
# NA-fixing helpers, add price_log = log(SalePrice) as the response, drop the
# now-redundant SalePrice and Id columns, and order the columns by name.
combined_dataset_step1 <-
    house_prices$helpers$load_data() %>%
    house_prices$outliers$remove_outliers() %>%
    house_prices$missing$fix_all() %>%
    mutate(price_log = log(SalePrice)) %>%
    select(-c(SalePrice, Id)) %>%
    select(order(names(.)))

################################
# transform numeric predictors #
################################

# Offer four candidate transforms (log(x + 1), square root, cube root and
# fourth root) to the project's functional_transform helper. Its result is
# destructured into the transformed dataset and the transform table.
# NOTE(review): the meaning of `threshold = 30` is defined in main.R — confirm.
c(combined_dataset_step2, trans_config) %<-%
    house_prices$trans$numeric$functional_transform(
        data = combined_dataset_step1,
        trans = tribble(
            ~tran_name, ~tran_fn,
            "log",  function(x) log(x + 1),
            "sqrt", function(x) sqrt(x),
            "inv3", function(x) x^(1 / 3),
            "inv4", function(x) x^(1 / 4)
        ),
        target_var = price_log,
        threshold = 30
    )

# Order the columns of the transformed dataset by name.
combined_dataset_step2 <- select(
    combined_dataset_step2,
    order(names(combined_dataset_step2))
)

# Display the transform table returned by the helper.
trans_config

# Preview the differences between the step-1 and step-2 frames.
head(
    house_prices$helpers$utils$frames_diff(
        combined_dataset_step1,
        combined_dataset_step2
    )
)

####################################                
# transform categorical predictors #
####################################
# Step 3: run the project's rating_transform helper on the step-2 dataset with
# price_log as the target, then order the resulting columns by name.
combined_dataset_step3 <- house_prices$trans$categ$rating_transform(
    data = combined_dataset_step2,
    target_var = price_log
)
combined_dataset_step3 <- select(
    combined_dataset_step3,
    order(names(combined_dataset_step3))
)

# Preview the differences between the step-2 and step-3 frames.
head(
    house_prices$helpers$utils$frames_diff(
        combined_dataset_step2,
        combined_dataset_step3
    )
)

var,tran_name,progress_score,tran_fn
GrLivArea,log,91.15794,"function (x) , log(x + 1)"
X1stFlrSF,log,90.77166,"function (x) , log(x + 1)"
BsmtUnfSF,sqrt,79.97603,"function (x) , sqrt(x)"
LotArea,inv4,78.86403,"function (x) , x^(1/4)"
TotRmsAbvGrd,log,30.68502,"function (x) , log(x + 1)"


BsmtUnfSF.1,BsmtUnfSF.2,GrLivArea.1,GrLivArea.2,LotArea.1,LotArea.2,TotRmsAbvGrd.1,TotRmsAbvGrd.2,X1stFlrSF.1,X1stFlrSF.2
150,12.24745,1710,7.444833,8450,9.587694,8,2.197225,856,6.753438
284,16.8523,1262,7.141245,9600,9.898464,6,1.94591,1262,7.141245
434,20.83267,1786,7.488294,11250,10.298836,6,1.94591,920,6.82546
540,23.2379,1717,7.448916,9550,9.88555,7,2.079442,961,6.869014
490,22.13594,2198,7.695758,14260,10.927728,9,2.302585,1145,7.044033
64,8.0,1362,7.217443,14115,10.899842,5,1.791759,796,6.680855


Alley.1,Alley.2,BldgType.1,BldgType.2,BsmtCond.1,BsmtCond.2,BsmtExposure.1,BsmtExposure.2,BsmtFinType1.1,BsmtFinType1.2,⋯,RoofStyle.1,RoofStyle.2,SaleCondition.1,SaleCondition.2,SaleType.1,SaleType.2,Street.1,Street.2,Utilities.1,Utilities.2
_none_,2.531822,1Fam,2.564039,TA,2.540871,No,2.320042,GLQ,3.262019,⋯,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
_none_,2.531822,1Fam,2.564039,TA,2.540871,Gd,3.295455,ALQ,2.231818,⋯,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
_none_,2.531822,1Fam,2.564039,TA,2.540871,Mn,2.675439,GLQ,3.262019,⋯,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
_none_,2.531822,1Fam,2.564039,Gd,3.046154,No,2.320042,ALQ,2.231818,⋯,Gable,2.418054,Abnorml,1.920792,WD,2.421468,Pave,2.499311,AllPub,2.496911
_none_,2.531822,1Fam,2.564039,TA,2.540871,Av,2.900452,GLQ,3.262019,⋯,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911
_none_,2.531822,1Fam,2.564039,TA,2.540871,No,2.320042,GLQ,3.262019,⋯,Gable,2.418054,Normal,2.451586,WD,2.421468,Pave,2.499311,AllPub,2.496911


In [6]:
# Keep only the rows whose dataSource is 'train', then drop the dataSource
# column itself.
train_data <- select(
    filter(combined_dataset_step3, dataSource == "train"),
    -dataSource
)

# Show the first four training rows.
head(train_data, 4)

Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,⋯,TotalBsmtSF,TotRmsAbvGrd,Utilities,WoodDeckSF,X1stFlrSF,X2ndFlrSF,X3SsnPorch,YearBuilt,YearRemodAdd,YrSold
2.531822,3,2.564039,2.540871,2.320042,706,0,3.262019,2.553429,1,⋯,856,2.197225,2.496911,0,6.753438,854,0,2003,2003,2008
2.531822,3,2.564039,2.540871,3.295455,978,0,2.231818,2.553429,0,⋯,1262,1.94591,2.496911,298,7.141245,0,0,1976,1976,2007
2.531822,3,2.564039,2.540871,2.675439,486,0,3.262019,2.553429,1,⋯,920,1.94591,2.496911,0,6.82546,866,0,2001,2002,2008
2.531822,3,2.564039,3.046154,2.320042,216,0,2.231818,2.553429,1,⋯,756,2.079442,2.496911,0,6.869014,756,0,1915,1970,2006


In [135]:
# Re-evaluate main.R so this cell runs against the current project code.
house_prices <- source("main.R", local = TRUE)$value

# Ask the project helper for the initial single-predictor candidates for
# price_log on the training data.
# NOTE(review): r2_discard_level presumably drops predictors whose r2 falls
# below 0.02 — confirm against main.R.
step1 <- house_prices$model_selection$lm$find_initial_best_r2_predictor(
    data = train_data,
    target_var = price_log,
    r2_discard_level = 0.02
)

# Preview the first rows of the result.
head(step1)

predictor,formula,r2
OverallQual,price_log ~ OverallQual,0.6747053
Neighborhood,price_log ~ Neighborhood,0.5477089
GrLivArea,price_log ~ GrLivArea,0.5438044
GarageCars,price_log ~ GarageCars,0.4638054
ExterQual,price_log ~ ExterQual,0.4452394
BsmtQual,price_log ~ BsmtQual,0.4353085


In [141]:
# Re-evaluate main.R so this cell runs against the current project code.
house_prices <- source("main.R", local = TRUE)$value

# Starting from the formula in the first row of step1, ask the project helper
# for the next predictor to add, with every step1 predictor as a candidate.
# NOTE(review): the semantics of r2_gain_discard_level, a_max and a_avg are
# defined in main.R — confirm there.
step2 <- house_prices$model_selection$lm$find_next_best_r2_predictor(
    data = train_data,
    base_formula_str = step1$formula[[1]],
    predictors = step1[["predictor"]],
    r2_gain_discard_level = 0.5,
    a_max = 0,
    a_avg = 0
)

# Preview the first rows of the result.
head(step2)

[1] "OverallQual"
 [1] "Neighborhood"  "GrLivArea"     "GarageCars"    "ExterQual"    
 [5] "BsmtQual"      "KitchenQual"   "GarageArea"    "TotalBsmtSF"  
 [9] "X1stFlrSF"     "GarageFinish"  "FullBath"      "YearBuilt"    
[13] "GarageType"    "MSSubClass"    "YearRemodAdd"  "FireplaceQu"  
[17] "Foundation"    "TotRmsAbvGrd"  "Fireplaces"    "HeatingQC"    
[21] "BsmtFinType1"  "MasVnrType"    "MasVnrArea"    "Exterior1st"  
[25] "Exterior2nd"   "MSZoning"      "LotArea"       "BsmtFinSF1"   
[29] "GarageCond"    "BsmtExposure"  "GarageQual"    "SaleCondition"
[33] "CentralAir"    "GarageYrBlt"   "SaleType"      "WoodDeckSF"   
[37] "OpenPorchSF"   "X2ndFlrSF"     "HouseStyle"    "HalfBath"     
[41] "Electrical"    "PavedDrive"    "LotShape"      "BsmtCond"     
[45] "BsmtFullBath"  "BsmtUnfSF"     "ExterCond"     "BedroomAbvGr" 
[49] "BsmtFinType2"  "Fence"         "RoofStyle"     "Condition1"   
[53] "BldgType"      "LotFrontage"   "Heating"       "Alley"        
[57] "LandContou

predictor,formula,base_r2,r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
GrLivArea,price_log ~ OverallQual + GrLivArea,0.6747053,0.7641906,13.26286,0.608875,0.608875,13.26286
X1stFlrSF,price_log ~ OverallQual + X1stFlrSF,0.6747053,0.7512111,11.339139,0.4481937,0.4481937,11.339139
LotArea,price_log ~ OverallQual + LotArea,0.6747053,0.7462617,10.605577,0.1645073,0.1645073,10.605577
Neighborhood,price_log ~ OverallQual + Neighborhood,0.6747053,0.7363907,9.142558,0.6790092,0.6790092,9.142558
TotalBsmtSF,price_log ~ OverallQual + TotalBsmtSF,0.6747053,0.7339942,8.787371,0.5385945,0.5385945,8.787371
GarageArea,price_log ~ OverallQual + GarageArea,0.6747053,0.7318042,8.462784,0.55723,0.55723,8.462784


In [152]:
# Re-evaluate main.R so this cell runs against the current project code.
house_prices <- source("main.R", local = TRUE)$value

# Run the project's greedy_r2_gain_adj selection on the training data with
# both a_max and a_avg set to 0.
# NOTE(review): the semantics of the discard levels and of a_max / a_avg are
# defined in main.R — confirm there.
report <- house_prices$model_selection$lm$greedy_r2_gain_adj(
    data = train_data,
    target_var = price_log,
    r2_gain_discard_level = 0.5,
    r2_discard_level = 0.02,
    a_max = 0,
    a_avg = 0
)

# Show the first row recorded for each selection step.
filter(group_by(report, step), row_number() == 1)

predictor,formula,r2,step,base_r2,r2_gain,cor_abs_max,cor_abs_avg,r2_gain_adj
OverallQual,price_log ~ OverallQual,0.6747053,1,,,,,
GrLivArea,price_log ~ OverallQual + GrLivArea,0.7641906,2,0.6747053,13.2628602,0.608875,0.608875,13.2628602
Neighborhood,price_log ~ OverallQual + GrLivArea + Neighborhood,0.813279,3,0.7641906,6.4235833,0.6790092,0.5732441,6.4235833
BsmtFinSF1,price_log ~ OverallQual + GrLivArea + Neighborhood + BsmtFinSF1,0.8493875,4,0.813279,4.4398744,0.2426179,0.1983661,4.4398744
GarageArea,price_log ~ OverallQual + GrLivArea + Neighborhood + BsmtFinSF1 + GarageArea,0.86119,5,0.8493875,1.3895276,0.55723,0.4547195,1.3895276
CentralAir,price_log ~ OverallQual + GrLivArea + Neighborhood + BsmtFinSF1 + GarageArea + CentralAir,0.8702618,6,0.86119,1.0533979,0.2725223,0.2138656,1.0533979
LotArea,price_log ~ OverallQual + GrLivArea + Neighborhood + BsmtFinSF1 + GarageArea + CentralAir + LotArea,0.8776064,7,0.8702618,0.8439511,0.3697551,0.2320913,0.8439511
YearRemodAdd,price_log ~ OverallQual + GrLivArea + Neighborhood + BsmtFinSF1 + GarageArea + CentralAir + LotArea + YearRemodAdd,0.885986,8,0.8776064,0.9548328,0.5498273,0.3163918,0.9548328
TotalBsmtSF,price_log ~ OverallQual + GrLivArea + Neighborhood + BsmtFinSF1 + GarageArea + CentralAir + LotArea + YearRemodAdd + TotalBsmtSF,0.8914477,9,0.885986,0.6164546,0.5385945,0.4008991,0.6164546


In [153]:
# Re-evaluate main.R so this cell runs against the current project code.
house_prices <- source("main.R", local = TRUE)$value

# Same greedy_r2_gain_adj run as with a_max = 0 / a_avg = 0, but with both set
# to 1.
# NOTE(review): the recorded output of this cell is the error
# "Column `predictor` must be length 1 or 1, not 0"; the cause is not visible
# from this notebook — investigate in main.R.
report <- house_prices$model_selection$lm$greedy_r2_gain_adj(
    data = train_data,
    target_var = price_log,
    r2_gain_discard_level = 0.5,
    r2_discard_level = 0.02,
    a_max = 1,
    a_avg = 1
)

# Show the first row recorded for each selection step.
filter(group_by(report, step), row_number() == 1)

ERROR: Error: Column `predictor` must be length 1 or 1, not 0
