In [1]:
# Switch into the project's R directory, located under the root given by the
# ROOT environment variable; later cells load 'main.R' by relative path.
# Fail fast when ROOT is unset or empty: Sys.getenv() would return "" and the
# target would silently become '/R'.
stopifnot(nzchar(Sys.getenv('ROOT')))
setwd(file.path(Sys.getenv('ROOT'), 'R'))
getwd()

In [106]:
# Load the project code; `kaggle.house` and the pipe/dplyr verbs used below are
# expected to be available after this call.
source('main.R')

data <- kaggle.house$loadData()

# Combine the train and test sets, run kaggle.house$na$fixAll over the result,
# and replace SalePrice by its natural logarithm (price.log).
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
) %>%
    mutate(price.log = log(SalePrice)) %>%
    select(-SalePrice)

df.combined.after.binary <- kaggle.house$trans$binaryTransform(df.combined)

# Columns reported as categorical before binaryTransform but not after it.
binary.vars <- setdiff(
    kaggle.house$getCategoricalColumnNames(df.combined),
    kaggle.house$getCategoricalColumnNames(df.combined.after.binary)
)

# Columns still reported as categorical after binaryTransform, with dataSource
# removed beforehand so it is never part of the list.
averaging.vars <- kaggle.house$getCategoricalColumnNames(
    select(df.combined.after.binary, -dataSource)
)

# Preview: the binary-transformed columns next to the dataSource / Id keys.
head(
    select(df.combined.after.binary, one_of(binary.vars), dataSource, Id),
    2
)

# Remaining categorical columns together with price.log and the
# dataSource / Id keys.
df.combined.averaging.vars <- select(
    df.combined.after.binary,
    one_of(averaging.vars), price.log, dataSource, Id
)

head(df.combined.averaging.vars, 2)

Alley,CentralAir,Electrical,Functional,Heating,LandContour,LandSlope,LotShape,MiscFeature,PavedDrive,PoolQC,RoofMatl,Street,Utilities,dataSource,Id
0,1,1,1,1,1,0,1,0,1,0,1,1,1,train,1
0,1,1,1,1,1,0,1,0,1,0,1,1,1,train,2


BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,Condition1,Condition2,ExterCond,Exterior1st,⋯,MasVnrType,MSSubClass,MSZoning,Neighborhood,RoofStyle,SaleCondition,SaleType,price.log,dataSource,Id
1Fam,TA,No,GLQ,Unf,Gd,Norm,Norm,TA,VinylSd,⋯,BrkFace,60,RL,CollgCr,Gable,Normal,WD,12.24769,train,1
1Fam,TA,Gd,ALQ,Unf,Gd,Feedr,Norm,TA,MetalSd,⋯,,20,RL,Veenker,Gable,Normal,WD,12.10901,train,2


In [79]:
# Apply averagingTransform to df.combined.averaging.vars, passing price.log as
# the y variable, Id and dataSource as the id / source columns and mean as the
# statistic (diff = TRUE). price.log is dropped from the result.
df.combined.after.averaging <-
    kaggle.house$trans$averagingTransform(
        dataset = df.combined.averaging.vars,
        y.var = price.log,
        id.var = Id,
        src.var = dataSource,
        stat.fun = mean,
        diff = TRUE
    ) %>%
    select(-price.log)

head(df.combined.after.averaging, 2)

dataSource,Id,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,Condition1,Condition2,⋯,HouseStyle,KitchenQual,LotConfig,MasVnrType,MSSubClass,MSZoning,Neighborhood,RoofStyle,SaleCondition,SaleType
train,496,0.02346151,0.01856394,-0.06839749,-0.1712046,0.01968722,-0.2131955,0.01886523,0.001844353,⋯,-0.03036812,-0.2134591,-0.02215127,-0.1311315,-0.59024195,-0.9057919,-0.5771619,-0.03985115,-0.2352766,-0.03299004
train,917,0.02346151,0.01856394,0.13421624,-0.1564165,0.01968722,-0.2131955,0.01886523,0.001844353,⋯,-0.03036812,-0.2134591,-0.02215127,-0.1311315,0.02873685,-0.9057919,-0.5771619,-0.03985115,-0.2352766,-0.03299004


In [128]:
# Re-source the project code before comparing the two ways of building the
# final dataset.
source('main.R')

# Manual assembly: take the binary-transformed data without the averaging
# columns and attach the averaged versions of those columns by dataSource / Id.
df.combined.final <-
    df.combined.after.binary %>%
    select(-one_of(averaging.vars)) %>%
    inner_join(df.combined.after.averaging, by = c("dataSource", "Id"))

# The same input passed through transformCombindedDataset in a single call.
df.combined.final2 <- kaggle.house$trans$transformCombindedDataset(df.combined)

head(df.combined.final, 2)
head(df.combined.final2, 2)

# Element-wise comparison of both results; price.log is left out of the check.
all(
    select(df.combined.final, -price.log) ==
        select(df.combined.final2, -price.log)
)

Id,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LandSlope,OverallQual,⋯,HouseStyle,KitchenQual,LotConfig,MasVnrType,MSSubClass,MSZoning,Neighborhood,RoofStyle,SaleCondition,SaleType
1,65,8450,1,0,1,1,1,0,7,⋯,0.15863531,0.1982858,-0.02215127,0.1395788,0.31592375,0.06183413,0.1395904,-0.03985115,-0.01901721,-0.03299004
2,80,9600,1,0,1,1,1,0,6,⋯,-0.03036812,-0.2134591,0.0106905,-0.1311315,0.02873685,0.06183413,0.320125,-0.03985115,-0.01901721,-0.03299004


Id,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LandSlope,OverallQual,⋯,HouseStyle,KitchenQual,LotConfig,MasVnrType,MSSubClass,MSZoning,Neighborhood,RoofStyle,SaleCondition,SaleType
1,65,8450,1,0,1,1,1,0,7,⋯,0.15863531,0.1982858,-0.02215127,0.1395788,0.31592375,0.06183413,0.1395904,-0.03985115,-0.01901721,-0.03299004
2,80,9600,1,0,1,1,1,0,6,⋯,-0.03036812,-0.2134591,0.0106905,-0.1311315,0.02873685,0.06183413,0.320125,-0.03985115,-0.01901721,-0.03299004


In [93]:
# Sanity checks: the transformed dataset must expose exactly the same set of
# column names and the same number of rows as the untransformed one.
# identical() compares the sorted name vectors as a whole, so a differing
# column count fails outright instead of going through element-wise recycling.
stopifnot(identical(
    sort(colnames(df.combined.final)),
    sort(colnames(df.combined))
))
stopifnot(nrow(df.combined.final) == nrow(df.combined))

In [111]:
# Long format of the untransformed data: one row per Id / dataSource / variable,
# with the variable's value stored in value.old.
X <- df.combined %>%
    select(-price.log) %>%
    gather(var.name, value.old, -dataSource, -Id)

# Same reshaping for the transformed data, value stored in value.new.
Y <- df.combined.final %>%
    select(-price.log) %>%
    gather(var.name, value.new, -dataSource, -Id)

head(X, 2)
head(Y, 2)

# Pair old and new values of every variable for every row.
Z <- X %>% inner_join(Y, by = c("dataSource", "var.name", "Id"))

# For the binary variables, count how often each original value ends up as
# each new value, per data source.
Z %>%
    filter(var.name %in% binary.vars) %>%
    group_by(dataSource, var.name, value.new, value.old) %>%
    count() %>%
    arrange(var.name, dataSource) %>%
    head()

Id,dataSource,var.name,value.old
1,train,MSSubClass,60
2,train,MSSubClass,20


Id,dataSource,var.name,value.new
1,train,LotFrontage,65
2,train,LotFrontage,80


dataSource,var.name,value.new,value.old,n
test,Alley,0,_none_,1352
test,Alley,1,Grvl,70
test,Alley,1,Pave,37
train,Alley,0,_none_,1369
train,Alley,1,Grvl,50
train,Alley,1,Pave,41


In [122]:
# Smallest and largest transformed value per original level of each averaging
# variable and data source, shown for RoofStyle only. Equal min and max mean
# that a level always maps to one single value.
filter(Z, var.name %in% averaging.vars) %>%
    group_by(dataSource, var.name, value.old) %>%
    summarise(min(value.new), max(value.new)) %>%
    filter(var.name == "RoofStyle")

# Recompute the same quantity by hand on the training rows: the mean of
# price.log within each RoofStyle minus the mean of price.log over all
# training rows.
filter(df.combined, dataSource == "train") %>%
    group_by(RoofStyle) %>%
    mutate(avg = mean(price.log)) %>%
    ungroup() %>%
    mutate(avg = avg - mean(price.log)) %>%
    group_by(RoofStyle) %>%
    summarise(min(avg), max(avg))

dataSource,var.name,value.old,min(value.new),max(value.new)
test,RoofStyle,Flat,0.09885963,0.09885963
test,RoofStyle,Gable,-0.03985115,-0.03985115
test,RoofStyle,Gambrel,-0.22655512,-0.22655512
test,RoofStyle,Hip,0.16037887,0.16037887
test,RoofStyle,Mansard,0.03337435,0.03337435
test,RoofStyle,Shed,0.28755723,0.28755723
train,RoofStyle,Flat,0.09885963,0.09885963
train,RoofStyle,Gable,-0.03985115,-0.03985115
train,RoofStyle,Gambrel,-0.22655512,-0.22655512
train,RoofStyle,Hip,0.16037887,0.16037887


RoofStyle,min(avg),max(avg)
Flat,0.09885963,0.09885963
Gable,-0.03985115,-0.03985115
Gambrel,-0.22655512,-0.22655512
Hip,0.16037887,0.16037887
Mansard,0.03337435,0.03337435
Shed,0.28755723,0.28755723
