In [1]:
# Move into the project's R/ directory, located through the ROOT environment
# variable, so that later relative paths (e.g. source('main.R')) resolve there.
if (!nzchar(Sys.getenv('ROOT'))) {
    # Sys.getenv() returns "" for an unset variable, which would otherwise
    # silently turn the target into the absolute path '/R'.
    stop('Environment variable ROOT is not set', call. = FALSE)
}
setwd(file.path(Sys.getenv('ROOT'), 'R'))
getwd()

In [16]:
# Load the project's shared setup. The tidyverse verbs used below (tibble,
# mutate, %>%) are presumably attached by this script -- TODO confirm.
source('main.R')

# Toy training set: two features, the target `price`, and a sequential row id.
# tibble() replaces the deprecated data_frame(); row_number() replaces
# 1:nrow(.), which misbehaves on zero-row input.
df.train <- tibble(
    BldgType = c('A', 'A', 'A', 'B', 'B', 'C', 'C'),
    Quality  = c(  1,   1,   2,   1,   3,   3,   2),
    price    = c( 50,  40,  30,  30,  20,  10,  20)
) %>% mutate(id = row_number())

# Toy test set without the target. It contains BldgType 'D' and Quality 4,
# neither of which occurs in df.train.
df.test <- tibble(
    BldgType = c('A', 'A', 'B', 'B', 'C', 'D'),
    Quality  = c(  2,   1,   2,   2,   4,   3)
) %>% mutate(id = row_number())

df.train
df.test

BldgType,Quality,price,id
A,1,50,1
A,1,40,2
A,2,30,3
B,1,30,4
B,3,20,5
C,3,10,6
C,2,20,7


BldgType,Quality,id
A,2,1
A,1,2
B,2,3
B,2,4
C,4,5
D,3,6


In [56]:
# Target-encode the training features: each feature value is replaced by the
# mean price of all training rows that share that value.
df.train.tran <- df.train %>%
    # Long format: one row per (id, feature) pair.
    gather(var.name, var.value, -price, -id) %>%
    group_by(var.name, var.value) %>%
    mutate(avg.price = mean(price)) %>%
    ungroup() %>%
    # Keep only what is needed to go back to one column per feature.
    select(id, var.name, avg.price) %>%
    spread(var.name, avg.price) %>%
    arrange(id)

df.train.tran

# Put the raw feature values (renamed with an old. prefix) and the price next
# to their encodings, matched on id.
Z <- inner_join(
    df.train.tran,
    rename(df.train, old.BldgType = BldgType, old.Quality = Quality),
    by = 'id'
)

id,BldgType,Quality
1,40,40
2,40,40
3,40,25
4,25,40
5,25,15
6,15,15
7,15,25


In [62]:
# Sanity check: recompute the per-level mean price straight from the raw
# columns and compare it with the encodings stored in Z.
Z %>%
    group_by(old.BldgType) %>%
    mutate(avg.BldgType = mean(price)) %>%
    # group_by() replaces the previous grouping, so everything below is
    # evaluated per Quality level.
    group_by(old.Quality) %>%
    mutate(
        avg.Quality = mean(price),
        # Left unnamed on purpose: the resulting columns are labelled with the
        # comparison expressions themselves.
        avg.BldgType == BldgType,
        avg.Quality == Quality
    )

id,BldgType,Quality,old.BldgType,old.Quality,price,avg.BldgType,avg.Quality,avg.BldgType == BldgType,avg.Quality == Quality
1,40,40,A,1,50,40,40,True,True
2,40,40,A,1,40,40,40,True,True
3,40,25,A,2,30,40,25,True,True
4,25,40,B,1,30,25,40,True,True
5,25,15,B,3,20,25,15,True,True
6,15,15,C,3,10,15,15,True,True
7,15,25,C,2,20,15,25,True,True


In [95]:
# Reshape each set to long format (one row per id/feature pair) and tag every
# row with the set it came from.
train.long <- mutate(
    gather(df.train, var.name, var.value, -price, -id),
    src = 'train'
)

test.long <- mutate(
    gather(df.test, var.name, var.value, -id),
    src = 'test'
)

# Stack both sets; test.long has no price column, so its rows get NA there.
long <- bind_rows(train.long, test.long)

head(long)

price,id,var.name,var.value,src
50,1,BldgType,A,train
40,2,BldgType,A,train
30,3,BldgType,A,train
30,4,BldgType,B,train
20,5,BldgType,B,train
10,6,BldgType,C,train


In [94]:
# Encode train and test rows together: each feature value becomes the mean
# price of the rows sharing it (test rows contribute nothing, their price is NA).
long %>%
    arrange(var.name, var.value) %>%
    group_by(var.name, var.value) %>%
    mutate(avg.price = mean(price, na.rm = TRUE)) %>%
    group_by(var.name) %>%
    mutate(
        # Values without any priced row yield NaN above; substitute the mean
        # price over all rows of the same feature.
        avg.price = replace(
            avg.price,
            is.na(avg.price),
            mean(price, na.rm = TRUE)
        )
    ) %>%
    select(-var.value) %>%
    spread(var.name, avg.price)

price,id,src,BldgType,Quality
10.0,6,train,15.0,15.0
20.0,5,train,25.0,15.0
20.0,7,train,15.0,25.0
30.0,3,train,40.0,25.0
30.0,4,train,25.0,40.0
40.0,2,train,40.0,40.0
50.0,1,train,40.0,40.0
,1,test,40.0,25.0
,2,test,40.0,40.0
,3,test,25.0,25.0


In [111]:
#' Target-encode the feature columns of a train/test pair.
#'
#' Every column other than `y.var` and `id.var` is treated as a feature. Each
#' feature value is replaced by `stat.fun` of the target over the rows that
#' share that value. Values with no observed target (for example values that
#' occur only in `testset`) fall back to `stat.fun` of the target over all rows
#' of that feature.
#'
#' NOTE: once defined, this function shadows base R's `transform()`.
#'
#' @param trainset Data frame with the feature columns, `y.var` and `id.var`.
#' @param testset Data frame with the feature columns and `id.var`. It is
#'   expected not to contain `y.var`; any such column would be gathered as a
#'   feature.
#' @param y.var Unquoted name of the target column.
#' @param id.var Unquoted name of the row-id column.
#' @param stat.fun Summary function, called as `stat.fun(x, na.rm = TRUE)`.
#' @return A wide data frame with `y.var` (NA for test rows), `id.var`, `src`
#'   ('train' or 'test') and one encoded column per feature.
transform <- function (trainset, testset, y.var, id.var, stat.fun=mean) {

    y.var <- enquo(y.var)
    id.var <- enquo(id.var)

    # Long format: one row per (id, feature) pair, tagged with its origin.
    train.long <-
        trainset %>%
        gather(var.name, var.value, -!!y.var, -!!id.var) %>%
        mutate(src = 'train')

    test.long <-
        testset %>%
        gather(var.name, var.value, -!!id.var) %>%
        mutate(src = 'test')

    # Test rows have no target column, so bind_rows() fills it with NA.
    long <- bind_rows(train.long, test.long)

    long %>%
        group_by(var.name, var.value) %>%
        arrange(var.name, var.value) %>%
        mutate(
            # Count observed targets explicitly: summaries such as sum, min
            # or max return 0 / Inf rather than NA on empty input, so an NA
            # test alone would miss unseen values.
            n.obs_ = sum(!is.na(!!y.var)),
            avg_ = stat.fun(!!y.var, na.rm = TRUE)
        ) %>%
        group_by(var.name) %>%
        mutate(
            avg_ = ifelse(
                n.obs_ == 0 | is.na(avg_),
                stat.fun(!!y.var, na.rm = TRUE),
                avg_
            )
        ) %>%
        ungroup() %>%
        select(-var.value, -n.obs_) %>%
        spread(var.name, avg_)
}

transform(df.train, df.test, y.var = price, id.var = id)

price,id,src,BldgType,Quality
10.0,6,train,15.0,15.0
20.0,5,train,25.0,15.0
20.0,7,train,15.0,25.0
30.0,3,train,40.0,25.0
30.0,4,train,25.0,40.0
40.0,2,train,40.0,40.0
50.0,1,train,40.0,40.0
,1,test,40.0,25.0
,2,test,40.0,40.0
,3,test,25.0,25.0
