In [3]:
# Working directory before any change.
getwd()
# First execution only: remember the notebook's own directory and move the
# working directory one level up. The guard keeps a re-run of this cell from
# climbing another level.
if (is.null(environment()$this_notebook_dir)) {
    this_notebook_dir <- getwd()
    setwd(paste0(this_notebook_dir, '/..'))
}
# Working directory used by the rest of the notebook.
getwd()

In [8]:
# helpers.R presumably defines the `kaggle.house` object used below
# (confirm in helpers.R).
source('helpers.R')

kaggle.house$loadLibraries()
data <- kaggle.house$loadData()
df.training <- data$train
df.testing <- data$test

# Row counts, then column counts (length() of a data frame is its number of
# columns), each as c(training, testing).
vapply(list(df.training, df.testing), nrow, integer(1))
vapply(list(df.training, df.testing), length, integer(1))

# All later medians are computed on the log of the sale price.
df.training <- mutate(df.training, sale_price_log = log(SalePrice))

In [9]:
# Reload the utility helpers, then call the column-name helper on the training
# set. Judging by its name it lists the character-typed columns, i.e. the
# categorical variables handled below -- confirm in utils.R.
source('./utils.R')
kaggle.house$utils$get_char_columns_names(df.training)

In [11]:
source("./transform_vars.R")

groupAveragingTranFactory <- kaggle.house$trans$groupAveragingTranFactory

# Fixtures are built inside with(list(), ...) so they do not leak into the
# global environment.
with(list(), {

    # Three categories with known medians: a -> 1.5, b -> 4, c -> 6.5.
    df.train <- data_frame(
        categAttr      = c('a', 'a', 'b', 'b', 'b', 'c', 'c'),
        sale_price_log = c(1, 2, 3, 4, 5, 6, 7)
    )
    # Category 'd' never occurs in the training fixture.
    df.test <- data_frame(
        categAttr = c('a', 'a', 'b', 'b', 'c', 'c', 'd', 'd', 'd')
    )

    # Fit on the training fixture; the result carries the transformed training
    # frame ($df.new) and a function for the test set ($testset.transformator).
    fitted <- groupAveragingTranFactory(sale_price_log, categAttr, "categAttr.new")(df.train)

    test_that("train set tranformation", {

        encoded <- fitted$df.new

        spread <- summarise(
            group_by(encoded, categAttr),
            min = min(categAttr.new), max = max(categAttr.new)
        )
        medians <- summarise(
            group_by(df.train, categAttr),
            median = median(sale_price_log)
        )

        # The new column is constant within a category and equals its median.
        expect_true(all(spread[, "min"] == spread[, "max"]))
        expect_true(all(spread[, "min"] == medians[, "median"]))
        expect_equal(
            encoded$categAttr.new,
            c(1.5, 1.5, 4, 4, 4, 6.5, 6.5)
        )
    })

    test_that("test set tranformation", {

        df.test.new <- fitted$testset.transformator(df.test)

        expect_equal(df.test.new$categAttr, df.test$categAttr)
        # The unseen category 'd' is expected to receive 4 -- presumably the
        # median over all training rows; confirm in transform_vars.R.
        expect_equal(
            df.test.new$categAttr.new,
            c(1.5, 1.5, 4, 4, 6.5, 6.5, 4, 4, 4)
        )
    })
})

In [18]:
source("./transform_vars.R")

# Tests for the per-column transformators, run against the real training set.
# Inside with(...), names such as `Alley` resolve to the transformator
# functions; inside filter()/group_by() the same names resolve to data columns.
with(kaggle.house$trans$transformatorContainer, {
    test_that("Alley", {
        df <- Alley(df.training)

        # Flag is 0 exactly when Alley is missing, 1 otherwise.
        expect_equal(df %>% filter(is.na(Alley) & has_alley_access != 0) %>% nrow, 0)
        expect_equal(df %>% filter(!is.na(Alley) & has_alley_access != 1) %>% nrow, 0)
    })

    test_that("CentralAir", {
        df <- CentralAir(df.training)

        expect_equal(df %>% filter(CentralAir == 'Y' & has_central_air != 1) %>% nrow, 0)
        expect_equal(df %>% filter(CentralAir != 'Y' & has_central_air != 0) %>% nrow, 0)
    })

    test_that("Electrical", {
        df <- Electrical(df.training)

        expect_equal(df %>% filter(Electrical == 'SBrkr' & standard_electrical != 1) %>% nrow, 0)
        expect_equal(df %>% filter(Electrical != 'SBrkr' & standard_electrical != 0) %>% nrow, 0)
    })

    test_that("Functional", {
        df <- Functional(df.training)

        expect_equal(df %>% filter(Functional == 'Typ' & is_full_functional != 1) %>% nrow, 0)
        expect_equal(df %>% filter(Functional != 'Typ' & is_full_functional != 0) %>% nrow, 0)
    })

    test_that("BldgType", {
        res <- BldgType(df.training)

        by_type <- res$df.new %>% group_by(BldgType) %>%
            summarise(min = min(building_type), max = max(building_type), avg = median(sale_price_log))
        print(by_type)

        # The encoded column must be constant within a building type and equal
        # to that type's median log price.
        expect_equal(by_type$min, by_type$max)
        expect_equal(by_type$min, by_type$avg)

        # Assumes the test-set transformator is row-preserving, as the
        # toy-data test of groupAveragingTranFactory suggests -- confirm in
        # transform_vars.R.
        df.test.new <- res$testset.transformator(df.testing)
        expect_equal(nrow(df.test.new), nrow(df.testing))
    })
})


# A tibble: 5 x 4
  BldgType      min      max      avg
     <chr>    <dbl>    <dbl>    <dbl>
1     1Fam 12.03112 12.03112 12.03112
2   2fmCon 11.75587 11.75587 11.75587
3   Duplex 11.82026 11.82026 11.82026
4    Twnhs 11.83138 11.83138 11.83138
5   TwnhsE 12.05641 12.05641 12.05641


In [11]:
# Wrap a data-frame transformation so that it can also drop its source column.
#
# trans:     function(df) returning the transformed data frame.
# attr_name: name of the column to drop once `trans` has been applied.
#
# Returns function(df, remove = TRUE): applies `trans` to `df` and, when
# `remove` is TRUE, deletes column `attr_name` from the result.
removify <- function(trans, attr_name) {
    # Evaluate the arguments now, so the closure does not depend on promises
    # that are only resolved at its first call.
    force(trans)
    force(attr_name)

    function(df, remove = TRUE) {
        df <- trans(df)
        if (remove) {
            df[, attr_name] <- NULL
        }
        df
    }
}

In [12]:
# Registry of transformation functions; the cells below add one entry per
# handled column (or column pair). Note the name is shared with the base R
# function transform().
transform <- list()

In [13]:
# has_alley_access: 1 when Alley is recorded, 0 when it is NA.
transform$Alley <- removify(
    function(df) mutate(df, has_alley_access = ifelse(!is.na(Alley), 1, 0)),
    "Alley"
)

# Category counts, then the flag cross-tabulated against the original column
# (kept by remove = FALSE).
count(group_by(training, Alley))
tranformed <- transform$Alley(training, remove = FALSE)
count(group_by(tranformed, has_alley_access, Alley))

Alley,n
Grvl,50
Pave,41
,1369


has_alley_access,Alley,n
0,,1369
1,Grvl,50
1,Pave,41


In [14]:
# has_central_air: 1 when CentralAir is 'Y', 0 otherwise.
transform$CentralAir <- removify(
    function(df) mutate(df, has_central_air = ifelse(CentralAir == 'Y', 1, 0)),
    "CentralAir"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, CentralAir))
tranformed <- transform$CentralAir(training, remove = FALSE)
count(group_by(tranformed, has_central_air, CentralAir))

CentralAir,n
N,95
Y,1365


has_central_air,CentralAir,n
0,N,95
1,Y,1365


In [88]:
# standard_electrical: 1 for 'SBrkr'; a missing value is also counted as
# standard.
transform$Electrical <- removify(
    function(df) {
        mutate(
            df,
            standard_electrical = ifelse(Electrical == 'SBrkr' | is.na(Electrical), 1, 0)
        )
    },
    "Electrical"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, Electrical))
tranformed <- transform$Electrical(training, remove = FALSE)
count(group_by(tranformed, standard_electrical, Electrical))

Electrical,n
FuseA,94
FuseF,27
FuseP,3
Mix,1
SBrkr,1334
,1


standard_electrical,Electrical,n
0,FuseA,94
0,FuseF,27
0,FuseP,3
0,Mix,1
1,SBrkr,1334
1,,1


In [139]:
# Size and median log price of every Functional category.
summarise(
    group_by(training, Functional),
    n = n(), avg_price = median(sale_price_log)
)

# is_full_functional: 1 when Functional is 'Typ', 0 otherwise.
transform$Functional <- removify(
    function(df) mutate(df, is_full_functional = ifelse(Functional == 'Typ', 1, 0)),
    "Functional"
)

# Flag cross-tabulated against the original column.
tranformed <- transform$Functional(training, remove = FALSE)
count(group_by(tranformed, is_full_functional, Functional))

Functional,n,avg_price
Maj1,14,11.85447
Maj2,5,11.35041
Min1,31,11.84223
Min2,34,11.8494
Mod,15,11.83428
Sev,1,11.76757
Typ,1360,12.01673


is_full_functional,Functional,n
0,Maj1,14
0,Maj2,5
0,Min1,31
0,Min2,34
0,Mod,15
0,Sev,1
1,Typ,1360


In [15]:
# heating_air_furnace: 1 when Heating is 'GasA', 0 otherwise.
transform$Heating <- removify(
    function(df) mutate(df, heating_air_furnace = ifelse(Heating == 'GasA', 1, 0)),
    "Heating"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, Heating))
tranformed <- transform$Heating(training, remove = FALSE)
count(group_by(tranformed, heating_air_furnace, Heating))

Heating,n
Floor,1
GasA,1428
GasW,18
Grav,7
OthW,2
Wall,4


heating_air_furnace,Heating,n
0,Floor,1
0,GasW,18
0,Grav,7
0,OthW,2
0,Wall,4
1,GasA,1428


In [16]:
# is_land_level: 1 when LandContour is 'Lvl', 0 otherwise.
transform$LandContour <- removify(
    function(df) mutate(df, is_land_level = ifelse(LandContour == 'Lvl', 1, 0)),
    "LandContour"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, LandContour))
tranformed <- transform$LandContour(training, remove = FALSE)
count(group_by(tranformed, is_land_level, LandContour))

LandContour,n
Bnk,63
HLS,50
Low,36
Lvl,1311


is_land_level,LandContour,n
0,Bnk,63
0,HLS,50
0,Low,36
1,Lvl,1311


In [161]:
# Size and median log price of every LandSlope category.
summarise(
    group_by(training, LandSlope),
    n = n(), avg_price = median(sale_price_log)
)

# is_slope: 1 for every slope other than 'Gtl', 0 for 'Gtl'.
transform$LandSlope <- removify(
    function(df) mutate(df, is_slope = ifelse(LandSlope != 'Gtl', 1, 0)),
    "LandSlope"
)

# Flag cross-tabulated against the original column.
tranformed <- transform$LandSlope(training, remove = FALSE)
count(group_by(tranformed, is_slope, LandSlope))

LandSlope,n,avg_price
Gtl,1382,11.99458
Mod,65,12.13726
Sev,13,12.12811


is_slope,LandSlope,n
0,Gtl,1382
1,Mod,65
1,Sev,13


In [17]:
# is_lotshape_regular: 1 when LotShape is 'Reg', 0 otherwise.
transform$LotShape <- removify(
    function(df) mutate(df, is_lotshape_regular = ifelse(LotShape == 'Reg', 1, 0)),
    "LotShape"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, LotShape))
tranformed <- transform$LotShape(training, remove = FALSE)
count(group_by(tranformed, is_lotshape_regular, LotShape))

LotShape,n
IR1,484
IR2,41
IR3,10
Reg,925


is_lotshape_regular,LotShape,n
0,IR1,484
0,IR2,41
0,IR3,10
1,Reg,925


In [18]:
# has_misc_feature: 1 when MiscFeature is recorded, 0 when it is NA.
transform$MiscFeature <- removify(
    function(df) mutate(df, has_misc_feature = ifelse(!is.na(MiscFeature), 1, 0)),
    "MiscFeature"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, MiscFeature))
tranformed <- transform$MiscFeature(training, remove = FALSE)
count(group_by(tranformed, has_misc_feature, MiscFeature))

MiscFeature,n
Gar2,2
Othr,2
Shed,49
TenC,1
,1406


has_misc_feature,MiscFeature,n
0,,1406
1,Gar2,2
1,Othr,2
1,Shed,49
1,TenC,1


In [19]:
# has_paved_drive: 1 when PavedDrive is 'Y', 0 otherwise (both 'N' and 'P').
transform$PavedDrive <- removify(
    function(df) mutate(df, has_paved_drive = ifelse(PavedDrive == 'Y', 1, 0)),
    "PavedDrive"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, PavedDrive))
tranformed <- transform$PavedDrive(training, remove = FALSE)
count(group_by(tranformed, has_paved_drive, PavedDrive))

PavedDrive,n
N,90
P,30
Y,1340


has_paved_drive,PavedDrive,n
0,N,90
0,P,30
1,Y,1340


In [20]:
# has_pool: 1 when PoolQC is recorded, 0 when it is NA.
transform$PoolQC <- removify(
    function(df) mutate(df, has_pool = ifelse(!is.na(PoolQC), 1, 0)),
    "PoolQC"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, PoolQC))
tranformed <- transform$PoolQC(training, remove = FALSE)
count(group_by(tranformed, has_pool, PoolQC))

PoolQC,n
Ex,2
Fa,2
Gd,3
,1453


has_pool,PoolQC,n
0,,1453
1,Ex,2
1,Fa,2
1,Gd,3


In [21]:
# standard_roof_material: 1 when RoofMatl is 'CompShg', 0 otherwise.
transform$RoofMatl <- removify(
    function(df) {
        mutate(df, standard_roof_material = ifelse(RoofMatl == 'CompShg', 1, 0))
    },
    "RoofMatl"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, RoofMatl))
tranformed <- transform$RoofMatl(training, remove = FALSE)
count(group_by(tranformed, standard_roof_material, RoofMatl))

RoofMatl,n
ClyTile,1
CompShg,1434
Membran,1
Metal,1
Roll,1
Tar&Grv,11
WdShake,5
WdShngl,6


standard_roof_material,RoofMatl,n
0,ClyTile,1
0,Membran,1
0,Metal,1
0,Roll,1
0,Tar&Grv,11
0,WdShake,5
0,WdShngl,6
1,CompShg,1434


In [22]:
# is_street_paved: 1 when Street is 'Pave', 0 otherwise.
transform$Street <- removify(
    function(df) mutate(df, is_street_paved = ifelse(Street == 'Pave', 1, 0)),
    "Street"
)

# Category counts, then the flag cross-tabulated against the original column.
count(group_by(training, Street))
tranformed <- transform$Street(training, remove = FALSE)
count(group_by(tranformed, is_street_paved, Street))

Street,n
Grvl,6
Pave,1454


is_street_paved,Street,n
0,Grvl,6
1,Pave,1454


In [57]:
# Prototype of the median encoding: group `df` by the bare column `attr_name`
# and add a column named by the string `new_attr_name` holding the group's
# median sale_price_log. The result stays grouped by `attr_name`.
xxx <- function(df, attr_name, new_attr_name) {
    grouping <- enquo(attr_name)
    by_category <- group_by(df, !!grouping)
    mutate(by_category, !!new_attr_name := median(sale_price_log))
}

df <- xxx(training, BldgType, "building_type")
# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(df, BldgType),
    min(building_type), max(building_type), median(sale_price_log)
)

BldgType,min(building_type),max(building_type),median(sale_price_log)
1Fam,12.03112,12.03112,12.03112
2fmCon,11.75587,11.75587,11.75587
Duplex,11.82026,11.82026,11.82026
Twnhs,11.83138,11.83138,11.83138
TwnhsE,12.05641,12.05641,12.05641


In [60]:
# Build a transformation that encodes a categorical column by the median
# sale_price_log of each of its categories.
#
# attr_name:     bare (unquoted) name of the categorical column.
# new_attr_name: string, name of the numeric column to create.
#
# Returns function(df, remove = TRUE): adds `new_attr_name` to `df` and, when
# `remove` is TRUE, drops the original categorical column. The result is
# returned ungrouped.
group_averaging_tran <- function(attr_name, new_attr_name) {

    attr_quo <- enquo(attr_name)
    # Column name as a string; a quosure cannot be used to index a data frame.
    attr_col <- quo_name(attr_quo)
    force(new_attr_name)

    function(df, remove = TRUE) {
        df.new <- df %>%
            group_by(!!attr_quo) %>%
            mutate(!!new_attr_name := median(sale_price_log)) %>%
            ungroup()
        if (remove) {
            df.new[[attr_col]] <- NULL
        }
        df.new
    }
}

In [61]:
# Encode BldgType by the median log price of each category.
transform$BldgType <- group_averaging_tran(BldgType, "building_type")

count(group_by(training, BldgType))
tranformed <- transform$BldgType(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, BldgType),
    min(building_type), max(building_type), median(sale_price_log)
)

BldgType,n
1Fam,1220
2fmCon,31
Duplex,52
Twnhs,43
TwnhsE,114


BldgType,min(building_type),max(building_type),median(sale_price_log)
1Fam,12.03112,12.03112,12.03112
2fmCon,11.75587,11.75587,11.75587
Duplex,11.82026,11.82026,11.82026
Twnhs,11.83138,11.83138,11.83138
TwnhsE,12.05641,12.05641,12.05641


In [63]:
# Encode BsmtCond by the median log price of each category.
transform$BsmtCond <- group_averaging_tran(BsmtCond, "basement_condition")

count(group_by(training, BsmtCond))
tranformed <- transform$BsmtCond(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, BsmtCond),
    min(basement_condition), max(basement_condition), median(sale_price_log)
)

BsmtCond,n
Fa,45
Gd,65
Po,2
TA,1311
,37


BsmtCond,min(basement_condition),max(basement_condition),median(sale_price_log)
Fa,11.68267,11.68267,11.68267
Gd,12.17499,12.17499,12.17499
Po,11.06554,11.06554,11.06554
TA,12.0137,12.0137,12.0137
,11.53077,11.53077,11.53077


In [65]:
# Encode BsmtExposure by the median log price of each category.
transform$BsmtExposure <- group_averaging_tran(BsmtExposure, "basement_exposure")

count(group_by(training, BsmtExposure))
tranformed <- transform$BsmtExposure(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, BsmtExposure),
    min(basement_exposure), max(basement_exposure), median(sale_price_log)
)

BsmtExposure,n
Av,221
Gd,134
Mn,114
No,953
,38


BsmtExposure,min(basement_exposure),max(basement_exposure),median(sale_price_log)
Av,12.1327,12.1327,12.1327
Gd,12.33256,12.33256,12.33256
Mn,12.11413,12.11413,12.11413
No,11.94471,11.94471,11.94471
,11.55216,11.55216,11.55216


In [68]:
# Encode BsmtFinType1 by the median log price of each category.
transform$BsmtFinType1 <- group_averaging_tran(BsmtFinType1, "basement_finish1")

count(group_by(training, BsmtFinType1))
tranformed <- transform$BsmtFinType1(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, BsmtFinType1),
    min(basement_finish1), max(basement_finish1), median(sale_price_log)
)

BsmtFinType1,n
ALQ,220
BLQ,148
GLQ,418
LwQ,74
Rec,133
Unf,430
,37


BsmtFinType1,min(basement_finish1),max(basement_finish1),median(sale_price_log)
ALQ,11.91338,11.91338,11.91338
BLQ,11.84295,11.84295,11.84295
GLQ,12.27256,12.27256,12.27256
LwQ,11.84223,11.84223,11.84223
Rec,11.86358,11.86358,11.86358
Unf,11.99381,11.99381,11.99381
,11.53077,11.53077,11.53077


In [71]:
# Encode BsmtFinType2 by the median log price of each category.
transform$BsmtFinType2 <- group_averaging_tran(BsmtFinType2, "basement_finish2")

count(group_by(training, BsmtFinType2))
tranformed <- transform$BsmtFinType2(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, BsmtFinType2),
    min(basement_finish2), max(basement_finish2), median(sale_price_log)
)

BsmtFinType2,n
ALQ,19
BLQ,33
GLQ,14
LwQ,46
Rec,54
Unf,1256
,38


BsmtFinType2,min(basement_finish2),max(basement_finish2),median(sale_price_log)
ALQ,12.07197,12.07197,12.07197
BLQ,11.8706,11.8706,11.8706
GLQ,12.22033,12.22033,12.22033
LwQ,11.94469,11.94469,11.94469
Rec,11.90995,11.90995,11.90995
Unf,12.02575,12.02575,12.02575
,11.55216,11.55216,11.55216


In [73]:
# Encode BsmtQual by the median log price of each category.
transform$BsmtQual <- group_averaging_tran(BsmtQual, "basement_height_quality")

count(group_by(training, BsmtQual))
tranformed <- transform$BsmtQual(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, BsmtQual),
    min(basement_height_quality), max(basement_height_quality), median(sale_price_log)
)

BsmtQual,n
Ex,121
Fa,35
Gd,618
TA,649
,37


BsmtQual,min(basement_height_quality),max(basement_height_quality),median(sale_price_log)
Ex,12.66981,12.66981,12.66981
Fa,11.62625,11.62625,11.62625
Gd,12.16562,12.16562,12.16562
TA,11.81673,11.81673,11.81673
,11.53077,11.53077,11.53077


In [130]:
# Size and median log price of every Fence category.
summarise(
    group_by(training, Fence),
    n = n(), avg_price = median(sale_price_log)
)

# Encode Fence by the median log price of each category.
transform$Fence <- group_averaging_tran(Fence, "fence")

tranformed <- transform$Fence(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, Fence),
    min(fence), max(fence), median(sale_price_log)
)

Fence,n,avg_price
GdPrv,59,12.02874
GdWo,54,11.84043
MnPrv,157,11.83102
MnWw,11,11.77529
,1179,12.06105


Fence,min(fence),max(fence),median(sale_price_log)
GdPrv,12.02874,12.02874,12.02874
GdWo,11.84043,11.84043,11.84043
MnPrv,11.83102,11.83102,11.83102
MnWw,11.77529,11.77529,11.77529
,12.06105,12.06105,12.06105


In [131]:
# Size and median log price of every FireplaceQu category.
summarise(
    group_by(training, FireplaceQu),
    n = n(), avg_price = median(sale_price_log)
)

# Encode FireplaceQu by the median log price of each category.
transform$FireplaceQu <- group_averaging_tran(FireplaceQu, "fireplace_qual")

tranformed <- transform$FireplaceQu(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, FireplaceQu),
    min(fireplace_qual), max(fireplace_qual), median(sale_price_log)
)

FireplaceQu,n,avg_price
Ex,24,12.65794
Fa,33,11.97035
Gd,380,12.24023
Po,20,11.78658
TA,313,12.14153
,690,11.81303


FireplaceQu,min(fireplace_qual),max(fireplace_qual),median(sale_price_log)
Ex,12.65794,12.65794,12.65794
Fa,11.97035,11.97035,11.97035
Gd,12.24023,12.24023,12.24023
Po,11.78658,11.78658,11.78658
TA,12.14153,12.14153,12.14153
,11.81303,11.81303,11.81303


In [135]:
# Size and median log price of every Foundation category.
summarise(
    group_by(training, Foundation),
    n = n(), avg_price = median(sale_price_log)
)

# Encode Foundation by the median log price of each category.
transform$Foundation <- group_averaging_tran(Foundation, "foundation")

tranformed <- transform$Foundation(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, Foundation),
    min(foundation), max(foundation), median(sale_price_log)
)

Foundation,n,avg_price
BrkTil,146,11.73807
CBlock,634,11.86005
PConc,647,12.23077
Slab,24,11.55333
Stone,6,11.74454
Wood,3,12.00762


Foundation,min(foundation),max(foundation),median(sale_price_log)
BrkTil,11.73807,11.73807,11.73807
CBlock,11.86005,11.86005,11.86005
PConc,12.23077,12.23077,12.23077
Slab,11.55333,11.55333,11.55333
Stone,11.74454,11.74454,11.74454
Wood,12.00762,12.00762,12.00762


In [145]:
# Size and median log price of every GarageFinish category.
summarise(
    group_by(training, GarageFinish),
    n = n(), avg_price = median(sale_price_log)
)

# Encode GarageFinish by the median log price of each category.
transform$GarageFinish <- group_averaging_tran(GarageFinish, "garage_finish")

tranformed <- transform$GarageFinish(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, GarageFinish),
    min(garage_finish), max(garage_finish), median(sale_price_log)
)

GarageFinish,n,avg_price
Fin,352,12.27839
RFn,422,12.15478
Unf,605,11.81303
,81,11.51293


GarageFinish,min(garage_finish),max(garage_finish),median(sale_price_log)
Fin,12.27839,12.27839,12.27839
RFn,12.15478,12.15478,12.15478
Unf,11.81303,11.81303,11.81303
,11.51293,11.51293,11.51293


In [148]:
# Size and median log price of every GarageType category.
summarise(
    group_by(training, GarageType),
    n = n(), avg_price = median(sale_price_log)
)

# Encode GarageType by the median log price of each category.
transform$GarageType <- group_averaging_tran(GarageType, "garage_type")

tranformed <- transform$GarageType(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, GarageType),
    min(garage_type), max(garage_type), median(sale_price_log)
)

GarageType,n,avg_price
2Types,6,11.97505
Attchd,870,12.12811
Basment,19,11.90497
BuiltIn,88,12.33485
CarPort,9,11.58989
Detchd,387,11.77144
,81,11.51293


GarageType,min(garage_type),max(garage_type),median(sale_price_log)
2Types,11.97505,11.97505,11.97505
Attchd,12.12811,12.12811,12.12811
Basment,11.90497,11.90497,11.90497
BuiltIn,12.33485,12.33485,12.33485
CarPort,11.58989,11.58989,11.58989
Detchd,11.77144,11.77144,11.77144
,11.51293,11.51293,11.51293


In [151]:
# Size and median log price of every HeatingQC category.
summarise(
    group_by(training, HeatingQC),
    n = n(), avg_price = median(sale_price_log)
)

# Encode HeatingQC by the median log price of each category.
transform$HeatingQC <- group_averaging_tran(HeatingQC, "heating_quality")

tranformed <- transform$HeatingQC(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, HeatingQC),
    min(heating_quality), max(heating_quality), median(sale_price_log)
)

HeatingQC,n,avg_price
Ex,741,12.17922
Fa,49,11.724
Gd,241,11.93164
Po,1,11.37366
TA,428,11.81303


HeatingQC,min(heating_quality),max(heating_quality),median(sale_price_log)
Ex,12.17922,12.17922,12.17922
Fa,11.724,11.724,11.724
Gd,11.93164,11.93164,11.93164
Po,11.37366,11.37366,11.37366
TA,11.81303,11.81303,11.81303


In [154]:
# Size and median log price of every HouseStyle category.
summarise(
    group_by(training, HouseStyle),
    n = n(), avg_price = median(sale_price_log)
)

# Encode HouseStyle by the median log price of each category.
transform$HouseStyle <- group_averaging_tran(HouseStyle, "house_style")

tranformed <- transform$HouseStyle(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, HouseStyle),
    min(house_style), max(house_style), median(sale_price_log)
)

HouseStyle,n,avg_price
1.5Fin,154,11.79055
1.5Unf,14,11.61941
1Story,726,11.94956
2.5Fin,8,12.17549
2.5Unf,11,11.80485
2Story,445,12.15478
SFoyer,37,11.82012
SLvl,65,12.01067


HouseStyle,min(house_style),max(house_style),median(sale_price_log)
1.5Fin,11.79055,11.79055,11.79055
1.5Unf,11.61941,11.61941,11.61941
1Story,11.94956,11.94956,11.94956
2.5Fin,12.17549,12.17549,12.17549
2.5Unf,11.80485,11.80485,11.80485
2Story,12.15478,12.15478,12.15478
SFoyer,11.82012,11.82012,11.82012
SLvl,12.01067,12.01067,12.01067


In [157]:
# Size and median log price of every KitchenQual category.
summarise(
    group_by(training, KitchenQual),
    n = n(), avg_price = median(sale_price_log)
)

# Encode KitchenQual by the median log price of each category.
transform$KitchenQual <- group_averaging_tran(KitchenQual, "kitchen_qual")

tranformed <- transform$KitchenQual(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, KitchenQual),
    min(kitchen_qual), max(kitchen_qual), median(sale_price_log)
)

KitchenQual,n,avg_price
Ex,100,12.66586
Fa,39,11.65269
Gd,586,12.21305
TA,735,11.82774


KitchenQual,min(kitchen_qual),max(kitchen_qual),median(sale_price_log)
Ex,12.66586,12.66586,12.66586
Fa,11.65269,11.65269,11.65269
Gd,12.21305,12.21305,12.21305
TA,11.82774,11.82774,11.82774


In [164]:
# Size and median log price of every LotConfig category.
summarise(
    group_by(training, LotConfig),
    n = n(), avg_price = median(sale_price_log)
)

# Encode LotConfig by the median log price of each category.
transform$LotConfig <- group_averaging_tran(LotConfig, "lot_config")

tranformed <- transform$LotConfig(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, LotConfig),
    min(lot_config), max(lot_config), median(sale_price_log)
)

LotConfig,n,avg_price
Corner,263,11.98293
CulDSac,94,12.20235
FR2,47,12.0137
FR3,4,12.18131
Inside,1052,11.98104


LotConfig,min(lot_config),max(lot_config),median(sale_price_log)
Corner,11.98293,11.98293,11.98293
CulDSac,12.20235,12.20235,12.20235
FR2,12.0137,12.0137,12.0137
FR3,12.18131,12.18131,12.18131
Inside,11.98104,11.98104,11.98104


In [165]:
# Names in char_vars_names that have no entry in `transform` yet, sorted.
sort(setdiff(char_vars_names, names(transform)))

In [167]:
# Size and median log price of every MasVnrType category.
summarise(
    group_by(training, MasVnrType),
    n = n(), avg_price = median(sale_price_log)
)

# Encode MasVnrType by the median log price of each category.
transform$MasVnrType <- group_averaging_tran(MasVnrType, "masonry_veneer_type")

tranformed <- transform$MasVnrType(training, remove = FALSE)

# Sanity check: the new column is constant per category and equals its median.
summarise(
    group_by(tranformed, MasVnrType),
    min(masonry_veneer_type), max(masonry_veneer_type), median(sale_price_log)
)

MasVnrType,n,avg_price
BrkCmn,15,11.84223
BrkFace,445,12.10625
,864,11.8706
Stone,128,12.41647
,8,12.22229


MasVnrType,min(masonry_veneer_type),max(masonry_veneer_type),median(sale_price_log)
BrkCmn,11.84223,11.84223,11.84223
BrkFace,12.10625,12.10625,12.10625
,11.8706,11.8706,11.8706
Stone,12.41647,12.41647,12.41647
,12.22229,12.22229,12.22229


In [106]:
# Collapse Condition1/Condition2 into three binary proximity flags. A flag is
# 1 when either of the two columns holds one of the listed codes.
#
# df: data frame with columns Condition1 and Condition2.
# Returns `df` with adjacent_railroad, adjacent_traffic_street and
# adjacent_positive added; the source columns are kept.
transform$Condition1.Condition2 <- function (df) {
    railroad_codes <- c('RRAe', 'RRAn', 'RRNe', 'RRNn')
    traffic_codes  <- c('Artery', 'Feedr')
    positive_codes <- c('PosA', 'PosN')

    # Operate on the argument, not on the global training set, so the same
    # function can be applied to any data frame.
    df %>% mutate(
        adjacent_railroad = ifelse(
            Condition1 %in% railroad_codes | Condition2 %in% railroad_codes,
            1, 0),

        adjacent_traffic_street = ifelse(
            Condition1 %in% traffic_codes | Condition2 %in% traffic_codes,
            1, 0),

        adjacent_positive = ifelse(
            Condition1 %in% positive_codes | Condition2 %in% positive_codes,
            1, 0)
    )
}

tranformed <- transform$Condition1.Condition2(training)

# Median log price and size of each category, per source column.
training %>% group_by(Condition1) %>% summarise(avg = median(sale_price_log), n = n())
training %>% group_by(Condition2) %>% summarise(avg = median(sale_price_log), n = n())

# Cross-tabulate the three flags against the original columns.
tranformed %>%
    group_by(adjacent_railroad, adjacent_traffic_street, adjacent_positive, Condition1, Condition2) %>%
    count

Condition1,avg,n
Artery,11.69149,48
Feedr,11.8494,81
Norm,12.02275,1260
PosA,12.26106,8
PosN,12.20607,19
RRAe,11.8671,11
RRAn,12.05227,26
RRNe,12.15853,2
RRNn,12.27373,5


Condition2,avg,n
Artery,11.57004,2
Feedr,11.75586,6
Norm,12.00457,1445
PosA,12.69158,1
PosN,12.49388,2
RRAe,12.15478,1
RRAn,11.82704,1
RRNn,11.43533,2


adjacent_railroad,adjacent_traffic_street,adjacent_positive,Condition1,Condition2,n
0,0,0,Norm,Norm,1260
0,0,1,PosA,Norm,8
0,0,1,PosN,Norm,17
0,0,1,PosN,PosN,2
0,1,0,Artery,Artery,2
0,1,0,Artery,Norm,45
0,1,0,Feedr,Feedr,1
0,1,0,Feedr,Norm,76
0,1,1,Artery,PosA,1
1,0,0,RRAe,Norm,11


In [121]:
# Median log price and size of every ExterQual x ExterCond combination.
training %>% group_by(ExterQual, ExterCond) %>% summarise(avg = median(sale_price_log), n = n())

# Combine ExterQual and ExterCond into a single label, ExterQualCond, on the
# same Ex/Gd/TA/Fa/Po scale:
#   * 'Ex' or 'Gd' quality in 'Fa' condition is downgraded by one level;
#   * 'TA' and 'Fa' quality keep their level unless the condition is 'Po';
#   * everything else (including rows where a comparison yields NA) is 'Po'.
# The downgraded rows use the regular codes 'Gd' and 'TA' so that they join
# the existing categories instead of forming separate ones.
step1 <- function (df) {
    df %>% mutate(
        ExterQualCond = case_when(

            ExterQual == 'Ex' & ExterCond %in% c('Ex', 'Gd', 'TA') ~ 'Ex',
            ExterQual == 'Ex' & ExterCond == 'Fa' ~ 'Gd',

            ExterQual == 'Gd' & ExterCond %in% c('Ex', 'Gd', 'TA') ~ 'Gd',
            ExterQual == 'Gd' & ExterCond == 'Fa' ~ 'TA',

            ExterQual == 'TA' & ExterCond != 'Po' ~ 'TA',
            ExterQual == 'Fa' & ExterCond != 'Po' ~ 'Fa',
            TRUE ~ 'Po'
        )
    )
}

# Median encoding of the combined label produced by step1.
step2 <- group_averaging_tran(ExterQualCond, "exterior_qual_cond")

# Encode exterior quality/condition as a single numeric column.
#
# df:     data frame with ExterQual, ExterCond and sale_price_log.
# remove: when TRUE (default), drop ExterQual, ExterCond and the intermediate
#         ExterQualCond label, leaving only exterior_qual_cond.
# Returns the transformed, ungrouped data frame.
transform$ExterQual.ExterCond <- function(df, remove = TRUE) {
    # step2 always keeps its input column here; all column removal happens in
    # one place below.
    df.new <- ungroup(step2(step1(df), remove = FALSE))
    if (remove) {
        df.new$ExterQual <- NULL
        df.new$ExterCond <- NULL
        df.new$ExterQualCond <- NULL
    }
    df.new
}

# Apply the transformation before inspecting it; without this the check would
# read whatever `tranformed` was left over from an earlier cell.
tranformed <- transform$ExterQual.ExterCond(training, remove = FALSE)

# Sanity check: the new column is constant per label and equals its median.
tranformed %>% group_by(ExterQualCond) %>%
    summarise(min(exterior_qual_cond), max(exterior_qual_cond), median(sale_price_log))

ExterQual,ExterCond,avg,n
Ex,Ex,12.69158,1
Ex,Gd,12.49313,3
Ex,TA,12.83248,48
Fa,Fa,11.01036,6
Fa,TA,11.50104,8
Gd,Gd,12.25486,35
Gd,TA,12.30592,453
TA,Ex,11.8338,2
TA,Fa,11.68462,22
TA,Gd,11.8494,108


ExterQualCond,min(exterior_qual_cond),max(exterior_qual_cond),median(sale_price_log)
Ex,12.80655,12.80655,12.80655
Fa,11.31751,11.31751,11.31751
Gd,12.30138,12.30138,12.30138
Po,11.24505,11.24505,11.24505
TA,11.84582,11.84582,11.84582
