In [1]:
# Show the starting working directory, then step up to the parent directory.
getwd()
# `this_notebook_dir` doubles as an "already ran" marker: once it is set,
# re-executing this cell does not climb another directory level.
if (is.null(get0("this_notebook_dir", envir = environment(), inherits = FALSE))) {
  this_notebook_dir <- getwd()
  setwd(file.path(getwd(), ".."))
}
getwd()

In [4]:
# helpers.R is expected to define the kaggle.house.* functions called below
# (they are not defined anywhere in this notebook).
source("helpers.R")

kaggle.house.loadLibraries()
data <- kaggle.house.loadData()

# Modelling frame for LotFrontage: drop the dataSource column, keep only rows
# where LotFrontage is observed, and add log-scale frontage and lot area.
df.LotFrontage <- data$combined %>%
  select(-dataSource) %>%
  filter(!is.na(LotFrontage)) %>%
  mutate(
    LotFrontage.log = log(LotFrontage),
    LotArea.log = log(LotArea)
  )

dim(df.LotFrontage)

# ~ LotArea.log

In [8]:
# Baseline: log frontage explained by log lot area alone.
model <- df.LotFrontage %>%
  lm(LotFrontage.log ~ LotArea.log, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.33420 -0.12683  0.02732  0.13909  1.25960 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.816932   0.085101    -9.6   <2e-16 ***
LotArea.log  0.551207   0.009374    58.8   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2299 on 2431 degrees of freedom
Multiple R-squared:  0.5872,	Adjusted R-squared:  0.587 
F-statistic:  3458 on 1 and 2431 DF,  p-value: < 2.2e-16


# ~ LotArea.log + MSSubClass

In [9]:
# Add MSSubClass with all of its original levels as a second predictor.
model <- df.LotFrontage %>%
  select(LotFrontage.log, LotArea.log, MSSubClass) %>%
  lm(LotFrontage.log ~ LotArea.log + MSSubClass, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log + MSSubClass, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.02965 -0.09845  0.00116  0.10641  1.14375 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)    0.87696    0.10829   8.098 8.75e-16 ***
LotArea.log    0.33748    0.01253  26.935  < 2e-16 ***
MSSubClass160 -0.24130    0.02740  -8.806  < 2e-16 ***
MSSubClass180 -0.24097    0.05546  -4.345 1.45e-05 ***
MSSubClass190  0.22774    0.03308   6.884 7.39e-12 ***
MSSubClass20   0.32685    0.02032  16.082  < 2e-16 ***
MSSubClass30   0.20083    0.02545   7.892 4.48e-15 ***
MSSubClass40   0.04072    0.09436   0.431   0.6661    
MSSubClass45   0.15438    0.05178   2.981   0.0029 ** 
MSSubClass50   0.19014    0.02214   8.589  < 2e-16 ***
MSSubClass60   0.32849    0.02171  15.128  < 2e-16 ***
MSSubClass70   0.19911    0.02635   7.556 5.86e-14 ***
MSSubClass75   0.27565    0.04897   5.629 2.02e-08 ***
MSSubClass80   0.34972    0.02940  11

## merging levels based on frequency

In [10]:
# Per-level row count, median log frontage and share of rows (in percent),
# listed from the rarest MSSubClass level to the most common one.
df.LotFrontage %>%
  group_by(MSSubClass) %>%
  summarise(n = n(), median = median(LotFrontage.log)) %>%
  mutate(freq = n / sum(n) * 100) %>%
  arrange(n)

MSSubClass,n,median,freq
40,5,4.007333,0.2055076
180,16,3.044522,0.6576243
45,18,4.007333,0.7398274
75,21,4.174387,0.8631319
85,32,4.276666,1.3152487
190,57,4.094345,2.3427867
80,85,4.356709,3.4936293
90,92,4.248495,3.7813399
160,116,3.178054,4.7677764
70,117,4.094345,4.8088779


In [14]:
# Collapse MSSubClass into three buckets: '20' on its own (A), '60' and '50'
# together (B), and every remaining level (C).
df <- df.LotFrontage %>%
  mutate(
    MSSubClass2 = case_when(
      MSSubClass == "20" ~ "A",
      MSSubClass %in% c("60", "50") ~ "B",
      TRUE ~ "C"
    )
  )

# Size and median log frontage of each merged bucket.
df %>%
  select(MSSubClass2, MSSubClass, LotFrontage.log) %>%
  group_by(MSSubClass2) %>%
  summarise(n = n(), median = median(LotFrontage.log))

# Refit with the three-level bucket in place of the raw MSSubClass.
model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + MSSubClass2, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)

MSSubClass2,n,median
A,894,4.317488
B,704,4.248495
C,835,4.094345



Call:
lm(formula = LotFrontage.log ~ LotArea.log + MSSubClass2, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.26593 -0.10980  0.01512  0.12746  1.29271 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.36997    0.09525  -3.884 0.000105 ***
LotArea.log   0.50768    0.01027  49.438  < 2e-16 ***
MSSubClass2B -0.04133    0.01137  -3.633 0.000285 ***
MSSubClass2C -0.11779    0.01195  -9.858  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2255 on 2429 degrees of freedom
Multiple R-squared:  0.6032,	Adjusted R-squared:  0.6027 
F-statistic:  1231 on 3 and 2429 DF,  p-value: < 2.2e-16


## merging levels based on response

In [76]:
# Same per-level summary as before, but ordered by the median log frontage
# so that levels with a similar response sit next to each other.
df.LotFrontage %>%
  group_by(MSSubClass) %>%
  summarise(n = n(), median = median(LotFrontage.log)) %>%
  mutate(freq = n / sum(n) * 100) %>%
  arrange(median)

MSSubClass,n,median,freq
180,16,3.044522,0.6576243
160,116,3.178054,4.7677764
120,150,3.7612,6.1652281
40,5,4.007333,0.2055076
45,18,4.007333,0.7398274
190,57,4.094345,2.3427867
30,126,4.094345,5.1787916
50,262,4.094345,10.7685984
70,117,4.094345,4.8088779
75,21,4.174387,0.8631319


In [78]:
# Collapse MSSubClass into three buckets: '20' on its own (A), the levels
# '60', '80', '85', '90' and '75' together (B), and every remaining level (C).
df <- df.LotFrontage %>%
  mutate(
    MSSubClass2 = case_when(
      MSSubClass == "20" ~ "A",
      MSSubClass %in% c("60", "80", "85", "90", "75") ~ "B",
      TRUE ~ "C"
    )
  )

# Size and median log frontage of each merged bucket.
df %>%
  select(MSSubClass2, MSSubClass, LotFrontage.log) %>%
  group_by(MSSubClass2) %>%
  summarise(n = n(), median = median(LotFrontage.log))

# Refit with the three-level bucket in place of the raw MSSubClass.
model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + MSSubClass2, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)

MSSubClass2,n,median
A,894,4.317488
B,672,4.317488
C,867,4.007333



Call:
lm(formula = LotFrontage.log ~ LotArea.log + MSSubClass2, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.25359 -0.09317  0.01800  0.11241  1.30736 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.078392   0.095077  -0.825    0.410    
LotArea.log   0.476148   0.010251  46.447   <2e-16 ***
MSSubClass2B -0.001931   0.011228  -0.172    0.863    
MSSubClass2C -0.161562   0.011693 -13.817   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2199 on 2429 degrees of freedom
Multiple R-squared:  0.6226,	Adjusted R-squared:  0.6222 
F-statistic:  1336 on 3 and 2429 DF,  p-value: < 2.2e-16


## averaging trick

In [79]:
# Encode MSSubClass as one numeric column: the median log frontage of all
# rows sharing the row's MSSubClass level.
# NOTE: the encoding is computed from the response on the same rows the model
# is fitted on, so the fit statistics below are in-sample figures.
df <- df.LotFrontage %>%
  group_by(MSSubClass) %>%
  mutate(MSSubClass.avg = median(LotFrontage.log)) %>%
  # Drop the grouping so `df` is a plain tibble for anything that uses it next.
  ungroup()

model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + MSSubClass.avg, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log + MSSubClass.avg, 
    data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.02247 -0.10112  0.00395  0.10651  1.13561 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.01192    0.07714  -13.12   <2e-16 ***
LotArea.log     0.33655    0.01238   27.18   <2e-16 ***
MSSubClass.avg  0.51427    0.02168   23.72   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2072 on 2430 degrees of freedom
Multiple R-squared:  0.6648,	Adjusted R-squared:  0.6645 
F-statistic:  2410 on 2 and 2430 DF,  p-value: < 2.2e-16


In [80]:
# Averaging trick with a separate slope per bucket. Buckets: '20' (A),
# '60' and '50' (B), every remaining MSSubClass level (C).
# MSSubClass.avg is the median log frontage of the row's MSSubClass level;
# MSSubClass.A/.B/.C carry that value for rows in the matching bucket and 0
# elsewhere, so each bucket gets its own coefficient on the encoded value.
# NOTE: the encoding is computed from the response on the same rows the model
# is fitted on, so the fit statistics below are in-sample figures.
df <- df.LotFrontage %>%
  select(LotFrontage.log, LotArea.log, MSSubClass) %>%
  mutate(
    MSSubClass2 = case_when(
      MSSubClass == "20" ~ "A",
      MSSubClass %in% c("60", "50") ~ "B",
      TRUE ~ "C"
    )
  ) %>%
  group_by(MSSubClass) %>%
  mutate(
    MSSubClass.avg = median(LotFrontage.log),
    MSSubClass.A = ifelse(MSSubClass2 == "A", MSSubClass.avg, 0),
    MSSubClass.B = ifelse(MSSubClass2 == "B", MSSubClass.avg, 0),
    MSSubClass.C = ifelse(MSSubClass2 == "C", MSSubClass.avg, 0)
  ) %>%
  # Drop the grouping so `df` is a plain tibble for anything that uses it next.
  ungroup()

model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + MSSubClass.A + MSSubClass.B + MSSubClass.C, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log + MSSubClass.A + MSSubClass.B + 
    MSSubClass.C, data = .)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.0197 -0.1013  0.0035  0.1052  1.1386 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.99196    0.08661  -11.45   <2e-16 ***
LotArea.log   0.33690    0.01242   27.12   <2e-16 ***
MSSubClass.A  0.50982    0.02302   22.14   <2e-16 ***
MSSubClass.B  0.50812    0.02338   21.73   <2e-16 ***
MSSubClass.C  0.50795    0.02410   21.07   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2073 on 2428 degrees of freedom
Multiple R-squared:  0.6649,	Adjusted R-squared:  0.6643 
F-statistic:  1204 on 4 and 2428 DF,  p-value: < 2.2e-16


In [81]:
# Averaging trick with a separate slope per bucket. Buckets: '20' (A),
# '60', '80', '85', '90' and '75' (B), every remaining MSSubClass level (C).
# MSSubClass.avg is the median log frontage of the row's MSSubClass level;
# MSSubClass.A/.B/.C carry that value for rows in the matching bucket and 0
# elsewhere, so each bucket gets its own coefficient on the encoded value.
# NOTE: the encoding is computed from the response on the same rows the model
# is fitted on, so the fit statistics below are in-sample figures.
df <- df.LotFrontage %>%
  select(LotFrontage.log, LotArea.log, MSSubClass) %>%
  mutate(
    MSSubClass2 = case_when(
      MSSubClass == "20" ~ "A",
      MSSubClass %in% c("60", "80", "85", "90", "75") ~ "B",
      TRUE ~ "C"
    )
  ) %>%
  group_by(MSSubClass) %>%
  mutate(
    MSSubClass.avg = median(LotFrontage.log),
    MSSubClass.A = ifelse(MSSubClass2 == "A", MSSubClass.avg, 0),
    MSSubClass.B = ifelse(MSSubClass2 == "B", MSSubClass.avg, 0),
    MSSubClass.C = ifelse(MSSubClass2 == "C", MSSubClass.avg, 0)
  ) %>%
  # Drop the grouping so `df` is a plain tibble for anything that uses it next.
  ungroup()

model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + MSSubClass.A + MSSubClass.B + MSSubClass.C, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log + MSSubClass.A + MSSubClass.B + 
    MSSubClass.C, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.03247 -0.09346  0.00203  0.10397  1.15374 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.89001    0.09238  -9.635   <2e-16 ***
LotArea.log   0.33859    0.01240  27.300   <2e-16 ***
MSSubClass.A  0.48259    0.02535  19.037   <2e-16 ***
MSSubClass.B  0.48320    0.02542  19.009   <2e-16 ***
MSSubClass.C  0.47592    0.02697  17.649   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2071 on 2428 degrees of freedom
Multiple R-squared:  0.6656,	Adjusted R-squared:  0.665 
F-statistic:  1208 on 4 and 2428 DF,  p-value: < 2.2e-16


# ~ LotArea.log + Neighborhood

In [82]:
# Use Neighborhood with all of its original levels as the second predictor.
model <- df.LotFrontage %>%
  select(LotFrontage.log, LotArea.log, Neighborhood) %>%
  lm(LotFrontage.log ~ LotArea.log + Neighborhood, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log + Neighborhood, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.16061 -0.09468  0.01633  0.11166  1.22417 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          0.067486   0.111318   0.606  0.54441    
LotArea.log          0.464425   0.012320  37.695  < 2e-16 ***
NeighborhoodBlueste -0.339746   0.084339  -4.028 5.79e-05 ***
NeighborhoodBrDale  -0.488368   0.063186  -7.729 1.58e-14 ***
NeighborhoodBrkSide -0.147554   0.054082  -2.728  0.00641 ** 
NeighborhoodClearCr -0.123757   0.071417  -1.733  0.08324 .  
NeighborhoodCollgCr -0.091977   0.052392  -1.756  0.07929 .  
NeighborhoodCrawfor -0.110890   0.055602  -1.994  0.04623 *  
NeighborhoodEdwards -0.143807   0.052621  -2.733  0.00632 ** 
NeighborhoodGilbert -0.119620   0.054631  -2.190  0.02865 *  
NeighborhoodIDOTRR  -0.132662   0.054895  -2.417  0.01574 *  
NeighborhoodMeadowV -0.384902   0.061953  -6.213 6.11e-10 ***

## merging levels based on frequency

In [17]:
# Per-neighbourhood row count, median log frontage and share of rows (in
# percent), listed from the rarest level upward with a running share total.
df.LotFrontage %>%
  group_by(Neighborhood) %>%
  summarise(n = n(), median = median(LotFrontage.log)) %>%
  mutate(freq = n / sum(n) * 100) %>%
  arrange(n) %>%
  mutate(cumfreq = cumsum(freq))

Neighborhood,n,median,freq,cumfreq
Blueste,10,3.178054,0.4110152,0.4110152
Veenker,16,4.382027,0.6576243,1.0686395
Blmngtn,20,3.7612,0.8220304,1.89067
ClearCr,20,4.388238,0.8220304,2.7127004
NPkVill,21,3.178054,0.8631319,3.5758323
BrDale,30,3.044522,1.2330456,4.8088779
MeadowV,33,3.044522,1.3563502,6.1652281
SWISU,44,4.094345,1.8084669,7.973695
StoneBr,46,4.094345,1.89067,9.864365
NoRidge,54,4.488636,2.2194821,12.0838471


In [31]:
# Bucket neighbourhoods by how many rows they contribute: at most 95 rows (A),
# more than 95 and fewer than 178 (B), everything else (C).
df <- df.LotFrontage %>%
  select(Neighborhood, LotFrontage.log, LotArea.log) %>%
  group_by(Neighborhood) %>%
  mutate(n = n()) %>%
  # The per-neighbourhood count is now a regular column, so the grouping is no
  # longer needed; dropping it keeps `df` a plain tibble.
  ungroup() %>%
  mutate(
    Neighborhood2 = case_when(
      n <= 95 ~ "A",
      95 < n & n < 178 ~ "B",
      TRUE ~ "C"
    )
  )

# Number of rows falling into each bucket.
df %>%
  group_by(Neighborhood2) %>%
  summarise(n = n())

# Refit with the three-level bucket in place of the raw Neighborhood.
model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + Neighborhood2, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)

Neighborhood2,n
A,791
B,640
C,1002



Call:
lm(formula = LotFrontage.log ~ LotArea.log + Neighborhood2, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.28604 -0.12222  0.02109  0.13323  1.24195 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.78494    0.08489  -9.247  < 2e-16 ***
LotArea.log     0.54385    0.00943  57.670  < 2e-16 ***
Neighborhood2B  0.05860    0.01227   4.774 1.91e-06 ***
Neighborhood2C  0.04694    0.01096   4.283 1.92e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2287 on 2429 degrees of freedom
Multiple R-squared:  0.5918,	Adjusted R-squared:  0.5913 
F-statistic:  1174 on 3 and 2429 DF,  p-value: < 2.2e-16


## merging levels based on response

In [32]:
# Same per-neighbourhood summary, ordered by the median log frontage so that
# levels with a similar response sit next to each other.
df.LotFrontage %>%
  group_by(Neighborhood) %>%
  summarise(n = n(), median = median(LotFrontage.log)) %>%
  mutate(freq = n / sum(n) * 100) %>%
  arrange(median) %>%
  mutate(cumfreq = cumsum(freq))

Neighborhood,n,median,freq,cumfreq
BrDale,30,3.044522,1.2330456,1.233046
MeadowV,33,3.044522,1.3563502,2.589396
Blueste,10,3.178054,0.4110152,3.000411
NPkVill,21,3.178054,0.8631319,3.863543
Blmngtn,20,3.7612,0.8220304,4.685573
BrkSide,95,3.931826,3.9046445,8.590218
IDOTRR,87,4.094345,3.5758323,12.16605
OldTown,229,4.094345,9.4122483,21.578298
StoneBr,46,4.094345,1.89067,23.468968
SWISU,44,4.094345,1.8084669,25.277435


In [34]:
# Bucket neighbourhoods by their median log frontage: at most 4.16 (A),
# above 4.16 and below 4.29 (B), everything else (C).
# The helper column is itself called `median`; inside mutate() the call
# median(...) still resolves to the function, while the bare name in
# case_when() refers to the column.
df <- df.LotFrontage %>%
  select(Neighborhood, LotFrontage.log, LotArea.log) %>%
  group_by(Neighborhood) %>%
  mutate(median = median(LotFrontage.log)) %>%
  # The per-neighbourhood median is now a regular column, so the grouping is
  # no longer needed; dropping it keeps `df` a plain tibble.
  ungroup() %>%
  mutate(
    Neighborhood2 = case_when(
      median <= 4.16 ~ "A",
      4.16 < median & median < 4.29 ~ "B",
      TRUE ~ "C"
    )
  )

# Number of rows falling into each bucket.
df %>%
  group_by(Neighborhood2) %>%
  summarise(n = n())

# Refit with the three-level bucket in place of the raw Neighborhood.
model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + Neighborhood2, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)

Neighborhood2,n
A,726
B,847
C,860



Call:
lm(formula = LotFrontage.log ~ LotArea.log + Neighborhood2, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.27351 -0.11104  0.02549  0.12207  1.29472 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.565464   0.087551  -6.459 1.27e-10 ***
LotArea.log     0.516775   0.009905  52.176  < 2e-16 ***
Neighborhood2B  0.054376   0.011806   4.606 4.32e-06 ***
Neighborhood2C  0.118089   0.012223   9.661  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2256 on 2429 degrees of freedom
Multiple R-squared:  0.6027,	Adjusted R-squared:  0.6022 
F-statistic:  1228 on 3 and 2429 DF,  p-value: < 2.2e-16


## averaging trick

In [35]:
# Encode Neighborhood as one numeric column: the median log frontage of all
# rows sharing the row's Neighborhood.
# NOTE: the encoding is computed from the response on the same rows the model
# is fitted on, so the fit statistics below are in-sample figures.
df <- df.LotFrontage %>%
  group_by(Neighborhood) %>%
  mutate(Neighborhood.avg = median(LotFrontage.log)) %>%
  # Drop the grouping so `df` is a plain tibble for anything that uses it next.
  ungroup()

model <- df %>%
  lm(LotFrontage.log ~ LotArea.log + Neighborhood.avg, data = .)

summary(model)
# Residual sum of squares of the fit.
sum(model[["residuals"]]^2)


Call:
lm(formula = LotFrontage.log ~ LotArea.log + Neighborhood.avg, 
    data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.25630 -0.10107  0.02113  0.12140  1.21467 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.17368    0.08597  -13.65   <2e-16 ***
LotArea.log       0.44854    0.01171   38.32   <2e-16 ***
Neighborhood.avg  0.30680    0.02226   13.78   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2215 on 2430 degrees of freedom
Multiple R-squared:  0.6171,	Adjusted R-squared:  0.6168 
F-statistic:  1958 on 2 and 2430 DF,  p-value: < 2.2e-16
