In [1]:
library(moments)  # skewness
library(corrplot)  # corrplot

In [2]:
# load the data set
data = read.csv('./train.csv')
dim(data)

In [3]:
# compute the number of missing values in each column
num.NA = colSums(apply(data[, -c(1, 81)], 2, is.na))
data.type = sapply(data[, names(which(num.NA != 0))], class)

In [4]:
# drop column of Alley, PoolQC, Fence, MiscFeature
drop.names = c("Alley", "PoolQC", "Fence", "MiscFeature")
data = data[ , !(names(data) %in% drop.names)]

In [5]:
# find all categorical and numerical variables
data.type = sapply(data[ , -c(1, ncol(data))], class)
categorical.var = names(data)[which(c(NA, data.type, NA) == 'factor')]
numerical.var = names(data)[which(c(NA, data.type, NA) == 'integer')]

In [6]:
# create new feature named "NA" for categorical variables
for (i in categorical.var) {
    data[, i] = addNA(data[, i])
}

In [7]:
# create new feature using the median value for numerical variables
numerical.missing.var = c('LotFrontage', 'MasVnrArea', 'GarageYrBlt')
for (i in numerical.missing.var) {
    na.id = is.na(data[, i])
    tmp.median = median(data[, i], na.rm=TRUE)
    data[which(na.id), i] = tmp.median
    data[, paste(i, 'NAInd', sep='_')] = as.numeric(na.id)
}

In [9]:
# transform the response variable into log scale
data$SalePrice = log(data$SalePrice + 1)

# transform numerical feature whose skewness is larger than 0.75
skewed.features = sapply(data[, numerical.var], skewness)
skewed.features = numerical.var[which(skewed.features > 0.75)]
for (i in skewed.features) {
    data[, i] = log(data[, i] + 1)
}

In [50]:
tmp = lapply(data[, categorical.var], as.numeric)

In [57]:
model.matrix(- 1, data=tmp)

ERROR: Error: $ operator is invalid for atomic vectors


In [45]:
summary(data[, 'MSZoning'])

In [47]:
summary(as.factor(as.integer(data[, 'MSZoning'])))

In [42]:
tmp = as.factor(as.integer(data[, 'MSZoning']))

In [48]:
model.matrix(~tmp - 1)

Unnamed: 0,tmp1,tmp2,tmp3,tmp4,tmp5
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
5,0,0,0,1,0
6,0,0,0,1,0
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,0,1
10,0,0,0,1,0


In [28]:
model = lm(SalePrice ~ ., data=data[, c(numerical.var, 'SalePrice')])

In [29]:
summary(model)


Call:
lm(formula = SalePrice ~ ., data = data[, c(numerical.var, "SalePrice")])

Residuals:
     Min       1Q   Median       3Q      Max 
-1.38545 -0.06124  0.00372  0.07224  0.46803 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    1.127e+01  5.594e+00   2.016 0.044026 *  
MSSubClass    -8.864e-03  7.931e-03  -1.118 0.263950    
LotFrontage    7.246e-03  1.609e-02   0.450 0.652460    
LotArea        8.259e-02  1.054e-02   7.838 8.94e-15 ***
OverallQual    8.021e-02  4.670e-03  17.177  < 2e-16 ***
OverallCond    5.045e-02  4.091e-03  12.331  < 2e-16 ***
YearBuilt      2.805e-03  2.735e-04  10.255  < 2e-16 ***
YearRemodAdd   9.178e-04  2.728e-04   3.364 0.000788 ***
MasVnrArea    -2.226e-03  1.625e-03  -1.370 0.171027    
BsmtFinSF1     9.757e-03  1.736e-03   5.619 2.30e-08 ***
BsmtFinSF2    -3.384e-03  2.160e-03  -1.567 0.117315    
BsmtUnfSF     -1.371e-03  2.973e-03  -0.461 0.644649    
TotalBsmtSF    1.266e-02  4.761e-03   2.659 0.007924 ** 
X1

In [31]:
summary(data[, numerical.var])

   MSSubClass     LotFrontage       LotArea        OverallQual    
 Min.   :3.045   Min.   :3.091   Min.   : 7.171   Min.   : 1.000  
 1st Qu.:3.045   1st Qu.:4.111   1st Qu.: 8.930   1st Qu.: 5.000  
 Median :3.932   Median :4.248   Median : 9.157   Median : 6.000  
 Mean   :3.819   Mean   :4.214   Mean   : 9.111   Mean   : 6.099  
 3rd Qu.:4.263   3rd Qu.:4.382   3rd Qu.: 9.359   3rd Qu.: 7.000  
 Max.   :5.252   Max.   :5.749   Max.   :12.280   Max.   :10.000  
  OverallCond      YearBuilt     YearRemodAdd    MasVnrArea      BsmtFinSF1   
 Min.   :1.000   Min.   :1872   Min.   :1950   Min.   :0.000   Min.   :0.000  
 1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   1st Qu.:0.000   1st Qu.:0.000  
 Median :5.000   Median :1973   Median :1994   Median :0.000   Median :5.952  
 Mean   :5.575   Mean   :1971   Mean   :1985   Mean   :2.120   Mean   :4.230  
 3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   3rd Qu.:5.107   3rd Qu.:6.570  
 Max.   :9.000   Max.   :2010   Max.   :2010   Max.   :7.