In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files("../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
library(ggplot2)
library(moments)
library(gridExtra)
library(Metrics)
library(caret)

In [None]:
# Load the training data set, keeping text columns as character vectors.
# FALSE is spelled out: the shorthand F is an ordinary variable that can be
# reassigned, which would silently change how the file is read.
train <- read.csv(
  "../input/housetrainnotencoded/dataEncodedLob.csv",
  stringsAsFactors = FALSE
)

In [None]:
# Show the number of rows and columns of the loaded data
dim(train)

In [None]:
# List the column names of the loaded data
print(colnames(train))

In [None]:
# Drop the RowChanged column; every other column is kept
train <- subset(train, select = -RowChanged)

### Retrait des valeurs atypiques

In [None]:
# Exclude the two rows treated as outliers (selected by row position)
outlierRows <- c(524, 1299)
dataEncodedOutlierLog <- train[-outlierRows, ]

### Logarithme de SalePrice

In [None]:
# Replace SalePrice by its natural logarithm
dataEncodedOutlierLog[["SalePrice"]] <- log(dataEncodedOutlierLog[["SalePrice"]])

### Données d'apprentissage et de test

In [None]:
# Reproducible random split of the prepared data into a training sample
# (dataAppLog) and a test sample (dataTestLog).
set.seed(123)
test.ratio <- 0.30                     # share of rows held out for testing
npop <- nrow(dataEncodedOutlierLog)    # number of rows in the data
ntest <- ceiling(npop * test.ratio)    # test-sample size, rounded up
# seq_len() is used instead of 1:npop so that an empty data set yields an
# empty index rather than c(1, 0).
testi <- sample(seq_len(npop), ntest)  # row indices drawn for the test sample
appri <- setdiff(seq_len(npop), testi) # remaining rows: training sample
# Build both samples; all columns (response and predictors) are kept
dataAppLog <- dataEncodedOutlierLog[appri, ]
dataTestLog <- dataEncodedOutlierLog[testi, ]

### Régression linéaire

In [None]:
# Linear regression of SalePrice on all other columns of the training sample
reslmLog <- lm(formula = SalePrice ~ ., data = dataAppLog)

In [None]:
# Evaluate the linear model on the training sample. Predictions and observed
# values are both back-transformed with exp() before the metrics are computed.
predictionsApp <- reslmLog %>% predict(dataAppLog)
predictLmApp <- exp(predictionsApp)
observedApp <- exp(dataAppLog$SalePrice) # back-transform once, reuse below

data.frame(
  Rsquare = R2(pred = predictLmApp, obs = observedApp),
  RMSE = RMSE(pred = predictLmApp, obs = observedApp),
  # rmsle() is declared as rmsle(actual, predicted); arguments are named so
  # the call matches that signature.
  RMSLE = rmsle(actual = observedApp, predicted = predictLmApp)
)

In [None]:
# Evaluate the linear model on the test sample. Predictions and observed
# values are both back-transformed with exp() before the metrics are computed.
predictions <- reslmLog %>% predict(dataTestLog)
predictLm <- exp(predictions)
observedTest <- exp(dataTestLog$SalePrice) # back-transform once, reuse below

data.frame(
  Rsquare = R2(pred = predictLm, obs = observedTest),
  RMSE = RMSE(pred = predictLm, obs = observedTest),
  # rmsle() is declared as rmsle(actual, predicted); arguments are named so
  # the call matches that signature.
  RMSLE = rmsle(actual = observedTest, predicted = predictLm)
)

In [None]:
library(mgcv)

## GAM

In [None]:
## Reorder the training columns so that SalePrice comes last
dataAppLogGam <- dataAppLog %>%
  select(-SalePrice, everything())

### Variables quantitatives

In [None]:
# Quantitative predictors that receive a smooth term in the GAM formula
colQuant <- c(
  "LotFrontage", "LotArea", "MasVnrArea",
  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
  "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
  "GrLivArea",
  "GarageCars", "GarageArea",
  "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
  "X3SsnPorch", "ScreenPorch", "PoolArea"
)

#### Variables quantitatives retirées à cause du nombre de nœuds
'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces'

In [None]:
# Display the quantitative variable names
print(colQuant)

### Variables qualitatives

In [None]:
# Remaining predictors: every column that is neither listed in colQuant nor
# the response SalePrice. Columns are selected by name; the previous version
# indexed the data frame with a logical vector one element shorter than its
# column count, which R recycles, so SalePrice could slip into the list.
colQual <- setdiff(names(dataAppLogGam), c(colQuant, "SalePrice"))
colQual

### Modèle pour toutes les colonnes

In [None]:
# Right-hand side, part 1: colQual columns enter as plain additive terms
fmQual <- paste(colQual, collapse = ' + ')
# Right-hand side, part 2: one s(<var>, k=5, bs="cr") smooth per colQuant column
fmQuant <- paste0('s(', colQuant, ', k=5, bs="cr")', collapse = ' + ')
# Assemble the full model formula with SalePrice as the response
fm <- as.formula(paste('SalePrice ~', paste(fmQual, '+', fmQuant)))
fm

### GAM avec des composantes non linéaires

In [None]:
# Fit the GAM described by fm on the training sample
modelGam <- gam(formula = fm, data = dataAppLogGam)

In [None]:
# Print the summary of the fitted GAM
summary(modelGam)

In [None]:
# Evaluate the GAM on the training sample. Predictions and observed values are
# both back-transformed with exp() before the metrics are computed.
# Note: predictionsApp and predictLmApp are reused here and now hold the GAM
# predictions, overwriting any earlier values.
predictionsApp <- modelGam %>% predict(dataAppLog)
predictLmApp <- exp(predictionsApp)
observedApp <- exp(dataAppLog$SalePrice) # back-transform once, reuse below

data.frame(
  Rsquare = R2(pred = predictLmApp, obs = observedApp),
  RMSE = RMSE(pred = predictLmApp, obs = observedApp),
  # rmsle() is declared as rmsle(actual, predicted); arguments are named so
  # the call matches that signature.
  RMSLE = rmsle(actual = observedApp, predicted = predictLmApp)
)

In [None]:
# Evaluate the GAM on the test sample. Predictions and observed values are
# both back-transformed with exp() before the metrics are computed.
predictionsTest <- modelGam %>% predict(dataTestLog)
predictLmTest <- exp(predictionsTest)
observedTest <- exp(dataTestLog$SalePrice) # back-transform once, reuse below

data.frame(
  Rsquare = R2(pred = predictLmTest, obs = observedTest),
  RMSE = RMSE(pred = predictLmTest, obs = observedTest),
  # rmsle() is declared as rmsle(actual, predicted); arguments are named so
  # the call matches that signature.
  RMSLE = rmsle(actual = observedTest, predicted = predictLmTest)
)

### GAM Linéaire

In [None]:
# Purely linear counterpart of the GAM formula: the colQual and colQuant
# columns all enter as plain additive terms, with no smooths.
fmQual <- paste(colQual, collapse = ' + ')
fmQuant <- paste(colQuant, collapse = ' + ')
fmL <- as.formula(paste('SalePrice ~', paste(fmQual, '+', fmQuant)))
fmL

In [None]:
# Fit the linear-only model with gam() and show its summary
modelGamL <- gam(formula = fmL, data = dataAppLog)
modelGamL %>% summary()

### ANOVA

In [None]:
# Compare the linear-only fit with the smooth-term fit using a Chi-squared test
anovaGam <- anova(modelGamL, modelGam, test = "Chisq")
print(anovaGam)

In [None]:
# Extract the s.table component of the GAM summary
summary(modelGam)[["s.table"]]

In [None]:
# Draw the default plots for the fitted GAM
plot(modelGam)