# Sin regularizar

In [1]:
library('fastDummies')

# Fix the RNG seed so the train/test split drawn below is reproducible.
set.seed(103783)

# Load the CSV.
mat <- read.csv("student-por.csv")

# Train / test split: 75% of the rows, drawn without replacement, form the
# training set; the remaining rows form the test set.
# NOTE(review): the index vector is named `sample`, which masks base::sample()
# as a variable; the name is kept so the workspace variables are unchanged.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
sample <- sample.int(
  n = nrow(mat),
  size = floor(0.75 * nrow(mat)),
  replace = FALSE
)
train <- mat[sample, ]
test <- mat[-sample, ]

# G3 is split off as the response; every other column stays as a predictor.
y_train <- c(train$G3)
x_train <- subset(train, select = -c(G3))
y_test <- c(test$G3)
x_test <- subset(test, select = -c(G3))

In [2]:
# Preprocessing

#' Dummy-encode the categorical columns of a data frame and standardize it.
#'
#' @param mat A data frame of predictors (numeric and categorical columns).
#' @return A data frame holding only numeric columns (the original numeric
#'   columns plus the generated dummy columns), each centered and scaled by
#'   `scale()` using the statistics of `mat` itself.
preprocesar <- function(mat) {
  # Expand categorical columns into indicator columns; the first level of
  # each variable is dropped.
  mat_prepros <- dummy_cols(mat, remove_first_dummy = TRUE)

  # Drop the original non-numeric columns by type rather than by hard-coded
  # column positions, so the function keeps working if the column order or
  # the set of categorical columns changes.
  es_numerica <- vapply(mat_prepros, is.numeric, logical(1))
  mat_prepros <- mat_prepros[es_numerica]

  # NOTE(review): scale() standardizes with the column statistics of the data
  # passed in. Calling this separately on train and test therefore scales the
  # test set with its own means/SDs, and a category level absent from one of
  # the splits would presumably yield a different set of dummy columns --
  # confirm this is acceptable for the comparison being made.
  as.data.frame(scale(mat_prepros))
}
x_train <- preprocesar(x_train)

In [3]:
# Ordinary least squares: regress y_train on every column of x_train.
model <- lm(
  formula = y_train ~ .,
  data = x_train
)
summary(model)


Call:
lm(formula = y_train ~ ., data = x_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.5486 -0.4959  0.0111  0.5685  2.6657 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       11.905350   0.051557 230.916   <2e-16 ***
age                0.099368   0.061939   1.604   0.1094    
Medu              -0.087148   0.085593  -1.018   0.3092    
Fedu               0.069382   0.074251   0.934   0.3506    
traveltime         0.086866   0.058746   1.479   0.1399    
studytime          0.053623   0.058035   0.924   0.3560    
failures          -0.133175   0.062068  -2.146   0.0324 *  
famrel            -0.049136   0.056238  -0.874   0.3827    
freetime          -0.071597   0.057968  -1.235   0.2174    
goout             -0.063080   0.061566  -1.025   0.3061    
Dalc              -0.133375   0.068815  -1.938   0.0532 .  
Walc              -0.025892   0.074658  -0.347   0.7289    
health            -0.090847   0.055539  -1.636   0.1026    
ab

In [4]:
# Preprocess the held-out predictors and predict them with the fitted model.
x_test <- preprocesar(x_test)
pred <- predict(model, x_test)

# Observed and predicted responses side by side, one row per test observation.
modelEval <- as.data.frame(cbind(Actual = y_test, Predicted = pred))

In [5]:
# Test-set error of the unregularized model.
mse <- mean((modelEval$Actual - modelEval$Predicted)^2)
rmse <- sqrt(mse)

# cat() writes to the console itself and returns NULL, so wrapping it in
# print() appended a stray "NULL" to each line. The second value is the square
# root of the MSE, i.e. the root mean squared error, not the mean absolute
# error, so the label is corrected accordingly.
cat("Mean Squared Error:", mse, "\n")
cat("Root Mean Squared Error:", rmse, "\n")

Mean Squared Error: 2.767733NULL
Mean Absolute Error: 1.663651NULL


# Lasso

In [6]:
library("glmnet")

Loading required package: Matrix

Loaded glmnet 4.1-2



In [7]:
# Load the CSV.
mat <- read.csv("student-por.csv")

# Train / test split: 75% of the rows, drawn without replacement, form the
# training set; the remaining rows form the test set.
# NOTE(review): no set.seed() call is visible right before this draw, so this
# split presumably differs from the one drawn earlier in the file -- confirm
# whether the models are meant to be compared on the same test rows.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
sample <- sample.int(
  n = nrow(mat),
  size = floor(0.75 * nrow(mat)),
  replace = FALSE
)
train <- mat[sample, ]
test <- mat[-sample, ]

# G3 is split off as the response; every other column stays as a predictor.
y_train <- c(train$G3)
x_train <- subset(train, select = -c(G3))
y_test <- c(test$G3)
x_test <- subset(test, select = -c(G3))

In [8]:
# Reuse preprocesar(), defined earlier in this file, instead of repeating the
# dummy-encoding / column-dropping / scaling steps inline. glmnet expects a
# numeric matrix, so the resulting data frame is converted with as.matrix().
# NOTE(review): train and test are each standardized with their own column
# statistics, exactly as the inline code did.
x_train <- as.matrix(preprocesar(x_train))
x_test <- as.matrix(preprocesar(x_test))

In [9]:
# Candidate penalty values: powers of ten with exponents running from 2 down
# to -3 in steps of 0.1.
lambdas <- 10^seq(from = 2, to = -3, by = -0.1)

In [10]:
# Cross-validated lasso fit: alpha = 1 selects the lasso penalty, and the
# penalty is chosen among `lambdas` using 5 folds.
lasso_reg <- cv.glmnet(
  x_train,
  y_train,
  alpha = 1,
  lambda = lambdas,
  standardize = TRUE,
  nfolds = 5
)

# Penalty value reported by cv.glmnet as `lambda.min`.
lambda_best <- lasso_reg$lambda.min
lambda_best

In [11]:
#' Compute RMSE and R-squared for a set of predictions.
#'
#' @param true Numeric vector of observed responses.
#' @param predicted Numeric vector (or one-column matrix) of predictions with
#'   the same number of elements as `true`.
#' @param df Optional object whose row count is used as the number of
#'   observations in the RMSE. When it is `NULL` or has no row dimension,
#'   `length(true)` is used instead.
#' @return A one-row data frame with columns `RMSE` and `Rsquare`.
eval_results <- function(true, predicted, df = NULL) {
  # Mismatched lengths would otherwise be silently recycled by `-`.
  stopifnot(length(true) == length(predicted))

  # Fall back to the length of `true` when `df` carries no row count;
  # nrow() returns NULL for plain vectors, which previously broke the result.
  n_obs <- if (is.null(df) || is.null(nrow(df))) length(true) else nrow(df)

  SSE <- sum((predicted - true)^2)
  SST <- sum((true - mean(true))^2)
  R_square <- 1 - SSE / SST
  RMSE <- sqrt(SSE / n_obs)

  # Model performance metrics
  data.frame(
    RMSE = RMSE,
    Rsquare = R_square
  )
}

# Lasso at the selected penalty: predictions and metrics on the training data.
predictions_train <- predict(lasso_reg, newx = x_train, s = lambda_best)
eval_results(true = y_train, predicted = predictions_train, df = x_train)

# Same model and penalty, evaluated on the test data.
predictions_test <- predict(lasso_reg, newx = x_test, s = lambda_best)
eval_results(true = y_test, predicted = predictions_test, df = x_test)

RMSE,Rsquare
<dbl>,<dbl>
1.257086,0.8508725


RMSE,Rsquare
<dbl>,<dbl>
1.268414,0.8374397


In [12]:
# Lasso coefficients at the `lambda.min` penalty.
coef(object = lasso_reg, s = "lambda.min")

42 x 1 sparse Matrix of class "dgCMatrix"
                           s1
(Intercept)       11.90946502
age                .         
Medu               .         
Fedu               .         
traveltime         .         
studytime          .         
failures          -0.04227312
famrel             .         
freetime           .         
goout              .         
Dalc               .         
Walc               .         
health             .         
absences           .         
G1                 0.27230750
G2                 2.62173841
school_MS          .         
sex_M              .         
address_U          .         
famsize_LE3        .         
Pstatus_T          .         
Mjob_health        .         
Mjob_other         .         
Mjob_services      .         
Mjob_teacher       .         
Fjob_health        .         
Fjob_other         .         
Fjob_services      .         
Fjob_teacher       .         
reason_home        .         
reason_other      -0.0206478

# Ridge

In [13]:
library("glmnet")

In [14]:
# Load the CSV.
mat <- read.csv("student-por.csv")

# Train / test split: 75% of the rows, drawn without replacement, form the
# training set; the remaining rows form the test set.
# NOTE(review): no set.seed() call is visible right before this draw, so this
# split presumably differs from the ones drawn earlier in the file -- confirm
# whether the models are meant to be compared on the same test rows.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
sample <- sample.int(
  n = nrow(mat),
  size = floor(0.75 * nrow(mat)),
  replace = FALSE
)
train <- mat[sample, ]
test <- mat[-sample, ]

# G3 is split off as the response; every other column stays as a predictor.
y_train <- c(train$G3)
x_train <- subset(train, select = -c(G3))
y_test <- c(test$G3)
x_test <- subset(test, select = -c(G3))

In [15]:
# Reuse preprocesar(), defined earlier in this file, instead of repeating the
# dummy-encoding / column-dropping / scaling steps inline. glmnet expects a
# numeric matrix, so the resulting data frame is converted with as.matrix().
# NOTE(review): train and test are each standardized with their own column
# statistics, exactly as the inline code did.
x_train <- as.matrix(preprocesar(x_train))
x_test <- as.matrix(preprocesar(x_test))

In [16]:
# Candidate penalty values: powers of ten with exponents running from 2 down
# to -3 in steps of 0.1.
lambdas <- 10^seq(from = 2, to = -3, by = -0.1)

In [17]:
# Cross-validated ridge fit: alpha = 0 selects the ridge penalty, and the
# penalty is chosen among `lambdas`.
ridge_reg <- cv.glmnet(
  x_train,
  y_train,
  alpha = 0,
  lambda = lambdas
)

# Penalty value reported by cv.glmnet as `lambda.min`.
optimal_lambda <- ridge_reg$lambda.min
optimal_lambda

In [18]:
# Ridge at the selected penalty: predictions and metrics on the training data.
predictions_train <- predict(ridge_reg, newx = x_train, s = optimal_lambda)
eval_results(true = y_train, predicted = predictions_train, df = x_train)

# Same model and penalty, evaluated on the test data.
predictions_test <- predict(ridge_reg, newx = x_test, s = optimal_lambda)
eval_results(true = y_test, predicted = predictions_test, df = x_test)

RMSE,Rsquare
<dbl>,<dbl>
1.241106,0.8556206


RMSE,Rsquare
<dbl>,<dbl>
1.240751,0.839189


In [19]:
# Ridge coefficients at the `lambda.min` penalty.
coef(object = ridge_reg, s = "lambda.min")

42 x 1 sparse Matrix of class "dgCMatrix"
                             s1
(Intercept)       12.0020576132
age                0.0498902504
Medu              -0.1716398804
Fedu               0.0688709504
traveltime         0.0810366089
studytime          0.0912781967
failures          -0.2420528380
famrel             0.0296305038
freetime          -0.0766860360
goout             -0.1255635497
Dalc              -0.0501234185
Walc               0.0773748824
health            -0.1444183065
absences           0.0518657074
G1                 0.5466920749
G2                 2.2894670909
school_MS         -0.0810488794
sex_M             -0.0881897146
address_U          0.0772111523
famsize_LE3       -0.0006771885
Pstatus_T         -0.0580680843
Mjob_health        0.0766146635
Mjob_other        -0.0364484519
Mjob_services      0.0809835665
Mjob_teacher       0.0859772369
Fjob_health       -0.1177708242
Fjob_other        -0.1617984419
Fjob_services     -0.2091861143
Fjob_teacher      -0.121235388