# Sin regularizar

In [1]:
library('fastDummies')

set.seed(103783)

# Cargo el csv
mat <- read.csv("student-mat.csv")

# Train y Test
sample <- sample.int(n = nrow(mat), size = floor(.75*nrow(mat)), replace = F)
train <- mat[sample, ]
test  <- mat[-sample, ]
y_train <- c(train$G3)
x_train <- subset(train, select = -c(G3))
y_test <- c(test$G3)
x_test <- subset(test, select = -c(G3))

In [2]:
# Preproceso
preprocesar <- function(mat) {
    mat_prepros <- dummy_cols(mat,remove_first_dummy = TRUE)
    mat_prepros <- mat_prepros[-c(1,2,4,5,6,9,10,11,12,16,17,18,19,20,21,22,23)]
    as.data.frame(scale(mat_prepros))
}
x_train <- preprocesar(x_train)

In [3]:
model <- lm(formula=y_train~.,data=x_train)
summary(model)


Call:
lm(formula = y_train ~ ., data = x_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.5996 -0.6483  0.2539  1.0342  4.5665 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       10.38514    0.11435  90.816   <2e-16 ***
age               -0.26030    0.15767  -1.651   0.1000 .  
Medu               0.17251    0.19598   0.880   0.3796    
Fedu              -0.17275    0.16680  -1.036   0.3013    
traveltime         0.02837    0.13147   0.216   0.8293    
studytime         -0.16550    0.13690  -1.209   0.2278    
failures          -0.14880    0.14364  -1.036   0.3012    
famrel             0.30403    0.12411   2.450   0.0150 *  
freetime          -0.04655    0.13715  -0.339   0.7346    
goout              0.06854    0.14202   0.483   0.6298    
Dalc              -0.10115    0.16256  -0.622   0.5344    
Walc               0.15596    0.17528   0.890   0.3744    
health             0.12374    0.13012   0.951   0.3425    
absences        

In [4]:
x_test <- preprocesar(x_test)
pred <- predict(model, x_test)
modelEval <- cbind(y_test, pred)
colnames(modelEval) <- c('Actual', 'Predicted')
modelEval <- as.data.frame(modelEval)

In [5]:
mse <- mean((modelEval$Actual - modelEval$Predicted)**2)
rmse <- sqrt(mse)
print(cat("Mean Squared Error:",mse))
print(cat("Mean Absolute Error:",rmse))

Mean Squared Error: 3.341237NULL
Mean Absolute Error: 1.827905NULL


# Lasso

In [6]:
library("glmnet")

Loading required package: Matrix

Loaded glmnet 4.1-2



In [7]:
# Cargo el csv
mat <- read.csv("student-mat.csv")

# Train y Test
sample <- sample.int(n = nrow(mat), size = floor(.75*nrow(mat)), replace = F)
train <- mat[sample, ]
test  <- mat[-sample, ]
y_train <- c(train$G3)
x_train <- subset(train, select = -c(G3))
y_test <- c(test$G3)
x_test <- subset(test, select = -c(G3))

In [8]:
mat_prepros <- dummy_cols(x_train,remove_first_dummy = TRUE)
mat_prepros <- mat_prepros[-c(1,2,4,5,6,9,10,11,12,16,17,18,19,20,21,22,23)]
x_train <- scale(mat_prepros)

mat_prepros <- dummy_cols(x_test,remove_first_dummy = TRUE)
mat_prepros <- mat_prepros[-c(1,2,4,5,6,9,10,11,12,16,17,18,19,20,21,22,23)]
x_test <- scale(mat_prepros)

In [9]:
lambdas <- 10^seq(2, -3, by = -.1)

In [10]:
# Setting alpha = 1 implements lasso regression
lasso_reg <- cv.glmnet(x_train, y_train, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)

# Best 
lambda_best <- lasso_reg$lambda.min
lambda_best

In [11]:
eval_results <- function(true, predicted, df) {
  SSE <- sum((predicted - true)^2)
  SST <- sum((true - mean(true))^2)
  R_square <- 1 - SSE / SST
  RMSE = sqrt(SSE/nrow(df))

  
  # Model performance metrics
data.frame(
  RMSE = RMSE,
  Rsquare = R_square
)
  
}

# Prediction and evaluation on train data
predictions_train <- predict(lasso_reg, s = lambda_best, newx = x_train)
eval_results(y_train, predictions_train, x_train)

predictions_test <- predict(lasso_reg, s = lambda_best, newx = x_test)
eval_results(y_test, predictions_test, x_test)

RMSE,Rsquare
<dbl>,<dbl>
1.887821,0.8226309


RMSE,Rsquare
<dbl>,<dbl>
2.06505,0.8180328


In [17]:
coef(lasso_reg, s = "lambda.min")

42 x 1 sparse Matrix of class "dgCMatrix"
                           s1
(Intercept)       10.45945946
age                .         
Medu               .         
Fedu               .         
traveltime         .         
studytime          .         
failures          -0.17659855
famrel             0.02123133
freetime           .         
goout              .         
Dalc               .         
Walc               .         
health             .         
absences           0.10431589
G1                 0.46150996
G2                 3.37595063
school_MS          .         
sex_M              .         
address_U          .         
famsize_LE3        .         
Pstatus_T          .         
Mjob_health        .         
Mjob_other         .         
Mjob_services      .         
Mjob_teacher       .         
Fjob_health        .         
Fjob_other         .         
Fjob_services     -0.03969034
Fjob_teacher       .         
reason_home       -0.04956248
reason_other       .        

# Ridge

In [18]:
library("glmnet")

In [19]:
# Cargo el csv
mat <- read.csv("student-mat.csv")

# Train y Test
sample <- sample.int(n = nrow(mat), size = floor(.75*nrow(mat)), replace = F)
train <- mat[sample, ]
test  <- mat[-sample, ]
y_train <- c(train$G3)
x_train <- subset(train, select = -c(G3))
y_test <- c(test$G3)
x_test <- subset(test, select = -c(G3))

In [20]:
mat_prepros <- dummy_cols(x_train,remove_first_dummy = TRUE)
mat_prepros <- mat_prepros[-c(1,2,4,5,6,9,10,11,12,16,17,18,19,20,21,22,23)]
x_train <- scale(mat_prepros)

mat_prepros <- dummy_cols(x_test,remove_first_dummy = TRUE)
mat_prepros <- mat_prepros[-c(1,2,4,5,6,9,10,11,12,16,17,18,19,20,21,22,23)]
x_test <- scale(mat_prepros)

In [21]:
lambdas <- 10^seq(2, -3, by = -.1)

In [22]:
ridge_reg <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambdas)
optimal_lambda <- ridge_reg$lambda.min
optimal_lambda

In [23]:
# Prediction and evaluation on train data
predictions_train <- predict(ridge_reg, s = optimal_lambda, newx = x_train)
eval_results(y_train, predictions_train, x_train)

# Prediction and evaluation on test data
predictions_test <- predict(ridge_reg, s = optimal_lambda, newx = x_test)
eval_results(y_test, predictions_test, x_test)

RMSE,Rsquare
<dbl>,<dbl>
1.590127,0.8676857


RMSE,Rsquare
<dbl>,<dbl>
2.695203,0.7158834


In [24]:
coef(ridge_reg, s = "lambda.min")

42 x 1 sparse Matrix of class "dgCMatrix"
                            s1
(Intercept)       10.679054054
age               -0.159738060
Medu              -0.106945984
Fedu              -0.127835019
traveltime         0.133913017
studytime          0.066882301
failures          -0.115124095
famrel             0.283967655
freetime           0.105476411
goout             -0.027368585
Dalc              -0.296898844
Walc               0.227371892
health             0.055727432
absences           0.342006129
G1                 0.827802413
G2                 3.180116777
school_MS          0.066064249
sex_M              0.150105256
address_U         -0.011293040
famsize_LE3        0.017081662
Pstatus_T         -0.069275931
Mjob_health       -0.011507760
Mjob_other         0.027920244
Mjob_services      0.181023446
Mjob_teacher       0.169523364
Fjob_health        0.089738501
Fjob_other         0.079833348
Fjob_services      0.009117581
Fjob_teacher      -0.035616840
reason_home       -0.1446417