<a href="https://www.kaggle.com/code/lillynguyen0510/abalone-age-prediction?scriptVersionId=174179442" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# The kaggle/rstats Docker image (https://github.com/kaggle/docker-rstats)
# defines this R environment and ships with many analytics packages installed.

# Attach the tidyverse metapackage
library(tidyverse)

# Input data files live in the read-only "../input/" directory.
# Running this cell lists everything available under that directory.
list.files("../input")

# Up to 20GB written to the current directory (/kaggle/working/) is preserved
# as output when a version is created with "Save & Run All".
# Temporary files can be written to /kaggle/temp/, but they are not saved
# outside of the current session.

# Load the dataset

In [None]:
# Read the training and test sets; show_col_types = FALSE suppresses the
# column-specification message that read_csv would otherwise print.
train_data <- read_csv(
  file = "/kaggle/input/playground-series-s4e4/train.csv",
  show_col_types = FALSE
)
test_data <- read_csv(
  file = "/kaggle/input/playground-series-s4e4/test.csv",
  show_col_types = FALSE
)

In [None]:
# Preview the first rows of each dataset
train_data %>% head()
test_data %>% head()

# Data Cleaning and Exploratory Data Analysis

## Data Cleaning

In [None]:
# Inspect column types and dimensions of both datasets
train_data %>% str()
test_data %>% str()

In [None]:
# Give the weight columns syntactic names (no spaces or dots) so they can be
# referenced without quoting; all four renames happen in a single call.
train_data <- train_data %>%
  rename(
    Whole_weight = "Whole weight",
    Whole_weight_1 = "Whole weight.1",
    Whole_weight_2 = "Whole weight.2",
    Shell_weight = "Shell weight"
  )

In [None]:
# Apply the same column renames to the test set so both datasets share
# identical feature names.
test_data <- test_data %>%
  rename(
    Whole_weight = "Whole weight",
    Whole_weight_1 = "Whole weight.1",
    Whole_weight_2 = "Whole weight.2",
    Shell_weight = "Shell weight"
  )

In [None]:
# Confirm the new column names on both datasets
train_data %>% head()
test_data %>% head()

In [None]:
# Total number of missing cells in each dataset
train_data %>% is.na() %>% sum()
test_data %>% is.na() %>% sum()

## Exploratory Data Analysis

In [None]:
# Histogram of the target variable, Rings
hist(
  train_data$Rings,
  breaks = 20,
  xlab = "Rings Distribution",
  main = "Distribution of Abalone Rings",
  col = "#135D66",
  border = "#E3FEF7"
)

In [None]:
# Bar chart of how many observations fall into each Sex category
barplot(
  table(train_data$Sex),
  xlab = "Sex Distribution",
  main = "Distribution of Abalone Sex",
  col = "#135D66",
  border = "#E3FEF7"
)

In [None]:
# Draw correlation plot between the numeric variables.
# Columns are chosen by type and name rather than by position: cor() needs
# numeric input, and the row identifier carries no signal. Dropping
# "columns 1 and 2" would silently pick the wrong columns if the column
# order of the input ever changed.
library(corrplot)
corrplot.mixed(
  cor(select(train_data, where(is.numeric), -any_of("id"))),
  upper = "ellipse",
  lower = "number",
  number.cex = 1,
  tl.cex = 0.8
)

# Modeling

## Ridge Regression

In [None]:
library(glmnet)

In [None]:
# Candidate penalty strengths: powers of ten with exponents running from
# -1.5 to 1.5 in steps of 0.1
lambdas <- 10^seq(from = -1.5, to = 1.5, by = 0.1)
lambdas

In [None]:
# Fit ridge regression (alpha = 0) over the lambda grid using cross validation.
# cv.glmnet assigns observations to folds at random, so the RNG seed is fixed
# here; without it lambda.min, the coefficients, the RMSLE and the final
# submission can differ from one run to the next.
set.seed(42)
cv_fit <- cv.glmnet(
  x = as.matrix(train_data[, c(
    "Length", "Diameter", "Height",
    "Whole_weight", "Whole_weight_1", "Whole_weight_2", "Shell_weight"
  )]),
  y = as.matrix(train_data$Rings),
  alpha = 0,
  lambda = lambdas
)

# Plot the cross-validation curve
plot(cv_fit)

In [None]:
# The lambda that achieved the minimum mean cross-validated error
cv_fit[["lambda.min"]]

In [None]:
# Coefficients of the model fitted at lambda.min
coef(object = cv_fit, s = "lambda.min")

In [None]:
# In-sample predictions from the model at lambda.min, using the same seven
# feature columns the model was fitted on
predict_best_cv <- predict(
  cv_fit,
  newx = train_data %>%
    select(
      Length, Diameter, Height,
      Whole_weight, Whole_weight_1, Whole_weight_2, Shell_weight
    ) %>%
    as.matrix(),
  s = "lambda.min"
)

In [None]:
# Root mean squared logarithmic error of the training predictions:
# sqrt(mean((log(1 + predicted) - log(1 + actual))^2))
RMSLE_best_cv <- sqrt(mean(
  (log(predict_best_cv + 1) - log(train_data$Rings + 1))^2
))
RMSLE_best_cv

In [None]:
# Predict Rings for the test set with the model at lambda.min, using the same
# seven feature columns the model was fitted on
y_pred <- predict(
  cv_fit,
  newx = test_data %>%
    select(
      Length, Diameter, Height,
      Whole_weight, Whole_weight_1, Whole_weight_2, Shell_weight
    ) %>%
    as.matrix(),
  s = "lambda.min"
)

In [None]:
# Add the predictions to the test data as a new Rings column.
# predict() on the glmnet fit returns a one-column matrix; as.vector() flattens
# it so Rings is an ordinary numeric column rather than a matrix column nested
# inside the tibble (which prints oddly and is awkward to work with).
test_data <- test_data %>% mutate(Rings = as.vector(y_pred))

head(test_data)

In [None]:
# Keep only the two columns that make up the submission file
test_data <- select(test_data, id, Rings)
test_data %>% head()

In [None]:
# Save the submission to the working directory, without a row-name column
write.csv(x = test_data, file = "submission.csv", row.names = FALSE)