In [None]:
library(randomForest)
library(caret)
library(data.table)
library(tidyverse)
library(corrplot)

In [None]:
# Load the train.csv dataset
train <- read.csv("/kaggle/input/wine-dataset/train.csv")

# Load the test.csv dataset
test <- read.csv("/kaggle/input/wine-dataset/test.csv")

In [None]:
# Check the structure of the datasets
str(train)
str(test)

# Feature Engineering (example: adding a feature)
train$alcohol_squared <- train$alcohol^2
test$alcohol_squared <- test$alcohol^2

# Splitting the training data into red and white types
red_train <- train[train$type == "red", ]
white_train <- train[train$type == "white", ]

# Select the input features
predictors  <- c('fixed.acidity','volatile.acidity','citric.acid','residual.sugar',
                'chlorides','free.sulfur.dioxide','total.sulfur.dioxide','density',
                'pH','sulphates','alcohol')

# Define the evaluation metric
rmse <- function(data, obs, pred) {
  sqrt(mean((obs - pred)^2))
}

In [None]:
train %>%
  ggplot(aes(x = quality)) +
  geom_bar() +
  labs(title = 'Quality Distribution for Red and White Wines') +
  theme_bw() +
  facet_grid(. ~ type)

In [None]:
# Calculate the correlation matrix for red wine
red_correlation_matrix <- cor(red_train[, predictors])

# Calculate the correlation matrix for white wine
white_correlation_matrix <- cor(white_train[, predictors])

# Create correlation plots for red wine
corrplot(red_correlation_matrix, method = "color", type = "upper", tl.cex = 0.9, tl.col = "black", title = "Red Wine Correlation Plot")

# Create correlation plots for white wine
corrplot(white_correlation_matrix, method = "color", type = "upper", tl.cex = 0.9, tl.col = "black", title = "White Wine Correlation Plot")

In [None]:
# Function to train and predict quality using Random Forest
predict_quality <- function(train_data, test_data, predictors) {
  # Create a Random Forest model
  model <- randomForest(quality ~ ., data = train_data[, c(predictors, "quality")], ntree = 100)
  
  # Predict quality for test data
  predictions <- predict(model, newdata = test_data)
  
  return(predictions)
}

# Predict quality for red wine
red_predictions <- predict_quality(red_train, test[test$type == "red", ], predictors)

# Predict quality for white wine
white_predictions <- predict_quality(white_train, test[test$type == "white", ], predictors)

# Combine predictions for red and white wines
test_predictions <- rep(0, nrow(test))
test_predictions[test$type == "red"] <- red_predictions
test_predictions[test$type == "white"] <- white_predictions

In [None]:
# Create a data frame with ID and predictions
result <- data.frame(id = test$id, quality = test_predictions)

# Export predictions to a CSV file
write.csv(result, "predictions.csv", row.names = FALSE)

In [None]:
result

In [None]:
# Load required libraries
library(caret)
library(randomForest)
library(dplyr)
library(data.table)

# Load the datasets
train_data <- read.csv("/kaggle/input/wine-dataset/train.csv")
test_data <- read.csv("/kaggle/input/wine-dataset/test.csv")

# Preprocess the data
# One-hot encode the 'type' variable
train_data <- train_data %>% 
  mutate(type = ifelse(type == "red", 1, 0))

test_data <- test_data %>% 
  mutate(type = ifelse(type == "red", 1, 0))

In [None]:
# Split the data into features and target
X_train <- train_data[, c(1:11, 13)]
y_train <- train_data$quality

X_test <- test_data[, 2:13]

# Train a model (Random Forest, for example)
set.seed(123)  # for reproducibility
model <- randomForest(y_train ~ ., data = X_train, trControl = ctrl, tuneGrid = grid, ntree = 1000)

# Make predictions
predictions <- predict(model, X_test)

# Create a data frame with ID and predicted quality
result <- data.frame(id = test_data$id, quality = predictions)

# Export the predictions to a CSV file
write.csv(result, "wine_quality_predictions.csv", row.names = FALSE)


In [None]:
# Create a data frame with ID and predicted quality
result <- data.frame(id = test_data$id, quality = predictions)

# Export the predictions to a CSV file
write.csv(result, "wine_quality_predictions.csv", row.names = FALSE)

In [None]:
result

In [1]:
# Load libraries
library(caret)
library(ROSE)
library(randomForest)

# Load the dataset
data <- read.csv("/kaggle/input/wine-dataset/train.csv")

Loading required package: ggplot2

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:httr’:

    progress


Loaded ROSE 0.0-4


randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin




In [2]:
# Check for missing values
missing_values <- sum(is.na(data))
if (missing_values > 0) {
  # Handle missing values (e.g., impute or remove rows with missing values)
  data <- na.omit(data)
}

# Encode the "type" variable into binary
data$type <- ifelse(data$type == "white", 1, 0)

In [3]:
# # Data preprocessing
# data[, -c(12)] <- scale(data[, -c(12)])

# Cross-validation for model evaluation
set.seed(123)  # for reproducibility
train_control <- trainControl(method = "cv", number = 5)  # 5-fold cross-validation

# Split the data
train_indices <- createDataPartition(data$quality, p = 0.9, list = FALSE)
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

In [None]:
train_data

In [None]:
test_data

In [4]:
# Hyperparameter Tuning
model <- randomForest(quality ~ ., data = train_data, 
                      ntree = 1000, 
                      max_depth = 20,           # Set your desired max depth
                      min_samples_split = 2,   # Set your desired min_samples_split
                      min_samples_leaf = 1,    # Set your desired min_samples_leaf
                      mtry = 4)  # Example hyperparameters

predictions <- predict(model, test_data)

In [5]:
rmse <- sqrt(mean((test_data$quality - predictions)^2))
print(paste("RMSE:", rmse))

[1] "RMSE: 0.552087695072618"


In [7]:
test_csv <- read.csv("/kaggle/input/wine-dataset/test.csv")
test_prediction <- predict(model, test_csv)
# Create a data frame with ID and predicted quality
result <- data.frame(id = test_csv$id, quality = test_prediction)

# Export the predictions to a CSV file
write.csv(result, "wine_quality_predictions.csv", row.names = FALSE)


In [8]:
result

Unnamed: 0_level_0,id,quality
Unnamed: 0_level_1,<int>,<dbl>
1,1257,6.452417
2,6409,5.596283
3,136,4.999683
4,1631,7.014250
5,6084,5.751000
6,5434,5.258650
7,1094,5.894083
8,5146,6.224683
9,5921,6.569900
10,1076,5.910333
