# In [37]:
#Read before running
library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)

#read red wine data, assign wine-type label and new column names
red_wine_data <- read_delim("winequality-red (1).csv", delim = ";")%>%
                    mutate(quality = as.factor(quality))%>%
                    mutate(wine_type = as.factor("red"))%>%
                    setNames (c("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality", "wine_type"))

#read white wine data, assign wine-type label and new column names
white_wine_data <- read_delim("winequality-white.csv", delim = ";")%>%
                    mutate(quality = as.factor(quality))%>%
                    mutate(wine_type = as.factor("white"))%>%
                    setNames (c("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality", "wine_type"))
#combine both wine types into one dataframe
wine_data <- rbind(red_wine_data, white_wine_data)

#split dataset into training and testing data
wine_split <- initial_split(wine_data, prop = 0.75, strata = quality)
wine_training <- training(wine_split)
wine_testing <- testing(wine_split)

glimpse(wine_training)

classes <- wine_training %>% pull(quality) %>% levels()
classes

num_obs <- nrow(wine_training)

wine_training %>%
  group_by(quality) %>%
  summarize(
    count = n(),
    percentage = n() / num_obs * 100
  )

missing_data <- wine_training %>%
    select(fixed_acidity, volatile_acidity, residual_sugar, free_sulfur_dioxide, total_sulfur_dioxide, alcohol)%>%
    summary()

missing_data


options(repr.plot.width = 14, repr.plot.height = 8)

predictors <- wine_training %>%
    select(fixed_acidity, volatile_acidity, residual_sugar, free_sulfur_dioxide, total_sulfur_dioxide, alcohol)

fixed_acidity_plot <- predictors %>% ggplot(aes(x = fixed_acidity)) +
    geom_histogram(binwidth = .5)+
    xlab("Fixed Acidity (g(tartaric acid)/dm3)")+
    ylab("Count")+
    ggtitle("Fixed Acidity Distribution")
 
volatile_acidity_plot <- predictors %>% ggplot(aes(x = volatile_acidity)) +
    geom_histogram(binwidth = .5)+
    xlab("Volatile Acidity (g(acetic acid)/dm3)")+
    ylab("Count")+
    ggtitle("Volatile Acidity Distribution")

residual_sugar_plot <- predictors %>% ggplot(aes(x = residual_sugar)) +
    geom_histogram(binwidth = .5)+
    xlab("Residual sugar (g/dm3)")+
    ylab("Count")+
    ggtitle("Residual Sugar Distribution")

free_sulfur_dioxide_plot <- predictors %>% ggplot(aes(x = free_sulfur_dioxide)) +
    geom_histogram(binwidth = .5)+
    xlab("Free sulfur dioxide (mg/dm3)")+
    ylab("Count")+
    ggtitle("Free Sulfur Dioxide Distribution")

total_sulfur_dioxide_plot <- predictors %>% ggplot(aes(x = total_sulfur_dioxide)) +
    geom_histogram(binwidth = .5)+
    xlab("Total sulfur dioxide (mg/dm3)")+
    ylab("Count")+
    ggtitle("Total Sulfur Dioxide Distribution")

alcohol_plot <- predictors %>% ggplot(aes(x = alcohol)) +
    geom_histogram(binwidth = .5)+
    xlab("Alcohol (%vol)")+
    ylab("Count")+
    ggtitle("Alcohol Distribution")


predictor_distribution <- plot_grid(fixed_acidity_plot, volatile_acidity_plot, residual_sugar_plot,free_sulfur_dioxide_plot, total_sulfur_dioxide_plot, alcohol_plot)
predictor_distribution

# Notebook output (warning): “package ‘cowplot’ was built under R version 4.0.3”
