# Penguins Dataset Analysis - Clustering
### Mehmet Emin Titrek - 2018300231

## Importing necessary libraries

In [None]:
library(data.table)
library(tidyverse)
library(scales)
library(NbClust)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(psych)
library(BBmisc)
library(stats)
library(fastcluster)

## Loading dataset

In [None]:
# Raw CSV of the penguins dataset hosted on GitHub
url <- "https://raw.githubusercontent.com/MeminT99/ad455/main/assignment-10/penguins.csv"
# Download and parse the file in one step
penguin_data <- data.table::fread(input = url)

# Overview of the columns, their types, and the first few values
str(penguin_data)

- **species:** penguin species (Chinstrap, Adélie, or Gentoo)
- **culmen_length_mm:** culmen length (mm)
- **culmen_depth_mm:** culmen depth (mm)
- **flipper_length_mm:** flipper length (mm)
- **body_mass_g:** body mass (g)
- **island:** island name (Dream, Torgersen, or Biscoe) in the Palmer Archipelago (Antarctica)
- **sex:** penguin sex

In [None]:
# Count, per column, the entries that are either NA or an empty string.
# across(everything(), ...) replaces the superseded summarise_all().
missing_raw <- penguin_data %>%
  summarise(across(everything(), ~ sum(is.na(.x) | .x == "")))

missing_raw

**We have a few missing values in the data!!!**

In [None]:
# Number of distinct values in each column.
# vapply() guarantees a named integer vector, whereas sapply()'s return type
# depends on its input.
vapply(penguin_data, function(x) length(unique(x)), integer(1))

In [None]:
# Build the analysis table:
#   1. drop the `sex` column,
#   2. drop every row that still contains an NA,
#   3. standardize each numeric column (BBmisc::normalize, z-score method),
#   4. convert the two categorical columns to factors.
DT <- penguin_data %>%
  select(-sex) %>%
  na.omit() %>%
  mutate(across(where(is.numeric), ~ normalize(.x, method = "standardize"))) %>%
  mutate(
    species = as.factor(species),
    island = as.factor(island)
  )

# Re-run the missing-value count on the cleaned table; every entry should be 0
missing_DT <- DT %>%
  summarise(across(everything(), ~ sum(is.na(.x) | .x == "")))

missing_DT

---
- Removed rows with missing values
- Removed the irrelevant column (sex)
- Standardized the numeric variables (z-scores via `BBmisc::normalize()`)
- Converted the categorical variables (species, island) to factors
---

## Separating the data into numeric and non-numeric columns

In [None]:
# The two categorical columns
DT_chr <- select(DT, species, island)

# All remaining columns; this table is the input to the clustering steps
DT_num <- select(DT, -species, -island)

## Variable distributions

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)

# One histogram (with a kernel density overlay) per numeric variable
hist_plots <- list()

for (var in names(DT_num)) {
  hist_plots[[var]] <- ggplot(DT_num, aes(x = !!sym(var))) +
    # Draw the histogram on the density scale so that it shares a y-axis with
    # geom_density(); on the default count scale the density curve is
    # flattened against the x-axis and cannot be read.
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 20, alpha = 0.5, fill = "blue"
    ) +
    theme(plot.title = element_blank()) +
    geom_density()
}

# Lay the panels out side by side
grid.arrange(grobs = hist_plots, ncol = 4)

## Correlation plot

In [None]:
options(repr.plot.width = 9, repr.plot.height = 9)

# Spearman rank correlations between the numeric variables
cor_mat <- cor(DT_num, method = "spearman")
# Matrix of p-values for the same correlations, from psych::corr.test()
cor_test_mat <- corr.test(DT_num, method = "spearman")$p

# Upper-triangle heat map with coefficients printed in each cell; cells whose
# p-value exceeds sig.level are left blank
corrplot(
  cor_mat,
  method = "color",
  type = "upper",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  p.mat = cor_test_mat,
  sig.level = 0.005,
  insig = "blank"
)

## K-means clustering

### Optimal k with NbClust - kmeans

In [None]:
# k-means starts from randomly chosen centers, so fix the seed to make the
# NbClust index values (and the recommended k) reproducible between runs.
set.seed(455)
# Evaluate k = 2..10 with all NbClust indices, using k-means as the
# partitioning method
DT_nb <- NbClust::NbClust(DT_num, min.nc = 2, max.nc = 10, index = "all", method = "kmeans")

In [None]:
# Display the full NbClust result object produced above
DT_nb

### Combining cluster labels and the data

In [None]:
# Seed the RNG so the random initial centers, and hence the cluster labels,
# are the same on every run
set.seed(455)
# Partition the standardized measurements into two clusters
DT_kmeans <- stats::kmeans(x = DT_num, centers = 2)
DT_kmeans

In [None]:
# Number of observations assigned to each cluster
DT_kmeans[["size"]]

In [None]:
# Attach each row's k-means cluster label (as a factor) to the cleaned data
DT_cluster <- cbind(
  DT,
  clust = as.factor(DT_kmeans[["cluster"]])
)

# Confirm the new `clust` column and its type
str(DT_cluster)

In [None]:
# First six rows of the labelled data
head(DT_cluster, n = 6L)

In [None]:
# Last six rows of the labelled data
tail(DT_cluster, n = 6L)

## Explore and visualize the data summaries across clusters for k-means clustering:

In [None]:
# Calculate summary statistics for each cluster
cluster_summary <- DT_cluster %>%
  group_by(clust) %>%
  summarize(
    culmen_length_mean = mean(culmen_length_mm),
    culmen_depth_mean = mean(culmen_depth_mm),
    flipper_length_mean = mean(flipper_length_mm),
    body_mass_mean = mean(body_mass_g)
  )

cluster_summary

The `cluster_summary` table summarizes the mean values of numeric variables across two clusters (`clust`). 

- **Cluster 1**:
  - **Culmen Length Mean**: 0.6562677
  - **Culmen Depth Mean**: -1.0983711
  - **Flipper Length Mean**: 1.1571696
  - **Body Mass Mean**: 1.0901639

- **Cluster 2**:
  - **Culmen Length Mean**: -0.3685887
  - **Culmen Depth Mean**: 0.6168934
  - **Flipper Length Mean**: -0.6499172
  - **Body Mass Mean**: -0.6122838

**Cluster 1 has a higher mean than Cluster 2 for every variable except culmen depth, where Cluster 2 has the higher mean.**

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)

# Keep only the measurements and the cluster label
DT_cluster_slct <- select(DT_cluster, -species, -island)
# Every column except the cluster label gets its own panel
names_without_clust <- setdiff(names(DT_cluster_slct), "clust")

# One box plot per measurement, comparing its distribution across clusters
box_plots <- lapply(names_without_clust, function(var) {
  ggplot(DT_cluster_slct, aes(x = clust, y = !!sym(var), fill = clust)) +
    geom_boxplot() +
    labs(x = "Cluster", y = var, fill = "Cluster") +
    theme_minimal()
})
names(box_plots) <- names_without_clust

# Arrange the panels in a 2 x 2 grid
grid.arrange(grobs = box_plots, ncol = 2, nrow = 2)

### Cluster - Species

In [None]:
options(repr.plot.width = 10, repr.plot.height = 5)

# Side-by-side bars: number of penguins of each species within each cluster
ggplot(data = DT_cluster, mapping = aes(x = clust, fill = species)) +
  theme_minimal() +
  labs(x = "Cluster", fill = "Species") +
  geom_bar(position = "dodge", color = "black")

### Cluster - Island

In [None]:
options(repr.plot.width = 10, repr.plot.height = 5)

# Side-by-side bars: number of penguins from each island within each cluster
ggplot(data = DT_cluster, mapping = aes(x = clust, fill = island)) +
  theme_minimal() +
  labs(x = "Cluster", fill = "Island") +
  geom_bar(position = "dodge", color = "black")

### Optimal k with NbClust - ward.D2

In [None]:
# Evaluate k = 2..10 with all NbClust indices, this time using Ward (ward.D2)
# hierarchical clustering on Euclidean distances
DT_wardD2 <- NbClust(
  data = DT_num,
  distance = "euclidean",
  method = "ward.D2",
  index = "all",
  min.nc = 2,
  max.nc = 10
)

DT_wardD2

### Hierarchical clustering

In [None]:
options(repr.plot.width = 15, repr.plot.height = 15)
# Pairwise Euclidean distances between observations.
# The method must be spelled "euclidean": dist() partial-matches the name
# against its list of methods, and the misspelling "euclidian" matches none
# of them, which stops with "invalid distance method".
dis <- dist(DT_num, method = "euclidean")
# Heat map of the distance matrix
factoextra::fviz_dist(dis)

In [None]:
# Agglomerative clustering of the distance matrix with complete linkage
hc <- hclust(d = dis, method = "complete")
hc

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)
# Dendrogram of the complete-linkage tree. hang = -1 aligns all leaves at the
# baseline; leaf labels are suppressed. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
plot(hc, hang = -1, labels = FALSE, main = "Complete-Linkage")

In [None]:
# Cut the tree into two groups; one integer label per observation
comp2 <- cutree(hc, k = 2)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)
# Redraw the dendrogram with leaves coloured by their two-group membership
sparcl::ColorDendrogram(
  hc,
  y = comp2,
  main = "Complete",
  branchlength = 30
)