In [None]:
# Cocktail Clustering Project
# Exploratory data analysis and clustering of cocktail data using Machine Learning for the Solvro student research group.

# This notebook includes exploratory data analysis (EDA), data preprocessing, clustering with K-Means, and visualization of the results.

# Import necessary libraries for data analysis, preprocessing, clustering, and visualization
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# 1. EDA (Exploratory Data Analysis)
# Goal: Perform an initial exploration of the dataset to understand the distribution of categories, tags, and ingredients.

# Load the dataset from the specified file path (resolved relative to the current working directory)
file_path = "../data/cocktail_dataset.json"
cocktail_data = pd.read_json(file_path)

# Show the first few rows of the dataset to get an overview.
# A bare expression is only rendered when it is the last statement of a notebook cell,
# so print explicitly to keep the output visible here and when run as a plain script.
print(cocktail_data.head())

# Display basic info about the dataset, including column names, types, and non-null counts
# (info() prints its report itself, so no print() wrapper is needed)
cocktail_data.info()

# Check for any missing values in the dataset
print("\nMissing values:\n", cocktail_data.isnull().sum())

# Count occurrences of each tag to see which are most common.
# explode() gives every tag its own row; presumably "tags" holds a list per cocktail.
tag_counts = cocktail_data.explode("tags")["tags"].value_counts()
print("Tag counts:\n", tag_counts)

# Count occurrences of each category to understand the distribution across cocktail types
category_counts = cocktail_data["category"].value_counts()
print("\nCategory counts:\n", category_counts)

# Store the number of ingredients per cocktail as a new column and print its summary statistics.
# The column stays in the frame, so later steps that use cocktail_data will see it.
cocktail_data["num_ingredients"] = cocktail_data["ingredients"].apply(len)
print("\nIngredient count statistics:\n", cocktail_data["num_ingredients"].describe())

# 2. Data Preprocessing
# Goal: Turn the categorical columns into indicator features and rescale every feature before clustering.

# Indicator (one-hot) columns for the 'category' and 'glass' fields
category_encoded = pd.get_dummies(cocktail_data["category"], prefix="category")
glass_encoded = pd.get_dummies(cocktail_data["glass"], prefix="glass")

# Tags: put each tag on its own row, encode it, then sum the indicator rows
# that share an original index so every cocktail ends up with a single row again
exploded_tags = cocktail_data["tags"].explode()
tag_indicators = pd.get_dummies(exploded_tags, prefix="tag")
tags_encoded = tag_indicators.groupby(level=0).sum()

# Attach all encoded columns to the right of the original columns
encoded_frames = [cocktail_data, category_encoded, glass_encoded, tags_encoded]
cocktail_data_encoded = pd.concat(encoded_frames, axis="columns")

# Columns that are identifiers, free text, timestamps, or raw (un-encoded) fields;
# they are removed so that only the feature columns are passed to the scaler
non_feature_columns = [
    "id",
    "name",
    "category",
    "glass",
    "tags",
    "createdAt",
    "updatedAt",
    "instructions",
    "imageUrl",
    "ingredients",
]
numeric_data = cocktail_data_encoded.drop(columns=non_feature_columns)

# Rescale the remaining columns with MinMaxScaler
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(numeric_data)

# Wrap the scaled array in a DataFrame again, reusing the feature column names
normalized_df = pd.DataFrame(data=normalized_data, columns=numeric_data.columns)

# 3. Clustering
# Goal: Use K-Means clustering to group cocktails based on the normalized feature matrix (normalized_df).
# Note: individual ingredients are not encoded as features; only the columns left in normalized_df are used.

# Number of clusters to fit (this number can be adjusted)
N_CLUSTERS = 5

# n_init is pinned explicitly: scikit-learn changed its default from 10 to "auto"
# (emitting a FutureWarning in the releases leading up to the change), so leaving it
# unset makes the resulting clusters depend on the installed version.
# random_state keeps the centroid initialization reproducible between runs.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(normalized_df)

# Add the resulting cluster labels to the encoded dataset
cocktail_data_encoded["cluster"] = kmeans.labels_

# Calculate the Silhouette Score to evaluate the quality of clustering
sil_score = silhouette_score(normalized_df, kmeans.labels_)
print(f"Silhouette Score (K-Means): {sil_score}")

# 4. Visualization
# Goal: Show the K-Means clusters on a 2D scatter plot after reducing the features with PCA.

# Project the normalized features onto two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(normalized_df)

# Draw one point per row of the projected data, colored by its K-Means label
fig, ax = plt.subplots(figsize=(10, 7))
first_component = pca_result[:, 0]
second_component = pca_result[:, 1]
points = ax.scatter(
    first_component,
    second_component,
    c=kmeans.labels_,
    cmap="viridis",
    s=50,
    alpha=0.7,
)
ax.set_title("Cocktail Clusters (PCA) - K-Means")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")

# The colorbar maps the point colors back to cluster labels
fig.colorbar(points, ax=ax, label="Cluster")
plt.show()

# 5. Summary
# This project demonstrates the clustering of cocktail data using exploratory data analysis, data preprocessing, and K-Means clustering.

# Key insights:
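# - The dataset was divided into 5 clusters based on the encoded category, glass, and tag columns together with the remaining numeric columns such as the ingredient count (individual ingredients are not encoded as features).
# - In the original run, the Silhouette Score indicated moderate separation between clusters, suggesting that the clustering is meaningful but could potentially be refined; re-check the printed score whenever the data or parameters change.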

# Further improvements could include experimenting with different clustering algorithms, fine-tuning the number of clusters, or using additional preprocessing techniques.