##  2.9.1 Clustering algorithms for sensory quality grade classification

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split

# Load data
# Assume data has been loaded into a dataframe named df
# df = pd.read_csv('data.csv')

# Data Preprocessing
# Separate the candidate feature columns from the label column
# ('target' is assumed to be the name of the label column in df).
X = df.drop(columns=['target'])
y = df['target']

# Keep the 10 features that score highest on the ANOVA F-test against y
feature_selector = SelectKBest(score_func=f_classif, k=10)
X_new = feature_selector.fit_transform(X, y)

# Standardize the selected features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Outlier detection and removal with an Isolation Forest configured for an
# expected 5% share of outliers; fit_predict marks inliers with 1.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(X_scaled)
inlier_mask = outliers == 1
# NOTE: only X_scaled is filtered here; X, X_new and y keep their original
# rows, so they are no longer row-aligned with X_scaled after this point.
X_scaled = X_scaled[inlier_mask]

# 2.9.1 K-means Clustering
# Hyper-parameters shared by every K-means fit in this section
kmeans_settings = dict(init='k-means++', max_iter=300, tol=1e-4, n_init=10, random_state=42)

# Elbow method: collect the within-cluster sum of squares (WCSS), which the
# fitted estimator exposes as `inertia_`, for cluster counts 1 through 10
candidate_counts = range(1, 11)
wcss = [
    KMeans(n_clusters=n_candidate, **kmeans_settings).fit(X_scaled).inertia_
    for n_candidate in candidate_counts
]

# Draw the elbow curve (WCSS against number of clusters)
plt.plot(candidate_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# The cluster count is fixed by hand: k=3 is assumed to be the elbow of the
# curve above and is reused by the GMM and hierarchical sections.
k = 3
kmeans = KMeans(n_clusters=k, **kmeans_settings)
kmeans_labels = kmeans.fit_predict(X_scaled)

# 2.9.2 Gaussian Mixture Model (GMM) Clustering
# Fit a mixture with k components (one full covariance matrix per component)
# and label every sample of X_scaled in the same call.
gmm = GaussianMixture(
    n_components=k,
    covariance_type='full',
    reg_covar=1e-6,
    tol=1e-3,
    max_iter=100,
    random_state=42,
)
gmm_labels = gmm.fit_predict(X_scaled)

# 2.9.3 Hierarchical Clustering (HCA)
# Agglomerative clustering into k clusters using Ward linkage.
# The distance metric is intentionally left at its default (Euclidean), which
# is the only metric Ward linkage accepts. The former `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (passing it now raises
# TypeError), while its replacement `metric` does not exist before 1.2, so
# omitting both keeps this call valid on old and new versions alike.
hca = AgglomerativeClustering(n_clusters=k, linkage='ward')
hca_labels = hca.fit_predict(X_scaled)

# 2.9.4 Clustering Evaluation
# Score each labelling of X_scaled with two internal validity indices:
# Calinski-Harabasz (higher is better) and Davies-Bouldin (lower is better).


def _score_partition(labels):
    """Return (Calinski-Harabasz, Davies-Bouldin) for one labelling of X_scaled."""
    ch_index = calinski_harabasz_score(X_scaled, labels)
    db_index = davies_bouldin_score(X_scaled, labels)
    return ch_index, db_index


# One (CH, DBI) pair per algorithm, in the order K-means, GMM, HCA
ch_score_kmeans, dbi_score_kmeans = _score_partition(kmeans_labels)
ch_score_gmm, dbi_score_gmm = _score_partition(gmm_labels)
ch_score_hca, dbi_score_hca = _score_partition(hca_labels)

# Print evaluation results, one line per algorithm
evaluation_report = (
    f'K-means - Calinski-Harabasz Index: {ch_score_kmeans}, Davies-Bouldin Index: {dbi_score_kmeans}',
    f'GMM - Calinski-Harabasz Index: {ch_score_gmm}, Davies-Bouldin Index: {dbi_score_gmm}',
    f'HCA - Calinski-Harabasz Index: {ch_score_hca}, Davies-Bouldin Index: {dbi_score_hca}',
)
for report_line in evaluation_report:
    print(report_line)

# Visualize clustering results
# Scatter the samples on the first two columns of X_scaled (the first two
# selected, standardized features) and colour each point by its K-means label.
first_feature = X_scaled[:, 0]
second_feature = X_scaled[:, 1]
plt.scatter(first_feature, second_feature, c=kmeans_labels, cmap='viridis')
plt.title('K-means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# The same plot can be produced for gmm_labels and hca_labels by swapping the
# array passed to `c`.
