In [1]:

!pip install ucimlrepo kmodes scikit-learn


from ucimlrepo import fetch_ucirepo
from kmodes.kmodes import KModes
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import numpy as np


zoo = fetch_ucirepo(id=111)


X = zoo.data.features.astype(int)
y = zoo.data.targets.values.ravel()


print(zoo.metadata)
print(zoo.variables)


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: ucimlrepo, kmodes
Successfully installed kmodes-0.12.2 ucimlrepo-0.0.6
{'uci_id': 111, 'name': 'Zoo', 'repository_url': 'https://archive.ics.uci.edu/dataset/111/zoo', 'data_url': 'https://archive.ics.uci.edu/static/public/111/data.csv', 'abstract': 'Artificial, 7 classes of animals', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 101, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['type'], 'index_col': ['animal_name'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1990, 'last_updated': 'Fri Sep 15 2023', 'dataset_doi': '10.24432/C5R59V', 'creators': ['Richard Forsyth'], 'intro_paper': None, 'additional_info': {'summary': 'A simple database containing 17 Boolea

In [2]:
def initialize_parameters(n_clusters, n_features):
    """Initialize the parameters of the Bernoulli Mixture Model."""
    np.random.seed(42)
    weights = np.random.rand(n_clusters)
    weights /= weights.sum()
    probabilities = np.random.rand(n_clusters, n_features)
    return weights, probabilities

def stochastic_e_step(data, weights, probabilities):
    """Perform the stochastic E-step of the EM algorithm."""
    n_clusters = weights.shape[0]
    responsibilities = np.zeros((data.shape[0], n_clusters))
    for k in range(n_clusters):
        responsibilities[:, k] = weights[k] * np.prod(probabilities[k]**data * (1 - probabilities[k])**(1 - data), axis=1)
    responsibilities /= responsibilities.sum(axis=1, keepdims=True)
    return responsibilities

def m_step(data, responsibilities):
    """Perform the M-step of the EM algorithm."""
    weights = responsibilities.mean(axis=0)
    probabilities = (responsibilities.T @ data) / responsibilities.sum(axis=0)[:, None]
    return weights, probabilities

def bernoulli_mixture_model(data, n_clusters, n_iterations):
    """Fit a Bernoulli Mixture Model using the Stochastic EM algorithm."""
    n_features = data.shape[1]
    weights, probabilities = initialize_parameters(n_clusters, n_features)
    for _ in range(n_iterations):
        responsibilities = stochastic_e_step(data, weights, probabilities)
        weights, probabilities = m_step(data, responsibilities)
    return weights, probabilities, responsibilities

# Clustering with Bernoulli Mixture Model
n_clusters = len(np.unique(y))
weights, probabilities, responsibilities = bernoulli_mixture_model(X.values, n_clusters, 100)
bmm_labels = np.argmax(responsibilities, axis=1)


In [3]:
def run_kmodes(data, n_clusters):
    """Function to run K-Modes clustering."""
    km = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=1)
    clusters = km.fit_predict(data)
    return clusters

# Clustering with K-Modes
km_labels = run_kmodes(X, n_clusters)


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 13, cost: 147.0
Run 1, iteration: 2/100, moves: 5, cost: 142.0
Run 1, iteration: 3/100, moves: 1, cost: 142.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 25, cost: 160.0
Run 2, iteration: 2/100, moves: 0, cost: 160.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 4, cost: 152.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 15, cost: 159.0
Run 4, iteration: 2/100, moves: 0, cost: 159.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 20, cost: 198.0
Run 5, iteration: 2/100, moves: 24, cost: 158.0
Run 5, iteration: 3/100, moves: 1, cost: 158.0
Best run was number 1


In [4]:
def evaluate_clusters(true_labels, predicted_labels):
    """Evaluate clustering using ARI and NMI."""
    ari = adjusted_rand_score(true_labels, predicted_labels)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    return ari, nmi

bmm_ari, bmm_nmi = evaluate_clusters(y, bmm_labels)
km_ari, km_nmi = evaluate_clusters(y, km_labels)

print(f"BMM - ARI: {bmm_ari}, NMI: {bmm_nmi}")
print(f"K-Modes - ARI: {km_ari}, NMI: {km_nmi}")


BMM - ARI: 0.6912237056693479, NMI: 0.7769102617551067
K-Modes - ARI: 0.9189916284665557, NMI: 0.8907854844326066


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming you have already the 'X' and 'y' loaded, and clustering labels 'bmm_labels' and 'km_labels' obtained.

# Convert X to a pandas DataFrame for easier manipulation
X_df = pd.DataFrame(X, columns=zoo.variables.names)

# Feature Distributions
def plot_feature_distributions(data, variables_info):
    fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))  # Adjust the grid size based on the number of features
    axes = axes.flatten()
    for i, col in enumerate(data.columns):
        sns.countplot(x=col, data=data, ax=axes[i])
        axes[i].set_title(f'Distribution of {col}', fontsize=9)
        axes[i].set_xlabel('')
        axes[i].set_ylabel('Count')
    plt.tight_layout()
    plt.show()

plot_feature_distributions(X_df, zoo.variables)


AttributeError: 'DataFrame' object has no attribute 'names'