In [5]:
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az

# For printing in a nice format
from IPython.display import display

# Read the Iris dataset (ensure iris.csv is in the same directory)
df = pd.read_csv("iris.csv")
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
def fit_1d_gaussian_mixture(data, k=3, tune=10, draws=10, chains=2):
    """
    Fits a 1D Gaussian Mixture Model with 'k' components to 'data'.
    Returns the pymc Trace (InferenceData).
    """
    with pm.Model() as model:
        # Mixture component weights
        weights = pm.Dirichlet('weights', np.ones(k), shape=k)

        # Means of each mixture component
        means = pm.Normal('means', mu=0, sigma=5, shape=k)

        # Standard deviations of each mixture component
        sds = pm.HalfNormal('sds', sigma=5, shape=k)

        # The observed mixture
        pm.NormalMixture(
            'obs',
            w=weights,
            mu=means,
            sigma=sds,
            observed=data
        )

        # Sample (small tune/draws just for demo)
        trace = pm.sample(
            tune=tune,
            draws=draws,
            chains=chains,
            random_seed=42,
            progressbar=True
        )

    return trace


In [7]:
features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
traces = {}

for feature in features:
    print(f"\nFitting mixture for: {feature}")
    data_1d = df[feature].values
    trace = fit_1d_gaussian_mixture(data_1d, k=3, tune=10, draws=10, chains=2)
    traces[feature] = trace


Only 10 samples in chain.



Fitting mixture for: sepal_length


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [weights, means, sds]


Sampling 2 chains for 10 tune and 10 draw iterations (20 + 20 draws total) took 12 seconds.
  trace = pm.sample(



Fitting mixture for: sepal_width


Only 10 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [weights, means, sds]


Sampling 2 chains for 10 tune and 10 draw iterations (20 + 20 draws total) took 20 seconds.
Only 10 samples in chain.



Fitting mixture for: petal_length


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [weights, means, sds]


Sampling 2 chains for 10 tune and 10 draw iterations (20 + 20 draws total) took 10 seconds.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
Only 10 samples in chain.



Fitting mixture for: petal_width


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [weights, means, sds]


Sampling 2 chains for 10 tune and 10 draw iterations (20 + 20 draws total) took 15 seconds.
The acceptance probability does not match the target. It is 2.08e-28, but should be close to 0.8. Try to increase the number of tuning steps.


In [8]:
import itertools

# Map species strings to numeric labels: 0,1,2
species_labels = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
df['species_num'] = df['species'].map(species_labels)

def get_cluster_assignments(data, trace):
    """
    Given data 1D and a trace (InferenceData), return the cluster assignment
    for each data point using the posterior mean of mixture parameters.
    """
    # Posterior means (averaged over all draws and chains)
    weight_means = trace.posterior["weights"].mean(dim=("chain", "draw")).values
    mean_means   = trace.posterior["means"].mean(dim=("chain", "draw")).values
    sds_means    = trace.posterior["sds"].mean(dim=("chain", "draw")).values

    # For each x in data, compute p(cluster=k | x)
    # p_k(x) = weight_means[k] * NormalPDF(x|mean_means[k], sds_means[k])
    # We'll assign the cluster with the highest p_k(x).

    def normal_pdf(x, mu, sigma):
        return (1.0 / (sigma * np.sqrt(2*np.pi))) * np.exp(-0.5 * ((x - mu)/sigma)**2)

    cluster_assignments = []
    for x in data:
        # compute each cluster's probability (not normalized across clusters, but it's enough to pick argmax)
        cluster_probs = [
            weight_means[k] * normal_pdf(x, mean_means[k], sds_means[k])
            for k in range(3)
        ]
        # pick the cluster with max probability
        cluster_assignments.append(np.argmax(cluster_probs))

    return np.array(cluster_assignments)

def best_accuracy(true_labels, cluster_labels):
    """
    Find the best mapping from cluster_labels={0,1,2} to species labels={0,1,2}
    so that accuracy is maximized. Return that accuracy.
    """
    best_acc = 0.0
    for perm in itertools.permutations([0,1,2]):
        # perm is a tuple like (0,1,2), meaning cluster0->0, cluster1->1, cluster2->2
        mapped = np.array([perm[c] for c in cluster_labels])
        acc = np.mean(mapped == true_labels)
        if acc > best_acc:
            best_acc = acc
    return best_acc

results = []

for feature in features:
    data_1d = df[feature].values
    trace = traces[feature]
    clusters = get_cluster_assignments(data_1d, trace)
    acc = best_accuracy(df['species_num'].values, clusters)
    results.append((feature, acc))

print("Accuracy for each feature (1D mixture, best cluster-species mapping):")
for feature, acc in results:
    print(f"{feature:15s} -> Accuracy: {acc:.3f}")


Accuracy for each feature (1D mixture, best cluster-species mapping):
sepal_length    -> Accuracy: 0.413
sepal_width     -> Accuracy: 0.333
petal_length    -> Accuracy: 0.667
petal_width     -> Accuracy: 0.333
