* Download the Iris dataset,  and normalise it.
* Run 1st algorithm and 2nd algorithm on it, with values of beta from 1.1 until 3.0 in steps of 0.1 (run 30 times each value of beta). 
* Calculate the adjusted rand index for each, show the average per beta for each algorithm.

In [24]:
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Access the features (X) and target (y) data
X = iris.data
y = iris.target

# Optional: Print the description of the dataset
print(iris.DESCR)


.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [25]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [26]:
n_clusters = 3
n_features = X.shape[1]
n_samples = X.shape[0]
print(f'Number of clusters :{n_clusters}')
print(f"Number of features:{n_features}")
print(f"Number of samples: {n_samples}")

Number of clusters :3
Number of features:4
Number of samples: 150


In [27]:
import numpy as np
def normalizer(data):
    return (data - np.mean(data,axis=0)) / (np.max(data, axis=0) - np.min(data, axis=0))

    

In [28]:
X_norm = normalizer(X)
X_norm

array([[-2.06481481e-01,  1.84444444e-01, -3.99661017e-01,
        -4.16388889e-01],
       [-2.62037037e-01, -2.38888889e-02, -3.99661017e-01,
        -4.16388889e-01],
       [-3.17592593e-01,  5.94444444e-02, -4.16610169e-01,
        -4.16388889e-01],
       [-3.45370370e-01,  1.77777778e-02, -3.82711864e-01,
        -4.16388889e-01],
       [-2.34259259e-01,  2.26111111e-01, -3.99661017e-01,
        -4.16388889e-01],
       [-1.23148148e-01,  3.51111111e-01, -3.48813559e-01,
        -3.33055556e-01],
       [-3.45370370e-01,  1.42777778e-01, -3.99661017e-01,
        -3.74722222e-01],
       [-2.34259259e-01,  1.42777778e-01, -3.82711864e-01,
        -4.16388889e-01],
       [-4.00925926e-01, -6.55555556e-02, -3.99661017e-01,
        -4.16388889e-01],
       [-2.62037037e-01,  1.77777778e-02, -3.82711864e-01,
        -4.58055556e-01],
       [-1.23148148e-01,  2.67777778e-01, -3.82711864e-01,
        -4.16388889e-01],
       [-2.89814815e-01,  1.42777778e-01, -3.65762712e-01,
      

# First algorithm

In [33]:
from sklearn.metrics import adjusted_rand_score
from  WKmeans import wkmeans
from utils import clusters_vec

repetition = 30 # number of repetition for each beta
beta_values = np.arange(1.1, 3.1, 0.1).round(1).tolist()


average_adjusted_rand_score_array = np.zeros_like(beta_values)

for b, beta in enumerate(beta_values):
    adjusted_rand_score_array = np.zeros(repetition)
    for i in range(repetition):
        history = wkmeans(X_norm, n_clusters, beta)
        U = history["U"][-1]
        clusters = clusters_vec(U)
        a_r_s = adjusted_rand_score(clusters, y)
        adjusted_rand_score_array[i] = a_r_s

    average_adjusted_rand_score_array[b] =  adjusted_rand_score_array.mean()   
    





In [32]:
import matplotlib.pyplot as plt

# Plot the data
plt.plot(beta_values,average_adjusted_rand_score_array )

# Add labels and title
plt.xlabel('beta values')
plt.ylabel('Average of Adjusted Rand Score')
plt.title('First Algorithm ')

# Show the plot
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

# Second Algorithm

In [31]:
from sklearn.metrics import adjusted_rand_score
from  SubspaceWKmeans import sub_w_k_means
from utils import clusters_vec

repetition = 30 # number of repetition for each beta
beta_values = np.arange(1.1, 3.1, 0.1).round(1).tolist()


average_adjusted_rand_score_array = np.zeros_like(beta_values)

for b, beta in enumerate(beta_values):
    adjusted_rand_score_array = np.zeros(repetition)
    for i in range(repetition):
        history = sub_w_k_means(X_norm, n_clusters, beta)
        U = history["U"][-1]
        clusters = clusters_vec(U)
        a_r_s = adjusted_rand_score(clusters, y)
        adjusted_rand_score_array[i] = a_r_s

    average_adjusted_rand_score_array[b] =  adjusted_rand_score_array.mean()   
    


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:


# True labels
true_labels = [0, 0, 1, 1, 2, 2]

# Predicted labels
predicted_labels = [0, 0, 1, 1, 3, 3]

# Calculate ARI
ari = adjusted_rand_score(true_labels, predicted_labels)

print("Adjusted Rand Index:", ari)


Adjusted Rand Index: 1.0
