In [None]:
# Datasets:
# This notebook performs manual K-means clustering calculations over several spreadsheet (.ods) files.
# Dataset path: "/content/drive/MyDrive/Transformer_Review/third_work/results/{Dataset_name}/{search_technique_captions}.ods"
# Run it 6 times, once per file: there are 3 datasets and 2 search techniques, so 6 files in total.
# Dataset link: https://drive.google.com/drive/folders/1doSn3PxWoks5yA9vLRWUcObkpOotxDJp?usp=sharing

In [None]:
!pip install odfpy

Collecting odfpy
  Downloading odfpy-1.4.1.tar.gz (717 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.0/717.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: odfpy
  Building wheel for odfpy (setup.py) ... [?25l[?25hdone
  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160672 sha256=239cd78ccf9e25590cf49256dd1a0b50fc6e5dbdf345d0083406a7e1b65187b0
  Stored in directory: /root/.cache/pip/wheels/d6/1d/c8/8c29be1d73ca42d15977c75193d9f39a98499413c2838ac54c
Successfully built odfpy
Installing collected packages: odfpy
Successfully installed odfpy-1.4.1


In [None]:
# Mount Google Drive at /content/drive; the results spreadsheet loaded later in
# this notebook is read from a path under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from scipy.stats import gmean

# Function to initialize centroids
def initialize_centroids(data, n_clusters=3, random_state=42):
    """
    Produce starting centroids from a k-means++-seeded scikit-learn KMeans.

    Parameters
    ----------
    data : array-like handed unchanged to ``KMeans.fit``.
    n_clusters : number of centroids to produce (default 3).
    random_state : seed forwarded to ``KMeans`` (default 42).

    Returns
    -------
    The ``cluster_centers_`` attribute of the fitted estimator.
    """
    # NOTE(review): the estimator is fitted with max_iter=1, so the returned
    # centers may already include one update step rather than being the raw
    # k-means++ seeds -- confirm this is the intended starting point.
    seeding_options = dict(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=1,
        max_iter=1,
        random_state=random_state,
    )
    return KMeans(**seeding_options).fit(data).cluster_centers_

# Function to perform manual K-means clustering
def kmeans_manual(data, centroids, max_iter=300, tol=1e-4):
    """
    Run Lloyd-style K-Means iterations starting from the given centroids.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like of numeric values, one row per point.
    centroids : 2-D array-like of initial centroids, one row per cluster.
    max_iter : maximum number of assign/update iterations.
    tol : convergence threshold; iteration stops once every centroid moves
        by less than ``tol`` (Euclidean norm) in one update.

    Returns
    -------
    (centroids, distances, cluster_assignments)
        ``centroids`` is a float array of shape (k, d). ``distances`` has shape
        (n, k) and holds the Euclidean distance of every point to each returned
        centroid. ``cluster_assignments`` has shape (n,) and holds the index of
        the nearest returned centroid for every point.

    Notes
    -----
    * A cluster that receives no points keeps its previous centroid. Taking the
      mean of zero rows would yield NaN, and ``np.argmin`` then selects the NaN
      column for every point, which collapses the clustering.
    * The returned distances and assignments are always computed against the
      returned centroids, including when ``max_iter`` is exhausted or is 0.
    """
    points = np.asarray(data, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    n_clusters = centroids.shape[0]

    def _assign(current_centroids):
        # Distance of every point to every centroid, then the nearest centroid.
        dist = np.linalg.norm(points[:, np.newaxis, :] - current_centroids, axis=2)
        return dist, np.argmin(dist, axis=1)

    for i in range(max_iter):
        distances, cluster_assignments = _assign(centroids)

        # Recalculate centroids; start from a copy so empty clusters stay put.
        new_centroids = centroids.copy()
        for j in range(n_clusters):
            members = points[cluster_assignments == j]
            if members.shape[0] > 0:
                new_centroids[j] = members.mean(axis=0)

        # Check for convergence
        if np.all(np.linalg.norm(new_centroids - centroids, axis=1) < tol):
            print(f"Converged after {i+1} iterations")
            # distances/assignments above were computed against `centroids`.
            return centroids, distances, cluster_assignments
        centroids = new_centroids

    # No convergence (or max_iter == 0): refresh the assignment so the returned
    # distances and labels match the returned centroids.
    distances, cluster_assignments = _assign(centroids)
    return centroids, distances, cluster_assignments

# Function to compute geometric mean of centroids
def get_geometric_mean(centroids):
    """
    Map each centroid's position to the geometric mean of its coordinates.

    Returns a dict keyed by the centroid's index within ``centroids``; each
    value is ``scipy.stats.gmean`` applied to that single centroid.
    """
    return {cluster_id: gmean(coords) for cluster_id, coords in enumerate(centroids)}

# Function to validate clustering
def validate_clustering(data, centroids, distances, clusters):
    """
    Print and return summary diagnostics for a clustering result.

    Parameters
    ----------
    data : the clustered dataset. Not used by the computation; kept so
        existing callers do not need to change.
    centroids : array of shape (k, d) with one centroid per row.
    distances : array-like of shape (n, k) with point-to-centroid distances.
    clusters : array-like of shape (n,) with the cluster index of each point.

    Returns
    -------
    (cluster_sizes, gm_values, sorted_clusters)
        ``cluster_sizes`` maps cluster index -> number of assigned points
        (plain ``int``). ``gm_values`` is the result of
        ``get_geometric_mean(centroids)``. ``sorted_clusters`` lists cluster
        indices ordered by descending geometric mean.
    """
    distances = np.asarray(distances)
    clusters = np.asarray(clusters)
    n_clusters = centroids.shape[0]

    # Check cluster sizes (plain ints keep the printed dict free of NumPy reprs)
    cluster_sizes = {i: int(np.sum(clusters == i)) for i in range(n_clusters)}
    print("Cluster Sizes:", cluster_sizes)

    # Calculate geometric means of centroids
    gm_values = get_geometric_mean(centroids)
    print("Geometric Mean of Centroids:", gm_values)

    # Examine distances within each cluster
    for i in range(n_clusters):
        cluster_distances = distances[clusters == i, i]
        if cluster_distances.size == 0:
            # np.max raises ValueError on an empty array, so report and move on.
            print(f"Cluster {i} - No points assigned")
            continue
        print(f"Cluster {i} - Average Distance to Centroid: {np.mean(cluster_distances):.4f}")
        print(f"Cluster {i} - Max Distance to Centroid: {np.max(cluster_distances):.4f}")

    # Rank clusters from highest to lowest geometric mean
    sorted_clusters = sorted(gm_values, key=gm_values.get, reverse=True)
    print("Clusters ranked by Geometric Mean (Good -> Bad):", sorted_clusters)

    return cluster_sizes, gm_values, sorted_clusters





In [None]:
# Main execution: load one results spreadsheet, cluster its rows, report diagnostics.
path = '/content/drive/MyDrive/Transformer_Review/third_work/results/UCM/greedy_captions.ods'
data = pd.read_excel(path, engine='odf', index_col=0)

# Rescale the CIDEr column by 1/5 (normalization step).
# NOTE(review): presumably this brings CIDEr onto the range of the other
# columns -- confirm the divisor against the metric's definition.
data['CIDEr'] = data['CIDEr'].div(5)

# Seed three centroids from the raw value matrix
feature_matrix = data.to_numpy()
initial_centroids = initialize_centroids(feature_matrix, n_clusters=3)
print("Initialized Centroids:\n", initial_centroids)

# Refine the seeds with the manual K-means loop
clustering_result = kmeans_manual(data, centroids=initial_centroids)
final_centroids, distances, clusters = clustering_result
print("Final Centroids:\n", final_centroids)

# Summarize cluster sizes, geometric means and within-cluster distances
validation_report = validate_clustering(
    data, final_centroids, distances=distances, clusters=clusters
)
cluster_sizes, gm_values, sorted_clusters = validation_report

Initialized Centroids:
 [[0.79682    0.721      0.66326    0.61324    0.4197     0.75604
  0.632956  ]
 [0.835725   0.770325   0.7152     0.66505    0.450775   0.7975
  0.692625  ]
 [0.82503333 0.7501     0.69026667 0.63823333 0.43666667 0.7783
  0.66923333]]
Converged after 3 iterations
Final Centroids:
 [[0.782      0.7023     0.64215    0.59075    0.40925    0.7391
  0.60785   ]
 [0.835725   0.770325   0.7152     0.66505    0.450775   0.7975
  0.692625  ]
 [0.81586667 0.74178333 0.6838     0.63323333 0.43166667 0.77281667
  0.65946333]]
Cluster Sizes: {0: 2, 1: 4, 2: 6}
Geometric Mean of Centroids: {0: 0.6274921232246911, 1: 0.6923336007750995, 2: 0.665230434810189}
Cluster 0 - Average Distance to Centroid: 0.0191
Cluster 0 - Max Distance to Centroid: 0.0191
Cluster 1 - Average Distance to Centroid: 0.0243
Cluster 1 - Max Distance to Centroid: 0.0324
Cluster 2 - Average Distance to Centroid: 0.0238
Cluster 2 - Max Distance to Centroid: 0.0410
Clusters ranked by Geometric Mean (Good 