In [1]:
# 1. Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.cluster import KMeans

from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import seaborn as sns
from matplotlib.colors import ListedColormap
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:
# Location of the CSV relative to the notebook. The path is assembled with
# os.path.join so the separator matches the host OS; a literal backslash path
# is only understood as a directory path on Windows, and elsewhere
# os.path.basename would return the whole string instead of the file name.
dataSetPath = os.path.join("..", "1_DataSets", "iris.csv")

dataset = pd.read_csv(dataSetPath)
# File name without its directory part; later cells embed it in the name of
# the folder that receives the K-Means plots.
dataSetName = os.path.basename(dataSetPath)

print(f"Dataset Columns: {dataset.columns.to_list()}")
print("****************************")
print(f"Dataset Shape: {dataset.shape}")
print("****************************")

# Row count of the file as loaded, kept so later cells can report how many
# rows the cleaning steps removed.
totalNumberOfRowInActualDataset = len(dataset)
print(f"Total Rows in dataset File: {totalNumberOfRowInActualDataset}")


In [None]:
# Remove exact duplicate rows from the loaded frame (the frame itself is modified).
dataset.drop_duplicates(inplace=True)

# Rows that survive de-duplication, and how many were thrown away compared
# with the row count recorded when the file was loaded.
numberOfRowAfterRemovingDuplicates = len(dataset)
duplicateRowsRemoved = totalNumberOfRowInActualDataset - numberOfRowAfterRemovingDuplicates

print(
    f"Dataset Shape after removing duplcates: {dataset.shape}",
    "****************************",
    f"Remaining Rows in Dataset: {numberOfRowAfterRemovingDuplicates}",
    "****************************",
    f"Total Duplicates: {duplicateRowsRemoved}",
    sep="\n",
)

In [None]:
# Step 2: Discard every row that still contains a missing value
cleanedDataset = dataset.dropna()

# Size after the null filter, and the number of rows that filter removed
# relative to the de-duplicated row count.
numberOfRowAfterRemovingNullValues = len(cleanedDataset)
rowsDroppedForNulls = numberOfRowAfterRemovingDuplicates - numberOfRowAfterRemovingNullValues

print(
    f"Remaining Rows in Dataset: {numberOfRowAfterRemovingNullValues}",
    "****************************",
    f"Removed Number of rows containing Null Values: {rowsDroppedForNulls}",
    sep="\n",
)

In [None]:
# Distinct values present in the last column of the cleaned frame
# (the column the following cells treat as the target).
lastColumnName = cleanedDataset.columns.to_list()[-1]
target_column = cleanedDataset[lastColumnName].unique()
print(target_column)

In [None]:
# Preview the first rows of the cleaned frame as the cell's displayed output.
cleanedDataset.head()

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame.
cleanedDataset.info()

In [8]:
# Encode the target (last) column as integer category codes when it holds labels.
targetColumnName = cleanedDataset.columns.to_list()[-1]
targetValues = cleanedDataset[targetColumnName]

# Accept both classic object columns and pandas' dedicated string dtype, so
# text labels are encoded regardless of how the CSV reader typed them.
targetIsCategorical = (
    pd.api.types.is_object_dtype(targetValues)
    or pd.api.types.is_string_dtype(targetValues)
)

if targetIsCategorical:
    # cleanedDataset was produced by dropna(); writing into it directly can raise
    # SettingWithCopyWarning on pandas versions that track copies, so take an
    # explicit copy before assigning the encoded column.
    cleanedDataset = cleanedDataset.copy()
    # Codes are integers assigned per distinct label by the categorical dtype.
    cleanedDataset[targetColumnName] = targetValues.astype('category').cat.codes


In [None]:
# Print the dtypes again so the effect of the target-encoding cell is visible.
cleanedDataset.info()

In [None]:
# Settings shared by the K-Means cells: number of clusters and the upper bound
# on assignment/update iterations.
n_clusters = 3
iterations = 25

# Feature matrix: every column of the cleaned frame except the last one.
features = cleanedDataset.iloc[:, :-1].to_numpy()
print("features Shape:", features.shape)

In [None]:
def run_kmeans_and_save_plots(features, initial_centroids, iterations,
                              x_index, y_index, x_label, y_label, output_dir):
    """Run K-Means (assign / update) and save one scatter plot per iteration.

    Clustering uses every column of ``features``; only the two columns selected
    by ``x_index`` and ``y_index`` are drawn. Each plot shows the points
    coloured by their current assignment together with the centroids that
    produced that assignment.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array, one row per sample and one column per feature.
    initial_centroids : numpy.ndarray
        2-D array of starting centroids, one row per cluster, with the same
        number of columns as ``features``.
    iterations : int
        Maximum number of assign/update rounds.
    x_index, y_index : int
        Column indices of ``features`` plotted on the x and y axes.
    x_label, y_label : str
        Axis labels for the saved figures.
    output_dir : str
        Directory that receives the PNG files; created when missing.

    Returns
    -------
    tuple
        ``(centroids, labels)``: the centroids used for the last assignment
        that was not replaced, and the cluster index of every sample from the
        last assignment step (``None`` when ``iterations`` is 0).
    """
    os.makedirs(output_dir, exist_ok=True)

    cluster_count = initial_centroids.shape[0]
    centroids = initial_centroids
    labels = None

    for iteration in range(iterations):
        # Assignment step: Euclidean distance from every sample to every
        # centroid, then pick the closest centroid per sample.
        distances = np.linalg.norm(features[:, np.newaxis, :] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)

        # Update step: mean of the members; a cluster that received no points
        # keeps its previous centroid instead of producing a NaN mean.
        new_centroids = np.array([
            features[labels == cluster].mean(axis=0) if np.any(labels == cluster) else centroids[cluster]
            for cluster in range(cluster_count)
        ])

        # Explicit figure/axes objects keep the loop independent of pyplot's
        # implicit "current figure" state.
        fig, ax = plt.subplots(figsize=(8, 6))
        for cluster in range(cluster_count):
            members = labels == cluster
            ax.scatter(features[members, x_index], features[members, y_index],
                       s=50, alpha=0.6, label=f'Cluster {cluster + 1}')
        ax.scatter(centroids[:, x_index], centroids[:, y_index],
                   s=200, c='black', marker='x', label='Centroids')
        ax.set_title(f"K-Means Iteration {iteration + 1}")
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

        picture_path = os.path.join(output_dir, f"k_mean_Clustering_iteration_{iteration + 1}.png")
        fig.savefig(picture_path, dpi=300, bbox_inches='tight')
        # Close the figure so a long run does not accumulate open figures.
        plt.close(fig)

        # Converged: the update step no longer moves the centroids.
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    return centroids, labels


# Feature pair drawn on the plot axes (column positions in the cleaned frame).
firstFeatureIndex = 0
secondFeatureIndex = 1

# Column names are looked up once instead of on every loop iteration.
columnNames = cleanedDataset.columns.to_list()
firstFeatureName = columnNames[firstFeatureIndex]
secondFeatureName = columnNames[secondFeatureIndex]

# Randomly initialize centroids: a seeded draw of n_clusters distinct rows.
np.random.seed(42)
initial_centroids = features[np.random.choice(features.shape[0], n_clusters, replace=False), :]

# Directory to save the plots
output_dir = f"kmeans_steps_{dataSetName}_{firstFeatureName}_vs_{secondFeatureName}"

centroids, labels = run_kmeans_and_save_plots(
    features,
    initial_centroids,
    iterations,
    firstFeatureIndex,
    secondFeatureIndex,
    f"{firstFeatureName}",
    f"{secondFeatureName}",
    output_dir,
)

print(f"Plots saved in the directory: {output_dir}")

In [None]:
# Feature pair drawn on the plot axes; the clustering itself uses every feature column.
firstFeatureIndex, secondFeatureIndex = 0, 2
columnNames = cleanedDataset.columns.to_list()
xAxisName = columnNames[firstFeatureIndex]
yAxisName = columnNames[secondFeatureIndex]

# Seeded draw of n_clusters distinct rows that serve as the starting centroids.
np.random.seed(42)
startingRows = np.random.choice(features.shape[0], n_clusters, replace=False)
initial_centroids = features[startingRows, :]
centroids = initial_centroids

# Folder that receives one PNG per iteration; created when it does not exist yet.
output_dir = f"kmeans_steps_{dataSetName}_{xAxisName}_vs_{yAxisName}"
os.makedirs(output_dir, exist_ok=True)

for iteration in range(iterations):
    # Assignment step: each sample joins the centroid at the smallest Euclidean distance.
    distances = np.linalg.norm(features[:, np.newaxis, :] - centroids, axis=2)
    labels = distances.argmin(axis=1)

    # Update step: centroid = mean of its members; an empty cluster keeps its old centroid.
    recomputed = []
    for cluster in range(n_clusters):
        members = features[labels == cluster]
        recomputed.append(members.mean(axis=0) if len(members) > 0 else centroids[cluster])
    new_centroids = np.array(recomputed)

    # Draw the current assignment together with the centroids that produced it.
    fig, ax = plt.subplots(figsize=(8, 6))
    for cluster in range(n_clusters):
        inCluster = labels == cluster
        ax.scatter(
            features[inCluster, firstFeatureIndex],
            features[inCluster, secondFeatureIndex],
            s=50,
            alpha=0.6,
            label=f'Cluster {cluster + 1}',
        )
    ax.scatter(
        centroids[:, firstFeatureIndex],
        centroids[:, secondFeatureIndex],
        s=200,
        c='black',
        marker='x',
        label='Centroids',
    )
    ax.set_title(f"K-Means Iteration {iteration + 1}")
    ax.set_xlabel(f"{xAxisName}")
    ax.set_ylabel(f"{yAxisName}")
    ax.legend()
    ax.grid(True)

    # Write the figure to disk, then release it.
    picturePath = os.path.join(output_dir, f"k_mean_Clustering_iteration_{iteration + 1}.png")
    fig.savefig(picturePath, dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Stop as soon as the update step leaves the centroids (numerically) unchanged.
    if np.allclose(centroids, new_centroids):
        break

    # Otherwise carry the updated centroids into the next round.
    centroids = new_centroids

print(f"Plots saved in the directory: {output_dir}")

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Synthetic demo data: 100 random 2-D points, seeded so the draw is repeatable.
np.random.seed(42)
X = np.random.rand(100, 2)

# Fit scikit-learn's K-Means with three clusters, then label every sample.
kmeans = KMeans(random_state=42, n_clusters=3).fit(X)
y_kmeans = kmeans.predict(X)
centroids = kmeans.cluster_centers_

# Samples coloured by their assigned cluster, with the fitted centres overlaid in red.
plt.scatter(X[:, 0], X[:, 1], s=50, c=y_kmeans, cmap='viridis')
plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', alpha=0.75, label="Centroids")
plt.title("K-Means Clustering")
plt.legend()
plt.show()
