# Artificial Intelligence PW2 - K-Means Clustering

## By Murad Mustafayev & Kamal Ahmadov

In [1]:
# Importing necessary libraries
import  numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load the Iris measurements and keep only the numeric feature columns:
# 'Id' and 'Species' are dropped because they must not influence the clustering.
data = pd.read_csv('iris.csv').drop(columns=['Id', 'Species'])

# Preview the first rows of the feature table
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# (min, max) of every feature, in the order sepal length, sepal width,
# petal length, petal width.
range_sl, range_sw, range_pl, range_pw = (
    (data[feature].min(), data[feature].max())
    for feature in ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm')
)

print(range_sl, range_sw, range_pl, range_pw)

(4.3, 7.9) (2.0, 4.4) (1.0, 6.9) (0.1, 2.5)


In [4]:
# Function to randomly select initial centroids
def select_random_points(df, n, random_state=None):
    """
    Draw n rows of the dataset at random to use as initial centroids.

    Parameters:
    df (DataFrame): Pandas DataFrame containing the dataset.
    n (int): Number of rows to draw.
    random_state (int, optional): Seed forwarded to DataFrame.sample. Pass a fixed
                  value to get the same rows on every call; the default (None)
                  leaves the draw unseeded, as before.

    Returns:
    DataFrame: The sampled rows, relabelled 0..n-1 so that each row label can be
               used directly as a cluster ID.
    """
    sample = df.sample(n, random_state=random_state)
    # Discard the original row labels: downstream code uses the centroid's
    # label as the cluster ID.
    sample.index = range(n)
    return sample

# Smoke test: draw 3 random rows from the dataset and display them.
# The rows shown change from run to run because the draw is random.
select_random_points(data, 3)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,4.8,3.4,1.9,0.2
1,5.5,4.2,1.4,0.2
2,5.8,2.8,5.1,2.4


In [5]:
# Function to calculate Euclidean distance between two points
def distance(p1, p2):
    """
    Euclidean distance between two points in the four-feature Iris space.

    Only the entries 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm' and
    'PetalWidthCm' are read; anything else stored on the points is ignored.

    Parameters:
    p1 (Series): First data point, indexable by the four feature names.
    p2 (Series): Second data point, indexable by the four feature names.

    Returns:
    float: Square root of the summed squared per-feature differences.
    """
    feature_names = ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm')

    # Accumulate the squared differences feature by feature, in the order above
    squared_sum = sum((p1[name] - p2[name]) ** 2 for name in feature_names)

    return np.sqrt(squared_sum)

# Sanity check: distance between the first two rows of the dataset
p1, p2 = data.iloc[0], data.iloc[1]
distance(p1, p2)

0.5385164807134502

In [7]:
# Function to assign data points to the nearest centroid
def assign_points_to_clusters(df, centroids):
    """
    Assign each data point to the nearest centroid.

    Parameters:
    df (DataFrame): Pandas DataFrame containing the dataset.
    centroids (DataFrame): Pandas DataFrame containing the centroids, one per row.
                           Each row's index label is used as the cluster ID.

    Returns:
    dict: A dictionary where keys are the centroids' index labels (cluster IDs) and
          values are lists of df index labels of the data points assigned to that
          cluster. Every centroid gets a key, even if no point is assigned to it.
    """
    # Key the clusters by the centroids' own index labels: those labels are what
    # iterrows() yields below and what is used for the lookup. Keying by position
    # (0..k-1) would raise KeyError for any centroid frame labelled differently.
    clusters = {label: [] for label in centroids.index}

    # Iterate over each data point in the dataset
    for index, row in df.iterrows():
        min_dist = float('inf')  # Smallest distance seen so far for this point
        closest_centroid = None  # Label of the centroid at that distance

        # Compare the point against every centroid
        for j, centroid in centroids.iterrows():
            d = distance(row, centroid)

            # Strict '<' means the first centroid wins when distances tie
            if d < min_dist:
                min_dist = d
                closest_centroid = j

        # Record the point's index label under its closest centroid
        clusters[closest_centroid].append(index)

    return clusters

# Smoke test: pick 3 random centroids, assign every row to its nearest one,
# and show the resulting cluster IDs (one key per centroid).
centroids = select_random_points(data, 3)
clusters = assign_points_to_clusters(data, centroids)
clusters.keys()

dict_keys([0, 1, 2])

In [8]:
# Function to update centroids based on assigned points
def update_centroids(df, clusters, centroids=None):
    """
    Update centroids based on assigned data points.

    Parameters:
    df (DataFrame): Pandas DataFrame containing the dataset.
    clusters (dict): A dictionary where keys represent cluster IDs and values are lists of indices
                     corresponding to data points assigned to each cluster.
    centroids (DataFrame, optional): The current centroids, indexed by cluster ID. When given,
                     a cluster with no assigned points keeps its current centroid. When omitted,
                     an empty cluster is re-seeded from the dataset row at position <cluster ID>.

    Returns:
    DataFrame: Pandas DataFrame containing the updated centroids, one row per cluster,
               indexed by cluster ID in the iteration order of `clusters`.
    """
    new_centroids = []  # One Series per cluster, in the order of `clusters`

    for i, indices in clusters.items():
        if indices:
            # The new centroid is the per-feature mean of the assigned points
            new_centroid = df.loc[indices].mean()
        elif centroids is not None:
            # Empty cluster: keep the centroid where it was
            new_centroid = centroids.loc[i]
        else:
            # Empty cluster and no previous centroid supplied: fall back to a dataset row
            new_centroid = df.iloc[i]

        new_centroids.append(new_centroid)

    result = pd.DataFrame(new_centroids)
    # Label the rows explicitly with the cluster IDs. Means are unnamed Series while
    # rows taken from a frame are named, and a mix of the two would otherwise produce
    # labels such as "Unnamed 0" that no longer match the cluster IDs.
    result.index = list(clusters.keys())
    return result

# Smoke test: recompute the centroids from the cluster assignment made above
# and display them (one row per cluster).
new_centroids = update_centroids(data, clusters)
new_centroids

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,6.346739,2.909783,5.031522,1.729348
1,5.275,3.84375,1.4875,0.29375
2,4.957143,3.069048,1.835714,0.380952


In [9]:
def k_means(df, k, max_iter=1000):
    """
    Perform k-means clustering on the dataset.

    Starting from k randomly chosen rows, the algorithm alternates between assigning
    every point to its nearest centroid and moving each centroid to the mean of its
    points, until the centroids stop changing or max_iter updates have been made.

    Parameters:
    df (DataFrame): Pandas DataFrame containing the dataset.
    k (int): Number of clusters to form.
    max_iter (int): Maximum number of centroid updates. With 0, the points are simply
                    assigned to the random initial centroids.

    Returns:
    tuple: A tuple containing two elements:
           - A dictionary where keys represent cluster IDs and values are lists of indices
             corresponding to data points assigned to each cluster.
           - Pandas DataFrame containing the final centroids.
           The returned assignment always corresponds to the returned centroids.
    """
    centroids = select_random_points(df, k)  # Random initial centroids
    # Assign once up front so `clusters` is defined even when max_iter is 0
    clusters = assign_points_to_clusters(df, centroids)

    for _ in range(max_iter):
        new_centroids = update_centroids(df, clusters)

        # Rows come back in cluster order; relabel them like the current centroids so
        # the element-wise comparison and the label lookups below are well defined.
        new_centroids.index = centroids.index

        # A cluster can end up empty (e.g. when two centroids coincide, the first one
        # takes all the points). Its mean is undefined, so leave that centroid in place.
        for cluster_id, members in clusters.items():
            if not members:
                new_centroids.loc[cluster_id] = centroids.loc[cluster_id]

        # Converged when no centroid moved
        converged = (centroids == new_centroids).all().all()
        centroids = new_centroids
        if converged:
            break

        # Re-assign against the moved centroids so that the returned clusters always
        # match the returned centroids, including when max_iter runs out.
        clusters = assign_points_to_clusters(df, centroids)

    return clusters, centroids

In [10]:
# Perform k-means clustering on the Iris dataset with k=3.
# k_means starts from randomly chosen rows, so the cluster numbering
# (and possibly the grouping itself) can differ between runs.
clusters, centroids = k_means(data, 3)

In [11]:
# Display the final centroids returned by k_means:
# one row per cluster, one column per feature.
centroids

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.006,3.418,1.464,0.244
1,6.85,3.073684,5.742105,2.071053
2,5.901613,2.748387,4.393548,1.433871


In [12]:
# Attach the cluster ID of every row to a copy of the dataset and write it to disk.
# Start with -1 everywhere so any row missing from `clusters` is easy to spot.
cluster_labels = pd.Series(-1, index=data.index, name='Cluster')
for cluster_id, member_indices in clusters.items():
    cluster_labels.loc[member_indices] = cluster_id

# assign() returns a new frame, so the original `data` stays untouched
clustered_data = data.assign(Cluster=cluster_labels)

# Save without the row index: the file holds only the features and the cluster ID
clustered_data.to_csv('clustered_iris.csv', index=False)

In [13]:
# Preview the first rows of the clustered data: the four feature columns
# followed by the 'Cluster' column added above.
clustered_data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Cluster
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
