<a href="https://colab.research.google.com/github/Karthik-Aravapalli/CSE-572-Data-Mining/blob/main/DM_HW3_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import files
uploaded = files.upload()

Saving data.csv to data.csv
Saving label.csv to label.csv


In [7]:
# Q1, Q2, Q3
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]

    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25430229321.897377, 'Accuracy': 0.6032603260326033, 'Iterations': 33}, 'Cosine': {'SSE': 691.3338391221173, 'Accuracy': 0.5888588858885888, 'Iterations': 99}, 'Jaccard': {'SSE': 3662.537855948002, 'Accuracy': 0.5955595559555955, 'Iterations': 23}}


In [8]:
# Q4 - when there is no change in centroid position
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25322234651.88256, 'Accuracy': 0.5985598559855986, 'Iterations': 96}, 'Cosine': {'SSE': 681.9813238093508, 'Accuracy': 0.6048604860486049, 'Iterations': 99}, 'Jaccard': {'SSE': 3721.8152092054106, 'Accuracy': 0.6273627362736274, 'Iterations': 99}}


In [9]:
# Q4 - when the SSE value increases in the next iteration
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25489559452.95701, 'Accuracy': 0.6041604160416042, 'Iterations': 99}, 'Cosine': {'SSE': 698.4681780433566, 'Accuracy': 0.585058505850585, 'Iterations': 15}, 'Jaccard': {'SSE': 3662.9519447114008, 'Accuracy': 0.5971597159715971, 'Iterations': 25}}


In [10]:
# Q4 - when the maximum preset value (e.g., 100) of iteration is complete
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25404934076.023125, 'Accuracy': 0.598059805980598, 'Iterations': 99}, 'Cosine': {'SSE': 682.0686188494456, 'Accuracy': 0.6135613561356136, 'Iterations': 99}, 'Jaccard': {'SSE': 3678.2550296702548, 'Accuracy': 0.6233623362336234, 'Iterations': 99}}
