In [2]:
# Packages
import pandas as pd
from scipy.io import arff
import pandas as pd
import numpy as np

URV                                                                            MESIIA

Neural and Evolutionary Computation (NEC)
Assignment 3: Unsupervised learning with PCA, t-SNE, k-means, AHC and SOM

Teachers: Dr. Jordi Duch, Dr. Sergio Gomez

Student: Natzaret Gálvez Rísquez

Part 1: Selecting and analyzing the datasets

The unsupervised learning techniques must be applied to two datasets:

In [10]:
# Load the two datasets used throughout the notebook.

# First dataset: File: A3-data.txt
    # Features: 4 variables, 1 class
    # Patterns: 360 patterns
# The file is read with header=None, so row 0 of the frame holds the column
# names; they are saved separately and that row is then dropped.
# NOTE(review): because the header text sits in the same columns as the data,
# the values are presumably kept as strings rather than floats — confirm the
# dtypes before relying on numeric operations.
# NOTE(review): absolute local path; the notebook only runs on this machine.
df_data=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A3/A3-data.txt', sep=',', header=None)
header_vector_data = df_data.iloc[0, :].tolist() # column names taken from the first row
# Keep every row after the header and every column except the last one.
# NOTE(review): the last column is presumably the class attribute; it is
# discarded here and is not kept in any other variable.
df_data=df_data.iloc[1:,:-1]
df_data=pd.DataFrame(df_data)

# Second dataset: from "https://www.openml.org/search?type=data&status=active&id=188"
    # Features: at least 6 variables, and a class attribute
    # The class attribute must refer to, at least, 4 different classes
    #  Patterns: at least 200 patterns
# loadarff returns the records together with a metadata object.
data_Eucaliptus, meta = arff.loadarff('C:/Users/Gari/Desktop/Assignments_NEC/A3/dataset_194_eucalyptus.arff')
# Convert numpy array to DataFrame
# [736 rows x 20 columns], number of classes 5
df_Eucaliptus = pd.DataFrame(data_Eucaliptus)

In [None]:
# Metadata object returned by arff.loadarff for the second dataset
print(meta)

In [None]:
# Contents of the second dataset (DataFrame built from the ARFF records)
print(df_Eucaliptus)

Part 2: Comparing unsupervised learning algorithms

We are going to perform unsupervised learning on the two datasets using the following algorithms:
PCA, t-SNE, k-means, AHC and SOM

In [None]:
# PCA
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def apply_pca(data, n_components=2, labels=None):
    """Fit PCA on ``data`` and show a PC1/PC2 scatter plot and a scree plot.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix handed to ``PCA.fit_transform``.
    n_components : default 2
        Forwarded to ``PCA``. The scatter plot uses the first two components,
        so at least two must be produced.
    labels : array-like of shape (n_samples,), optional
        Class of each pattern, used only to colour the scatter plot. Labels of
        any sortable type (numbers, text, bytes) are accepted; they are mapped
        to integer codes in sorted order. When omitted the points are drawn in
        a single colour and no colour bar is added.

    Returns
    -------
    None
        The function only produces figures.
    """
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(data)

    # Map labels to integer codes so that non-numeric class names can drive
    # the colormap.
    colors = None
    if labels is not None:
        colors = np.unique(np.asarray(labels).ravel(), return_inverse=True)[1]

    # Only pass colour arguments when there is something to colour by.
    scatter_kwargs = {} if colors is None else {'c': colors, 'cmap': 'viridis'}

    # Scatter plot of the first two principal components
    plt.figure(figsize=(8, 6))
    points = plt.scatter(pca_result[:, 0], pca_result[:, 1], **scatter_kwargs)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('PCA Projection')
    if colors is not None:
        # A colour bar needs a colour-mapped artist, so it is skipped otherwise.
        plt.colorbar(points, label='Class code')
    plt.show()

    # Scree plot: explained variance ratio of every fitted component
    variance_ratio = pca.explained_variance_ratio_
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(variance_ratio) + 1), variance_ratio, marker='o', linestyle='-')
    plt.xlabel('Number of Components')
    plt.ylabel('Explained Variance Ratio')
    plt.title('Scree Plot')
    plt.grid(True)
    plt.show()

In [None]:
# t-SNE
from sklearn.manifold import TSNE

def apply_tsne(data, perplexity=30, labels=None, random_state=None):
    """Embed ``data`` in two dimensions with t-SNE and show a scatter plot.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix handed to ``TSNE.fit_transform``.
    perplexity : float, default 30
        Forwarded to ``TSNE``; also shown in the plot title.
    labels : array-like of shape (n_samples,), optional
        Class of each pattern, used only to colour the points. Labels of any
        sortable type are accepted; they are mapped to integer codes in sorted
        order. When omitted the points are drawn in a single colour and no
        colour bar is added.
    random_state : optional
        Forwarded to ``TSNE`` so that the embedding can be reproduced. The
        default ``None`` keeps the unseeded behaviour.

    Returns
    -------
    None
        The function only produces a figure.
    """
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
    tsne_result = tsne.fit_transform(data)

    # Map labels to integer codes so that non-numeric class names can drive
    # the colormap.
    colors = None
    if labels is not None:
        colors = np.unique(np.asarray(labels).ravel(), return_inverse=True)[1]

    # Only pass colour arguments when there is something to colour by.
    scatter_kwargs = {} if colors is None else {'c': colors, 'cmap': 'viridis'}

    plt.figure(figsize=(8, 6))
    points = plt.scatter(tsne_result[:, 0], tsne_result[:, 1], **scatter_kwargs)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title(f't-SNE Projection (Perplexity={perplexity})')
    if colors is not None:
        # A colour bar needs a colour-mapped artist, so it is skipped otherwise.
        plt.colorbar(points, label='Class code')
    plt.show()

In [None]:
# k-means
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

def apply_kmeans(data, k, labels=None, random_state=None):
    """Cluster ``data`` with k-means and plot true classes next to the clusters.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix (array or DataFrame). It is converted to a
        float array because the plots index it positionally; at least two
        features are needed for the scatter plots.
    k : int
        Number of clusters.
    labels : array-like of shape (n_samples,), optional
        True class of each pattern. Labels of any sortable type are accepted;
        they are mapped to integer codes in sorted order. When omitted, the
        "True Classes" plot and the confusion matrix are skipped.
    random_state : optional
        Forwarded to ``KMeans`` so that the clustering can be reproduced. The
        default ``None`` keeps the unseeded behaviour.

    Returns
    -------
    None
        The function only produces figures and, when ``k`` equals the number
        of distinct classes, prints a confusion matrix.
    """
    # DataFrames do not support data[:, 0]; work on a plain float array.
    features = np.asarray(data, dtype=float)

    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(features)
    cluster_labels = kmeans.labels_

    class_names = None
    label_codes = None
    if labels is not None:
        # Integer codes keep the colormap and the confusion matrix working
        # for text/bytes class names as well as numeric ones.
        class_names, label_codes = np.unique(np.asarray(labels).ravel(), return_inverse=True)

    if label_codes is not None:
        # Scatter plot of the data colored according to the classes they belong to
        plt.figure(figsize=(8, 6))
        plt.scatter(features[:, 0], features[:, 1], c=label_codes, cmap='viridis')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.title('True Classes')
        plt.colorbar()
        plt.show()

    # Scatter plot of the data colored according to the clusters obtained by K-means
    plt.figure(figsize=(8, 6))
    plt.scatter(features[:, 0], features[:, 1], c=cluster_labels, cmap='viridis')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(f'K-means Clustering (k={k})')
    plt.colorbar()
    plt.show()

    # Compare with the real class labels. Rows are class codes and columns are
    # cluster ids; cluster ids are arbitrary, so the matrix is a contingency
    # table rather than something expected to be diagonal.
    if label_codes is not None and k == len(class_names):
        cm = confusion_matrix(label_codes, cluster_labels)
        print("Confusion Matrix:")
        print(cm)

In [None]:
# AHC
from scipy.cluster.hierarchy import linkage, dendrogram
import numpy as np
from scipy.spatial.distance import pdist

# Method UPGMA -> method average
def apply_ahc(data, labels=None, method='average'):
    """Run agglomerative hierarchical clustering and draw its dendrogram.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix (array or DataFrame); converted to a float
        array before the distances are computed.
    labels : array-like of shape (n_samples,), optional
        Class of each pattern. When given, every leaf is labelled with its
        class and the leaf text is coloured per class. When omitted, leaves
        are labelled with the sample index.
    method : str, default 'average'
        Linkage method forwarded to ``scipy.cluster.hierarchy.linkage``
        ('average' is UPGMA).

    Returns
    -------
    None
        The function only produces a figure.
    """
    features = np.asarray(data, dtype=float)

    # Calculate Euclidean distances between original patterns
    distances = pdist(features)

    # Calculate linkage matrix
    linkage_matrix = linkage(distances, method=method)

    # The dendrogram looks labels up by position, so use a plain list of
    # strings: a pandas Series with a non-zero-based index would be looked up
    # by index label instead.
    leaf_labels = None
    if labels is not None:
        leaf_labels = [
            item.decode() if isinstance(item, bytes) else str(item)
            for item in np.asarray(labels).ravel().tolist()
        ]

    # color_threshold=0 with above_threshold_color='k' draws every link in
    # black; class information is carried by the leaf labels instead.
    plt.figure(figsize=(12, 8))
    dendrogram(linkage_matrix,
               color_threshold=0,
               labels=leaf_labels,
               leaf_font_size=10,
               above_threshold_color='k')

    if leaf_labels is not None:
        # Colour each leaf label according to its class.
        classes = sorted(set(leaf_labels))
        cmap = plt.get_cmap('viridis')
        span = max(len(classes) - 1, 1)
        class_color = {name: cmap(pos / span) for pos, name in enumerate(classes)}
        for tick in plt.gca().get_xticklabels():
            tick.set_color(class_color.get(tick.get_text(), 'k'))

    plt.title(f'Agglomerative Hierarchical Clustering ({method.capitalize()} Linkage)')
    plt.xlabel('Sample Index' if leaf_labels is None else 'Sample (labelled by class)')
    plt.ylabel('Distance')
    plt.show()

In [None]:
# SOM
from sompy.sompy import SOMFactory

def _som_u_matrix(weights):
    """Return, per neuron, the mean distance to its 4-connected grid neighbours."""
    rows, cols, _ = weights.shape
    total = np.zeros((rows, cols))
    count = np.zeros((rows, cols))

    # Distances between vertically adjacent neurons, credited to both ends.
    vertical = np.linalg.norm(weights[1:, :, :] - weights[:-1, :, :], axis=2)
    total[1:, :] += vertical
    total[:-1, :] += vertical
    count[1:, :] += 1
    count[:-1, :] += 1

    # Distances between horizontally adjacent neurons, credited to both ends.
    horizontal = np.linalg.norm(weights[:, 1:, :] - weights[:, :-1, :], axis=2)
    total[:, 1:] += horizontal
    total[:, :-1] += horizontal
    count[:, 1:] += 1
    count[:, :-1] += 1

    return total / np.maximum(count, 1)


def _som_majority_class(bmu_index, label_codes, grid_shape):
    """Return a grid holding the most frequent class code per neuron (NaN if unused)."""
    majority = np.full(int(np.prod(grid_shape)), np.nan)
    for node in np.unique(bmu_index):
        majority[node] = np.bincount(label_codes[bmu_index == node]).argmax()
    return majority.reshape(grid_shape)


def apply_som(data, mapsize=(10, 10), topology='rectangular', learning_rate=0.02, neighborhood_function='gaussian', labels=None):
    """Train a self-organising map and plot component planes, U-matrix and class map.

    NOTE(review): the SOMPY calls used here (``SOMFactory.build`` with
    ``lattice=`` / ``neighborhood=``, ``train`` without a learning rate,
    ``codebook.matrix``, ``codebook.mapsize`` and ``project_data``) follow the
    upstream SOMPY API as I understand it — confirm against the installed
    version before relying on this function.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix (array or DataFrame).
    mapsize : tuple of two ints, default (10, 10)
        Grid dimensions; the grid must contain at least 100 neurons.
    topology : str, default 'rectangular'
        'rectangular' and 'hexagonal' are translated to the lattice names
        'rect' and 'hexa'; any other value is passed through unchanged.
    learning_rate : float, default 0.02
        Accepted for interface compatibility only. NOTE(review): SOMPY's batch
        training does not appear to take a learning rate, so the value is not
        forwarded — confirm.
    neighborhood_function : str, default 'gaussian'
        Neighbourhood function name given to the map builder.
    labels : array-like of shape (n_samples,), optional
        Class of each pattern. Labels of any sortable type are accepted; they
        are mapped to integer codes in sorted order. When omitted the class
        heatmap is skipped.

    Raises
    ------
    ValueError
        If ``mapsize`` describes fewer than 100 neurons.
    """
    # Ensure at least 100 neurons
    if mapsize[0] * mapsize[1] < 100:
        raise ValueError("Number of neurons is less than 100. Please choose a larger mapsize.")

    features = np.asarray(data, dtype=float)

    lattice = {'rectangular': 'rect', 'hexagonal': 'hexa'}.get(topology, topology)
    som = SOMFactory.build(features, mapsize=list(mapsize), lattice=lattice, normalization='var',
                           initialization='pca', neighborhood=neighborhood_function)
    som.train(n_job=1, verbose=False, train_rough_len=20, train_finetune_len=100)

    # Neuron weights arranged on the map grid; with normalization='var' these
    # are presumably expressed in the normalised feature space.
    grid_shape = tuple(som.codebook.mapsize)
    weights = np.asarray(som.codebook.matrix).reshape(grid_shape + (-1,))
    n_features = weights.shape[2]

    # Best matching unit of every pattern, as a flat node index and as grid
    # coordinates (assumes row-major node numbering — TODO confirm).
    bmu_index = np.asarray(som.project_data(features)).astype(int)
    bmu_rows, bmu_cols = np.unravel_index(bmu_index, grid_shape)

    # Plot component planes: one panel per input feature
    panel_cols = min(4, n_features)
    panel_rows = int(np.ceil(n_features / panel_cols))
    fig, axes = plt.subplots(panel_rows, panel_cols, figsize=(10, 8), squeeze=False)
    for position, ax in enumerate(axes.ravel()):
        if position >= n_features:
            ax.axis('off')  # unused panel in the last row
            continue
        image = ax.imshow(weights[:, :, position], cmap='viridis')
        ax.set_title(f'Feature {position + 1}')
        fig.colorbar(image, ax=ax)
    fig.suptitle('Component Planes')
    plt.show()

    # Plot U-matrix with the best matching units overlaid. The neighbour
    # distances use the 4-connected grid, which is only approximate for a
    # hexagonal lattice.
    plt.figure(figsize=(10, 8))
    plt.imshow(_som_u_matrix(weights), cmap='viridis')
    plt.colorbar()
    plt.scatter(bmu_cols, bmu_rows, s=12, c='white', edgecolors='black', linewidths=0.5)
    plt.title('U-matrix')
    plt.show()

    if labels is None:
        return

    # Calculate the most represented class for each position; neurons that
    # attract no pattern stay NaN and are left blank in the heatmap.
    label_codes = np.unique(np.asarray(labels).ravel(), return_inverse=True)[1]
    most_represented_class = _som_majority_class(bmu_index, label_codes, grid_shape)

    # Plot heatmap of the most represented class in each position
    plt.figure(figsize=(10, 8))
    plt.imshow(most_represented_class, cmap='viridis')
    plt.colorbar(label='Class')
    plt.title('Heatmap of the Most Represented Class')
    plt.show()

Dataset 1: data

In [None]:
# The last column was stripped from df_data when the file was loaded, so it is
# read back from the file here. header=None keeps the header text in row 0,
# hence iloc[1:]. Assumes the last column is the class attribute — TODO confirm.
class_column_data = pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A3/A3-data.txt', sep=',', header=None).iloc[1:, -1]

# Integer class codes (sorted order) work for colouring, counting per class
# and the confusion matrix, whatever the original label type is.
labels = pd.factorize(class_column_data, sort=True)[0]

# Plain float matrix: the plotting code indexes the data positionally.
features_data = df_data.to_numpy(dtype=float)

# labels is passed by keyword wherever another defaulted parameter comes
# before it in the signature, so it cannot land in the wrong slot.
apply_pca(features_data, labels=labels)
apply_tsne(features_data, labels=labels)
# One cluster per class, so the confusion matrix comparison is produced
# (k=1 would put every pattern in a single cluster).
apply_kmeans(features_data, k=len(np.unique(labels)), labels=labels)
apply_ahc(features_data, labels)
apply_som(features_data, labels=labels)

Dataset 2: eucalyptus

In [None]:
# Assumes the class attribute is the last ARFF column — confirm with print(meta).
class_column_eucalyptus = df_Eucaliptus.iloc[:, -1]

# The algorithms need a purely numeric matrix without missing values: keep the
# numeric attributes only and drop the patterns that have a gap in any of them.
numeric_eucalyptus = df_Eucaliptus.iloc[:, :-1].select_dtypes(include='number')
complete_rows = numeric_eucalyptus.notna().all(axis=1)
features_eucalyptus = numeric_eucalyptus[complete_rows].to_numpy(dtype=float)

# Integer class codes (sorted order), aligned with the rows that were kept.
labels = pd.factorize(class_column_eucalyptus[complete_rows], sort=True)[0]

# labels is passed by keyword wherever another defaulted parameter comes
# before it in the signature, so it cannot land in the wrong slot.
apply_pca(features_eucalyptus, labels=labels)
apply_tsne(features_eucalyptus, labels=labels)
apply_kmeans(features_eucalyptus, k=5, labels=labels)
apply_ahc(features_eucalyptus, labels)
apply_som(features_eucalyptus, labels=labels)