In [1]:
import numpy as np
from scipy.sparse import issparse, csr_matrix
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_distances
import nltk
import string
from tqdm import tqdm
from gensim.models.keyedvectors import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer

## Feature Extraction

#### Text processing preparation

In [2]:
# Shared text-processing helpers used by `tokenise_text`.
# Treebank-style word tokeniser from NLTK.
tokeniser = nltk.tokenize.TreebankWordTokenizer()
# English stop words, frozen for fast membership tests.
stopwords = frozenset(nltk.corpus.stopwords.words("english"))
# Translation table that deletes every ASCII punctuation character.
trans_table = str.maketrans('', '', string.punctuation)

#### Tokenize text

In [3]:
def tokenise_text(str_):
    """Turn a raw string into a list of lower-cased, stop-word-free tokens.

    Args:
        str_ (str): The text to tokenise.

    Returns:
        list(str): Tokens produced by the module-level `tokeniser` after
            non-ASCII characters and punctuation have been removed and the
            text lower-cased, excluding anything found in `stopwords`.
    """
    # Drop every character that cannot be encoded as ASCII (for simplicity).
    ascii_only = str_.encode(encoding='ascii', errors='ignore').decode()
    # Lower-case first, then delete punctuation via the shared table.
    cleaned = ascii_only.lower().translate(trans_table)
    tokens = tokeniser.tokenize(cleaned)
    return [tok for tok in tokens if tok not in stopwords]

#### Extract features by TF-IDF 

In [4]:
def get_features_tfidf(Xr_fit, Xr_pred=None):
    """Build TF-IDF features with a vectoriser fitted on `Xr_fit`.

    Args:
        Xr_fit: The documents handed to `TfidfVectorizer.fit_transform`;
            they define the vocabulary and IDF weights.
        Xr_pred: Optional second collection of documents, transformed with
            the vectoriser fitted on `Xr_fit`.

    Returns:
        The feature matrix of `Xr_fit` when `Xr_pred` is None, otherwise
        the tuple `(features of Xr_fit, features of Xr_pred)`.
    """
    print('Generating features (TF-IDF) ...')
    tfidf = TfidfVectorizer()
    fit_features = tfidf.fit_transform(Xr_fit)

    # Nothing else to transform: hand back the fitted features alone.
    if Xr_pred is None:
        return fit_features

    return fit_features, tfidf.transform(Xr_pred)

#### Document -> Vector

In [5]:
def document_to_vector(tokenised_doc, word_vectors):
    """Embed a tokenised document as the mean of its known word vectors.

    Args:
        tokenised_doc (list(list(str))): A document represented as a list
            of sentences, each sentence being a list of tokens.
        word_vectors (gensim.models.keyedvectors.KeyedVectors): A mapping
            from words (string) to their embeddings (np.ndarray).

    Returns:
        np.array: A vector of length `word_vectors.vector_size` holding the
            unweighted average of the embeddings of all tokens present in
            `word_vectors`. It is all zeros when no token has an embedding.

    Raises:
        AssertionError: If `word_vectors` is not a `KeyedVectors` instance.
    """
    # check the input
    assert isinstance(word_vectors, KeyedVectors)

    total = np.zeros(word_vectors.vector_size)
    n_known = 0   # number of tokens that have an embedding

    # Flatten sentences lazily, keeping only in-vocabulary tokens.
    known_tokens = (
        tok
        for sent in tokenised_doc
        for tok in sent
        if tok in word_vectors
    )
    for tok in known_tokens:
        total += word_vectors[tok]
        n_known += 1

    # Plain (unweighted) mean; skip the division when nothing was found.
    if n_known > 0:
        total /= n_known

    return total

#### Extract features by word vectors

In [6]:
def get_features_w2v(Xt, word_vectors):
    """Stack the document vectors of every tokenised document in `Xt`.

    Args:
        Xt: An iterable of tokenised documents, each in the format expected
            by `document_to_vector`.
        word_vectors: The word-embedding lookup forwarded unchanged to
            `document_to_vector`.

    Returns:
        np.ndarray: A matrix with one row per document, in the order of `Xt`.
    """
    print('Generating features (word2vec) ...')
    doc_vectors = []
    # tqdm only wraps the iterable to show a progress bar.
    for doc in tqdm(Xt):
        doc_vectors.append(document_to_vector(doc, word_vectors))
    return np.vstack(doc_vectors)

### K-means

In [7]:
def cosine_distance(v1, v2):
    """Return the cosine distance between two vectors.

    Args:
        v1: A (sparse or dense) vector.
        v2: Another (sparse or dense) vector.

    Returns:
        float: The cosine distance between `v1` and `v2`, as computed by
            scikit-learn's `cosine_distances`.
    """
    first_is_sparse = issparse(v1)
    second_is_sparse = issparse(v2)

    # When exactly one operand is sparse, bring the dense one into CSR form
    # so that both have the same representation.
    if first_is_sparse != second_is_sparse:
        if first_is_sparse:
            v2 = csr_matrix(v2)
        else:
            v1 = csr_matrix(v1)

    # scikit-learn expects 2-D inputs: present each vector as a single row.
    row_a = v1.reshape(1, -1)
    row_b = v2.reshape(1, -1)

    # The pairwise result is a 1 x 1 matrix; unwrap its only entry.
    return cosine_distances(row_a, row_b)[0, 0]

In [8]:
def compute_distances(data, centroids):
    """Compute the cosine distances between every data point and
    every centroid.

    Args:
        data: A (sparse or dense) matrix of features for N documents.
            Each row represents a document.
        centroids (np.ndarray): The K cluster centres. Each row
            represents a cluster centre.

    Returns:
        np.ndarray: An N x K matrix where entry [i, j] is the cosine
            distance between the i-th row of `data` and the j-th centroid.

    Raises:
        AssertionError: If `data` and `centroids` do not have the same
            number of columns.
    """
    # Both inputs must live in the same feature space.
    assert data.shape[1] == centroids.shape[1]

    N = data.shape[0]
    K = centroids.shape[0]

    # scikit-learn rejects inputs without samples, so keep returning an
    # empty N x K matrix in that degenerate case.
    if N == 0 or K == 0:
        return np.empty((N, K))

    # A single pairwise call produces the whole N x K matrix. This replaces
    # N * K separate one-pair calls (each re-slicing and re-validating its
    # inputs) and accepts sparse `data` together with dense `centroids`.
    return np.asarray(cosine_distances(data, centroids))

In [9]:
def assign_data_points(distances):
    """Assign each data point to its closest centroid.

    Args:
        distances (np.ndarray): An N x K matrix where distances[i, j]
            is the cosine distance between the i-th data point and
            the j-th centroid.

    Returns:
        np.ndarray: An integer vector of size N whose i-th entry is the
            index of the centroid nearest to the i-th data point. Ties go
            to the lowest centroid index.

    Raises:
        ValueError: If `distances` is not 2-dimensional, or if it has rows
            but no columns (there is no centroid to choose from).
    """
    # Unpacking the shape enforces a 2-D input (ValueError otherwise).
    n_points, _ = distances.shape

    # No data points: nothing to assign, whatever the number of centroids.
    if n_points == 0:
        return np.empty(0, dtype=np.intp)

    # Row-wise argmin in one vectorised call instead of a Python loop.
    return np.argmin(np.asarray(distances), axis=1)

In [10]:
def update_centroids(data, centroids, clusters):
    """Re-compute each centroid as the average of the data points
    assigned to it.

    The input `centroids` array is left untouched; the result is a new
    floating-point array, so means are never truncated even when the
    incoming centroids have an integer dtype.

    Args:
        data: A (sparse or dense) matrix of features for N documents.
            Each row represents a document.
        centroids (np.ndarray): The K cluster centres. Each row
            represents a cluster centre.
        clusters (np.ndarray): A vector of size N where clusters[i] = j
            denotes that the i-th data point is assigned to the j-th
            centroid.

    Returns:
        np.ndarray: The updated K x D centroids. A centroid with no
            assigned data points keeps its previous position.

    Raises:
        AssertionError: If the column counts of `data` and `centroids`
            differ, or `clusters` does not have one entry per row of `data`.
    """
    # check the input
    assert data.shape[1] == centroids.shape[1]
    N = data.shape[0]
    K = centroids.shape[0]
    assert clusters.shape[0] == N

    # Work on a copy with an inexact (float/complex) dtype: this avoids
    # mutating the caller's array and avoids truncating means to integers.
    centroids = np.asarray(centroids)
    if np.issubdtype(centroids.dtype, np.inexact):
        out_dtype = centroids.dtype
    else:
        out_dtype = np.float64
    updated = np.array(centroids, dtype=out_dtype)

    for k in range(K):
        # Rows of `data` currently assigned to centroid k.
        members = data[clusters == k]

        # An empty cluster keeps its previous centroid.
        if members.shape[0] > 0:
            # Sparse matrices return a 1 x D np.matrix from mean();
            # flatten so the row assignment works for every input type.
            updated[k] = np.asarray(members.mean(axis=0)).ravel()

    return updated

In [11]:
def kmeans(data, K, max_iter=10, rng=None):
    """Clustering data points using the KMeans algorithm with cosine distance.

    The initial centroids are K distinct rows of `data` chosen at random.
    Each iteration computes all point-to-centroid distances, assigns every
    point to its closest centroid and re-computes the centroids. The loop
    stops early once the assignment is identical to the previous iteration.

    Args:
        data: A (sparse or dense) matrix of features of documents. Each row
            represents a document.
        K (int): The number of cluster centres.
        max_iter (int): The maximum number of iterations to run in the
            KMeans algorithm. Must be at least 1.
        rng: A random number generator, or anything else accepted by
            `np.random.default_rng` (e.g. None or an integer seed).

    Returns:
        centroids (np.ndarray): The cluster centres (after the re-computation of centroids).
        clusters (np.ndarray): The index of cluster each document belongs to, e.g., clusters[i] = k
            denotes that the i-th document is in the k-th cluster.

    Raises:
        AssertionError: If there are fewer data points than clusters, or
            `max_iter` is smaller than 1.
    """
    print(f'Clustering using KMeans (K={K}) ...')
    N = data.shape[0]
    assert N >= K
    # Without at least one iteration there would be no assignment to return.
    assert max_iter >= 1, 'max_iter must be at least 1'

    rng = np.random.default_rng(rng)
    indices = rng.choice(N, size=K, replace=False)
    if issparse(data):
        # `.toarray()` is the supported way to densify; the `.A` shorthand
        # is deprecated and absent from newer SciPy releases.
        centroids = data[indices, :].toarray()
    else:
        centroids = data[indices, :]
    # Centroids are averages: make sure they can hold fractional values
    # even when the features are integer counts.
    if not np.issubdtype(centroids.dtype, np.inexact):
        centroids = centroids.astype(np.float64)

    print(f'{"Iteration":>10} {"Total Distance":>20}')
    prev_clusters = None
    for i in range(max_iter):
        dists = compute_distances(data, centroids)
        clusters = assign_data_points(dists)
        centroids = update_centroids(data, centroids, clusters)
        # Total distance of every point to its closest (pre-update) centroid.
        print(f'{i:>10} {round(dists.min(axis=1).sum(), 2):>20}')
        # Converged: the assignment did not change since the last iteration.
        if prev_clusters is not None and np.all(prev_clusters == clusters):
            return centroids, clusters
        prev_clusters = clusters
    return centroids, clusters