In [1]:
from sklearn.datasets import fetch_20newsgroups
from tabulate import tabulate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import DistanceMetric
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score
import numpy as np
import spacy
from string import punctuation

In [8]:
# The four newsgroups used as ground-truth classes for the clustering experiments.
categories = [
    'comp.graphics',
    'rec.autos',
    'sci.med',
    'talk.politics.mideast',
]

# Load every post of those groups with headers, footers and quoted replies
# removed; shuffle=False keeps the documents in the loader's own order.
news_groups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=False,
)

In [27]:
import re

# Collapse every run of spaces, tabs and newlines into ONE space.
# A single character class is used on purpose: with separate alternatives for
# "newlines/tabs" and "two or more spaces", a mixed run such as "\n  " is
# replaced piece by piece and still leaves several adjacent spaces behind.
_WHITESPACE_RUN = re.compile(r'[ \t\n]+')

cleaned_collection = [_WHITESPACE_RUN.sub(' ', text) for text in news_groups.data]

In [11]:
# Load the spaCy pipeline named 'en_core_web_sm'; the notebook calls it on each
# document to obtain tokens with their lemma (`lemma_`) and part of speech (`pos_`).
nlp = spacy.load('en_core_web_sm')

In [32]:
from spacy.lang.en.stop_words import STOP_WORDS

# Public names kept exactly as before: `stop_words` is spaCy's English
# stop-word set and `punctuations` is a list of ASCII punctuation characters.
stop_words = STOP_WORDS
punctuations = list(punctuation)

token_collection= []
vector = []

# Lemmas to drop, merged into one set so each membership test is a hash lookup
# instead of a scan over the punctuation list.
_EXCLUDED_LEMMAS = set(stop_words).union(punctuations)
# Coarse POS tags that never carry content for the clustering experiments.
_EXCLUDED_POS = {'PUNCT', 'SPACE'}


def _content_tokens(doc):
    """Return the (lowercased lemma, POS tag) pairs of the tokens worth keeping.

    A token is skipped when its lowercased lemma is a stop word or a
    punctuation character, or when its POS tag is 'PUNCT' or 'SPACE'.
    """
    kept = []
    for token in doc:
        lemma = token.lemma_.lower()  # computed once, reused for filter and output
        if lemma in _EXCLUDED_LEMMAS or token.pos_ in _EXCLUDED_POS:
            continue
        kept.append((lemma, token.pos_))
    return kept


# nlp.pipe processes the texts as a stream (in input order), which avoids the
# per-call overhead of invoking nlp(text) once per document.
lemmatized_collection = [_content_tokens(doc) for doc in nlp.pipe(cleaned_collection)]

# lemmatized_collection_POS = [[{token.lemma_.lower(): token.pos_} for token in nlp(text) if token.lemma_.lower() not in stop_words and token.lemma_.lower() not in punctuations and not token.pos_ == 'PUNCT'] for text in news_groups.data]

# print(lemmatized_collection)
# print(lemmatized_collection_POS)

In [34]:
# One space-joined string per document, built from the lemma (first element)
# of every (lemma, POS) pair; all parts of speech are kept.
lemm_texts = [
    ' '.join(entry[0] for entry in tagged_doc)
    for tagged_doc in lemmatized_collection
]

In [58]:
def _join_lemmas_with_pos(tagged_docs, wanted_pos):
    """Build one space-joined lemma string per document, keeping only the
    (lemma, POS) pairs whose POS tag is one of `wanted_pos`."""
    return [
        ' '.join(entry[0] for entry in tagged_doc if entry[1] in wanted_pos)
        for tagged_doc in tagged_docs
    ]


# Nouns only.
lemm_texts_nouns = _join_lemmas_with_pos(lemmatized_collection, ("NOUN",))

# Nouns and adjectives.
lemm_texts_nouns_adj = _join_lemmas_with_pos(lemmatized_collection, ("NOUN", "ADJ"))

In [37]:
# TF-IDF matrix of the full lemmatised texts (`lemm_texts`), one row per document.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(lemm_texts)

In [60]:
# Each reduced corpus gets its OWN TfidfVectorizer. Re-fitting the shared
# `vectorizer` here would overwrite its vocabulary, so it would no longer
# correspond to the `vectors` matrix it originally produced.
vectors_adj_nouns = TfidfVectorizer().fit_transform(lemm_texts_nouns_adj)
vectors_nouns = TfidfVectorizer().fit_transform(lemm_texts_nouns)

# Pairwise document-to-document matrices for the three vocabularies.
euclidean_distance_matrix = euclidean_distances(vectors)
euclidean_distance_matrix_adj_nouns = euclidean_distances(vectors_adj_nouns)
euclidean_distance_matrix_nouns = euclidean_distances(vectors_nouns)

cosine_distance_matrix = cosine_distances(vectors)
cosine_distance_matrix_adj_nouns = cosine_distances(vectors_adj_nouns)
cosine_distance_matrix_nouns = cosine_distances(vectors_nouns)

# The metric object is looked up once and reused for all three corpora.
# NOTE(review): these are produced by a DistanceMetric, so despite the
# `similarity` in the variable names they presumably hold Jaccard *distances*;
# the names are kept because later cells reference them.
_jaccard_metric = DistanceMetric.get_metric('jaccard')
jaccard_similarity_matrix = _jaccard_metric.pairwise(vectors)
jaccard_similarity_matrix_adj_nouns = _jaccard_metric.pairwise(vectors_adj_nouns)
jaccard_similarity_matrix_nouns = _jaccard_metric.pairwise(vectors_nouns)

In [61]:
# Ground truth and shared settings for the clustering experiments.
true_labels = news_groups.target
n_clusters = len(news_groups.target_names)  # one cluster per newsgroup
n_iterations = 50  # number of K-Means runs per matrix

# Every list is ordered euclidean, cosine, jaccard.
matrixes = [
    euclidean_distance_matrix,
    cosine_distance_matrix,
    jaccard_similarity_matrix,
]
matrixes_adj_nouns = [
    euclidean_distance_matrix_adj_nouns,
    cosine_distance_matrix_adj_nouns,
    jaccard_similarity_matrix_adj_nouns,
]
matrixes_nouns = [
    euclidean_distance_matrix_nouns,
    cosine_distance_matrix_nouns,
    jaccard_similarity_matrix_nouns,
]

# Scoring functions, each called as metric(true_labels, predicted_labels).
metrics = [
    normalized_mutual_info_score,
    adjusted_rand_score,
    v_measure_score,
    homogeneity_score,
]

In [52]:

def cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters, random_state=None):
    """Run K-Means repeatedly and summarise how each metric varied across runs.

    Parameters
    ----------
    matrix
        Data passed unchanged to ``KMeans.fit``.
    metrics
        Iterable of callables invoked as ``metric(true_labels, predicted_labels)``;
        each callable's ``__name__`` is used as its heading in the report.
    true_labels
        Reference labels handed to every metric.
    n_iterations : int
        Number of independent K-Means runs.
    n_clusters : int
        Number of clusters requested from K-Means.
    random_state : int or None, optional
        When an int is given, run ``i`` (0-based) is seeded with
        ``random_state + i`` so the whole report is reproducible while the
        runs still differ from each other. ``None`` (the default) leaves
        every run unseeded, as before.

    Returns
    -------
    str
        One paragraph per metric with its maximum, minimum (each with the
        1-based run number where it first occurred) and mean score.

    Raises
    ------
    ValueError
        If at least one metric is given but no run was performed
        (``n_iterations`` < 1), since there is nothing to summarise.
    """
    scores = {metric.__name__: [] for metric in metrics}

    for iteration in range(n_iterations):
        seed = None if random_state is None else random_state + iteration
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
        model.fit(matrix)

        for metric in metrics:
            scores[metric.__name__].append(metric(true_labels, model.labels_))

    result = []
    for name, values in scores.items():
        if not values:
            raise ValueError(
                f"no scores recorded for {name!r}: n_iterations must be at least 1"
            )

        # Local names deliberately avoid shadowing the builtins max/min.
        best = np.max(values)
        worst = np.min(values)
        mean = np.mean(values)
        # argmax/argmin return the first occurrence, like list.index, but do
        # not fail when a score is NaN.
        best_run = int(np.argmax(values)) + 1
        worst_run = int(np.argmin(values)) + 1

        result.append(f'{name}\nMax: {best} iter: {best_run}\nMin: {worst} iter: {worst_run}\nAvg: {mean}\n')
    return '\n'.join(result)

In [63]:
from tabulate import tabulate

headers = ['words', 'euclidean', 'cosine', 'jaccard']

# One row per vocabulary variant; each row starts with its label and is then
# extended with one K-Means report per distance matrix.
arr = [['all words'], ['adj and nouns'], ['nouns']]

arr[0].extend(
    cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters)
    for matrix in matrixes
)
# The two remaining rows are currently disabled, so they only show their label:
# arr[1].extend(
#     cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters)
#     for matrix in matrixes_adj_nouns
# )
# arr[2].extend(
#     cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters)
#     for matrix in matrixes_nouns
# )

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+------------------------------------+-----------------------------------+--------------------------------------+
| words         | euclidean                          | cosine                            | jaccard                              |
| all words     | normalized_mutual_info_score       | normalized_mutual_info_score      | normalized_mutual_info_score         |
|               | Max: 0.35308510749159355 iter: 30  | Max: 0.42198909471231216 iter: 5  | Max: 0.020543863504170163 iter: 8    |
|               | Min: 0.19247054307228872 iter: 27  | Min: 0.2676083036717975 iter: 21  | Min: 0.011883379526465019 iter: 37   |
|               | Avg: 0.24373422059537667           | Avg: 0.3453079389697233           | Avg: 0.01764014041154703             |
|               |                                    |                                   |                                      |
|               | adjusted_rand_score                | adjusted_rand_score               |

In [56]:
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import dendrogram

def cluster_hierarchy(matrix, metrics, true_labels, num_clusters):
    """Run agglomerative clustering with several linkages and report the scores.

    The input kind is inferred from its shape:

    * non-square -> treated as a feature matrix: clustered with the
      "euclidean" metric, and the "ward" linkage is tried in addition to
      "complete", "average" and "single";
    * square -> treated as a precomputed pairwise-distance matrix.

    NOTE(review): a feature matrix that happens to have as many columns as
    rows would be taken for a precomputed distance matrix by this heuristic.

    Parameters
    ----------
    matrix
        Object exposing ``.shape``; sparse inputs (anything with a
        ``toarray`` method) are densified in the feature-matrix branch.
    metrics
        Iterable of callables invoked as ``metric(true_labels, predicted_labels)``.
    true_labels
        Reference labels handed to every metric.
    num_clusters : int
        Number of clusters requested from the clustering.

    Returns
    -------
    str
        For each linkage: its name, a separator line, and one
        ``"<metric name>: <score>"`` line per metric, joined with newlines.
    """
    linkages = ["complete", "average", "single"]

    if matrix.shape[0] != matrix.shape[1]:
        affinity = "euclidean"
        # Only sparse inputs need densifying; a dense array has no .toarray()
        # and used to crash here.
        if hasattr(matrix, "toarray"):
            matrix = matrix.toarray()
        linkages.append("ward")
    else:
        affinity = "precomputed"

    result = []
    for linkage in linkages:
        result.extend(['\n', linkage, '---------'])

        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)
        agg_clustering.fit(matrix)

        result.extend(
            f"{metric.__name__}: {metric(true_labels, agg_clustering.labels_)}"
            for metric in metrics
        )

        # plt.figure(figsize=[12, 12])
        # plt.subplot(4, 1, linkages.index(linkage) + 1)
        # children = agg_clustering.children_
        # distance = np.arange(children.shape[0])
        # num_of_observations = np.arange(2, children.shape[0] + 2)
        # linkage_matrix = np.column_stack([children, distance, num_of_observations]).astype(float)
        # dendrogram(linkage_matrix)

    return '\n'.join(result)

    # plt.show()
        

In [62]:
headers = ['words', 'euclidean', 'cosine', 'jaccard']

# Pair every row label with the matrices of the matching vocabulary variant;
# each row is the label followed by one hierarchical-clustering report per matrix.
row_labels = ['all words', 'adj and nouns', 'nouns']
matrix_groups = [matrixes, matrixes_adj_nouns, matrixes_nouns]

arr = [
    [label] + [
        cluster_hierarchy(matrix, metrics, true_labels, n_clusters)
        for matrix in group
    ]
    for label, group in zip(row_labels, matrix_groups)
]

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+-----------------------------------------------------+-----------------------------------------------------+-----------------------------------------------------+
| words         | euclidean                                           | cosine                                              | jaccard                                             |
| all words     | complete                                            | complete                                            | complete                                            |
|               | ---------                                           | ---------                                           | ---------                                           |
|               | normalized_mutual_info_score: 0.04725060943365263   | normalized_mutual_info_score: 0.008831547134239563  | normalized_mutual_info_score: 0.01472448095701408   |
|               | adjusted_rand_score: 0.007549427520518734           | adjusted_rand_score: 5.78176