In [1]:
from sklearn.datasets import fetch_20newsgroups
from tabulate import tabulate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import DistanceMetric
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import (
    normalized_mutual_info_score,
    adjusted_rand_score,
    v_measure_score,
    homogeneity_score,
)
import numpy as np
import spacy
from string import punctuation
from gensim import corpora, models

In [2]:
categories = ['comp.graphics', 'rec.autos', 'sci.med',  'talk.politics.mideast']
news_groups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'), shuffle=False)

In [3]:
import re

cleaned_collection = [re.sub(r'[\n\t]+| {2,}', ' ', text) for text in news_groups.data]

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
from spacy.lang.en import stop_words

stop_words = stop_words.STOP_WORDS
punctuations = list(punctuation)

token_collection= []
vector = []

lemmatized_collection = [[(token.lemma_.lower(), token.pos_) for token in nlp(text) if token.lemma_.lower() not in stop_words and token.lemma_.lower() not in punctuations and not token.pos_ == 'PUNCT' and not token.pos_ == 'SPACE'] for text in cleaned_collection]

In [6]:
lemm_texts = [' '.join(text) for text in [
    [token[0] for token in lemmatized_text]
    for lemmatized_text in lemmatized_collection
]]

In [7]:
lemm_texts_nouns = [' '.join(text) for text in [
    [token[0] for token in lemmatized_text if token[1] == "NOUN"]
    for lemmatized_text in lemmatized_collection
]]

lemm_texts_nouns_adj = [' '.join(text) for text in [
    [token[0] for token in lemmatized_text if token[1] == "NOUN" or token[1] == "ADJ"]
    for lemmatized_text in lemmatized_collection
]]

In [8]:
lemm_texts = [text.split() for text in lemm_texts]
lemm_texts_nouns = [text.split() for text in lemm_texts_nouns]
lemm_texts_nouns_adj = [text.split() for text in lemm_texts_nouns_adj]

In [9]:
dictionary_all = corpora.Dictionary(lemm_texts)
dictionary_nouns = corpora.Dictionary(lemm_texts_nouns)
dictionary_nouns_adj = corpora.Dictionary(lemm_texts_nouns_adj)

In [10]:
bow_corpus_all = [dictionary_all.doc2bow(doc) for doc in lemm_texts]
bow_corpus_nouns = [dictionary_nouns.doc2bow(doc) for doc in lemm_texts_nouns]
bow_corpus_nouns_adj = [
    dictionary_nouns_adj.doc2bow(doc) for doc in lemm_texts_nouns_adj
]

In [11]:
model = models.LdaModel(
    bow_corpus_all, id2word=dictionary_all, num_topics=10, passes=15
)

In [12]:
model_nouns = models.LdaModel(
    bow_corpus_nouns, id2word=dictionary_nouns, num_topics=10, passes=15
)
model_nouns_adj = models.LdaModel(
    bow_corpus_nouns_adj, id2word=dictionary_nouns_adj, num_topics=10, passes=15
)

In [13]:
text_vectors = []
for doc_bow in bow_corpus_all:
    document_topics = model.get_document_topics(doc_bow)
    document_topic_vector = [topic_prob for _, topic_prob in document_topics]
    text_vectors.append(document_topic_vector)

In [14]:
text_vectors_nouns = []
for doc_bow in bow_corpus_nouns:
    document_topics = model_nouns.get_document_topics(doc_bow)
    document_topic_vector = [topic_prob for _, topic_prob in document_topics]
    text_vectors_nouns.append(document_topic_vector)

In [15]:
text_vectors_nouns_adj = []
for doc_bow in bow_corpus_nouns_adj:
    document_topics = model_nouns_adj.get_document_topics(doc_bow)
    document_topic_vector = [topic_prob for _, topic_prob in document_topics]
    text_vectors_nouns_adj.append(document_topic_vector)

In [16]:
euclidean_distance_matrix = euclidean_distances(text_vectors)
euclidean_distance_matrix_adj_nouns = euclidean_distances(text_vectors_nouns)
euclidean_distance_matrix_nouns = euclidean_distances(text_vectors_nouns_adj)

cosine_distance_matrix = cosine_distances(text_vectors)
cosine_distance_matrix_adj_nouns = cosine_distances(text_vectors_nouns)
cosine_distance_matrix_nouns = cosine_distances(text_vectors_nouns_adj)

In [17]:
true_labels = news_groups.target
n_clusters = len(news_groups.target_names)
n_iterations = 50
matrixes = [euclidean_distance_matrix, cosine_distance_matrix]
matrixes_adj_nouns = [euclidean_distance_matrix_adj_nouns, cosine_distance_matrix_adj_nouns]
matrixes_nouns = [euclidean_distance_matrix_nouns, cosine_distance_matrix_nouns]
metrics = [normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score]

In [18]:

def cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters):
    scores = {}
    result = []
    for metric in metrics:
        scores.update({metric.__name__: []})


    for i in range(n_iterations):
        clusters = KMeans(n_clusters=n_clusters, n_init=10, random_state=i)
        clusters.fit(matrix)

        for metric in metrics:
            score = metric(true_labels, clusters.labels_)
            scores[metric.__name__].append(score)


    for metric in scores:
        max = np.max(scores[metric])
        min = np.min(scores[metric])
        avg = np.mean(scores[metric])

        result.append(f'{metric}\nMax: {max} iter: {scores[metric].index(max) + 1}\nMin: {min} iter: {scores[metric].index(min) + 1}\nAvg: {avg}\n')
        # print(metric)
        # print(f'Max: {max} iter: {scores[metric].index(max) + 1}')
        # print(f'Min: {min} iter: {scores[metric].index(min) + 1}')
        # print(f'Avg: {avg}')
    return '\n'.join(result)

In [22]:
from tabulate import tabulate

headers = ['euclidean', 'cosine']

arr = [['all words'], ['adj and nouns'], ['nouns']]

for matrix in matrixes:
    arr[0].append(cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters))
for matrix in matrixes_adj_nouns:
    arr[1].append(cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters))
for matrix in matrixes_nouns:
    arr[2].append(cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters))

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+----------------------------------+----------------------------------+
| words         | euclidean                        | cosine                           |
| all words     | normalized_mutual_info_score     | normalized_mutual_info_score     |
|               | Max: 0.34879373468230035 iter: 1 | Max: 0.45075554749625896 iter: 1 |
|               | Min: 0.34879373468230035 iter: 1 | Min: 0.38331162859740525 iter: 2 |
|               | Avg: 0.34879373468230035         | Avg: 0.3970446384979763          |
|               |                                  |                                  |
|               | adjusted_rand_score              | adjusted_rand_score              |
|               | Max: 0.30369951005543305 iter: 1 | Max: 0.4608616764082237 iter: 1  |
|               | Min: 0.30369951005543305 iter: 1 | Min: 0.37914346245909464 iter: 2 |
|               | Avg: 0.30369951005543305         | Avg: 0.39583436248724657         |
|               |               

In [19]:
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import dendrogram

def cluster_hierarchy(matrix, metrics, true_labels, num_clusters):
    linkages = ["complete", "average", "single"]
    result = []

    if matrix.shape[0] != matrix.shape[1]:
        affinity = "euclidean"
        matrix = matrix.toarray()
        linkages.append("ward")
    else:
        affinity = "precomputed"

    for linkage in linkages:
        result.append('\n')
        result.append(linkage)
        result.append('---------')
        # print(linkage)
        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)

        agg_clustering.fit(matrix)

        for metric in metrics:
            score = metric(true_labels, agg_clustering.labels_)
            result.append(f"{metric.__name__}: {score}")
            # print(f"{metric.__name__}: ", score)

        # plt.figure(figsize=[12, 12])
        # plt.subplot(4, 1, linkages.index(linkage) + 1)
        # children = agg_clustering.children_
        # distance = np.arange(children.shape[0])
        # num_of_observations = np.arange(2, children.shape[0] + 2)
        # linkage_matrix = np.column_stack([children, distance, num_of_observations]).astype(float)
        # dendrogram(linkage_matrix)

    return '\n'.join(result)

    # plt.show()
        

In [21]:
headers = ['euclidean', 'cosine']

arr = [['all words'], ['adj and nouns'], ['nouns']]

for matrix in matrixes:
    arr[0].append(cluster_hierarchy(matrix, metrics, true_labels, n_clusters))
for matrix in matrixes_adj_nouns:
    arr[1].append(cluster_hierarchy(matrix, metrics, true_labels, n_clusters))
for matrix in matrixes_nouns:
    arr[2].append(cluster_hierarchy(matrix, metrics, true_labels, n_clusters))

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+-----------------------------------------------------+-----------------------------------------------------+
| words         | euclidean                                           | cosine                                              |
| all words     | complete                                            | complete                                            |
|               | ---------                                           | ---------                                           |
|               | normalized_mutual_info_score: 0.14155381144358758   | normalized_mutual_info_score: 0.1641302538171248    |
|               | adjusted_rand_score: 0.052034622348958504           | adjusted_rand_score: 0.047452619900946486           |
|               | v_measure_score: 0.14155381144358758                | v_measure_score: 0.16413025381712482                |
|               | homogeneity_score: 0.11194050792656518              | homogeneity_score: 0.12490042590721691        

In [3]:
categories = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']
news_groups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'), shuffle=False)

In [18]:
from tabulate import tabulate

headers = ['euclidean', 'cosine']

arr = [['all words'], ['adj and nouns'], ['nouns']]

for matrix in matrixes:
    arr[0].append(cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters))
for matrix in matrixes_adj_nouns:
    arr[1].append(cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters))
for matrix in matrixes_nouns:
    arr[2].append(cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters))

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+--------------------------------------+-------------------------------------+--------------------------------------+
| words         | euclidean                            | cosine                              | jaccard                              |
| all words     | normalized_mutual_info_score         | normalized_mutual_info_score        | normalized_mutual_info_score         |
|               | Max: 0.00989720425486164 iter: 27    | Max: 0.06705030887874189 iter: 16   | Max: 0.0024613566626412752 iter: 34  |
|               | Min: 0.00867397313342348 iter: 44    | Min: 0.06589595692174573 iter: 30   | Min: 0.0006243085014123848 iter: 26  |
|               | Avg: 0.009237487871880801            | Avg: 0.06650649272880205            | Avg: 0.0009126573425607214           |
|               |                                      |                                     |                                      |
|               | adjusted_rand_score                  | adjus

In [17]:
headers = ['euclidean', 'cosine']

arr = [['all words'], ['adj and nouns'], ['nouns']]

for matrix in matrixes:
    arr[0].append(cluster_hierarchy(matrix, metrics, true_labels, n_clusters))
for matrix in matrixes_adj_nouns:
    arr[1].append(cluster_hierarchy(matrix, metrics, true_labels, n_clusters))
for matrix in matrixes_nouns:
    arr[2].append(cluster_hierarchy(matrix, metrics, true_labels, n_clusters))

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+-----------------------------------------------------+-----------------------------------------------------+------------------------------------------------------+
| words         | euclidean                                           | cosine                                              | jaccard                                              |
| all words     | complete                                            | complete                                            | complete                                             |
|               | ---------                                           | ---------                                           | ---------                                            |
|               | normalized_mutual_info_score: 0.01757673451070667   | normalized_mutual_info_score: 0.023622156850907686  | normalized_mutual_info_score: 0.0027278125516680633  |
|               | adjusted_rand_score: 0.0021760001055478474          | adjusted_rand_score: 0.