In [2]:
from sklearn.datasets import fetch_20newsgroups
from tabulate import tabulate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import DistanceMetric
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import (
    normalized_mutual_info_score,
    adjusted_rand_score,
    v_measure_score,
    homogeneity_score,
)
import numpy as np
import spacy
from string import punctuation
from gensim import corpora, models
import random

In [2]:
# Four newsgroups used as the ground-truth classes for the clustering runs.
categories = [
    'comp.graphics',
    'rec.autos',
    'sci.med',
    'talk.politics.mideast',
]

# Headers, footers and quoted replies are removed from every post;
# shuffle=False keeps the order in which the loader returns the documents.
news_groups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=False,
)

In [4]:
import re

# Collapse every run of whitespace (newlines, tabs, repeated or mixed spaces)
# into a single space. The previous pattern r'[\n\t]+| {2,}' handled newline
# runs and space runs as separate matches, so "a\n b" became "a  b" and
# double spaces survived the cleaning step.
_WHITESPACE_RUN = re.compile(r'\s+')

cleaned_collection = [_WHITESPACE_RUN.sub(' ', text) for text in news_groups.data]

In [5]:
# The visible cells only read token.lemma_ and token.pos_, which come from the
# tagger / attribute ruler / lemmatizer. The dependency parser and the
# named-entity recogniser are therefore switched off to avoid running them on
# every document.
# NOTE(review): re-enable them here if another cell needs doc.ents, doc.sents
# or dependency labels.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

# Import the stop-word set directly instead of importing the `stop_words`
# module and then rebinding that same name to one of its attributes.
stop_words = STOP_WORDS
punctuations = list(punctuation)

# Set versions of the filters so each membership test is a hash lookup.
_punctuation_chars = set(punctuations)
_skipped_pos = {'PUNCT', 'SPACE'}

token_collection = []
vector = []

# One list of (lowercased lemma, coarse POS tag) pairs per document.
# nlp.pipe streams the texts through the pipeline in batches rather than
# invoking the pipeline once per document.
lemmatized_collection = []
for doc in nlp.pipe(cleaned_collection):
    kept_tokens = []
    for token in doc:
        lemma = token.lemma_.lower()
        # Drop stop words, bare punctuation characters, and tokens tagged as
        # punctuation or whitespace.
        if lemma in stop_words or lemma in _punctuation_chars or token.pos_ in _skipped_pos:
            continue
        kept_tokens.append((lemma, token.pos_))
    lemmatized_collection.append(kept_tokens)

In [7]:
# One space-joined string of lemmas per document (POS tags are dropped here).
lemm_texts = [
    ' '.join(lemma for lemma, _pos in lemmatized_text)
    for lemmatized_text in lemmatized_collection
]

In [8]:
# Same documents restricted to lemmas tagged NOUN.
lemm_texts_nouns = [
    ' '.join(lemma for lemma, pos in lemmatized_text if pos == "NOUN")
    for lemmatized_text in lemmatized_collection
]

# Same documents restricted to lemmas tagged NOUN or ADJ.
lemm_texts_nouns_adj = [
    ' '.join(lemma for lemma, pos in lemmatized_text if pos in ("NOUN", "ADJ"))
    for lemmatized_text in lemmatized_collection
]

In [9]:
def _as_token_list(text):
    """Return the whitespace-separated tokens of ``text``.

    If ``text`` is not a string (the cell has already been executed and the
    entry is a token list), a copy of it is returned instead, so re-running
    the cell does not raise AttributeError on ``list.split``.
    """
    return text.split() if isinstance(text, str) else list(text)


# Turn each document into a list of tokens; the three names are rebound in
# place because the following cells read them under the same names.
lemm_texts = [_as_token_list(text) for text in lemm_texts]
lemm_texts_nouns = [_as_token_list(text) for text in lemm_texts_nouns]
lemm_texts_nouns_adj = [_as_token_list(text) for text in lemm_texts_nouns_adj]

In [10]:
# One gensim Dictionary (token -> integer id) per vocabulary variant.
dictionary_all, dictionary_nouns, dictionary_nouns_adj = (
    corpora.Dictionary(token_lists)
    for token_lists in (lemm_texts, lemm_texts_nouns, lemm_texts_nouns_adj)
)

In [11]:
# Bag-of-words encoding of every document, each variant encoded with the
# dictionary built from that same variant.
bow_corpus_all = list(map(dictionary_all.doc2bow, lemm_texts))
bow_corpus_nouns = list(map(dictionary_nouns.doc2bow, lemm_texts_nouns))
bow_corpus_nouns_adj = list(map(dictionary_nouns_adj.doc2bow, lemm_texts_nouns_adj))

In [12]:
# 10-topic LSI model fitted on the all-words bag-of-words corpus.
model = models.LsiModel(
    corpus=bow_corpus_all,
    num_topics=10,
    id2word=dictionary_all,
)

In [13]:
# 10-topic LSI model fitted on the noun-only corpus.
model_nouns = models.LsiModel(
    corpus=bow_corpus_nouns,
    num_topics=10,
    id2word=dictionary_nouns,
)

# 10-topic LSI model fitted on the noun + adjective corpus.
model_nouns_adj = models.LsiModel(
    corpus=bow_corpus_nouns_adj,
    num_topics=10,
    id2word=dictionary_nouns_adj,
)

In [14]:
# Dense LSI topic vector for every document of the all-words corpus.
#
# model[doc_bow] yields sparse (topic_id, weight) pairs and may omit topics,
# so each weight is written to the position given by its topic id and omitted
# topics stay at 0.0. Previously any document with fewer than 10 pairs was
# replaced entirely by random noise, discarding its real coordinates.
text_vectors = []
for doc_bow in bow_corpus_all:
    document_topics = model[doc_bow]
    if document_topics:
        document_topic_vector = [0.0] * model.num_topics
        for topic_id, topic_weight in document_topics:
            document_topic_vector[topic_id] = float(topic_weight)
    else:
        # No projection at all (e.g. an empty document): keep the original
        # tiny random vector rather than an exact zero vector.
        document_topic_vector = [
            random.uniform(-0.00001, 0.00001) for _ in range(model.num_topics)
        ]
    text_vectors.append(document_topic_vector)

In [15]:
# Dense LSI topic vector for every document of the noun-only corpus.
#
# bow_corpus_nouns is encoded with dictionary_nouns, so it has to be projected
# with model_nouns (trained on that dictionary). Projecting it with `model`,
# whose token ids come from dictionary_all, maps ids to unrelated words.
#
# model_nouns[doc_bow] yields sparse (topic_id, weight) pairs and may omit
# topics, so each weight is written to the position given by its topic id and
# omitted topics stay at 0.0 instead of the whole document being replaced by
# random noise.
text_vectors_nouns = []
for doc_bow in bow_corpus_nouns:
    document_topics = model_nouns[doc_bow]
    if document_topics:
        document_topic_vector = [0.0] * model_nouns.num_topics
        for topic_id, topic_weight in document_topics:
            document_topic_vector[topic_id] = float(topic_weight)
    else:
        # No projection at all (e.g. a document without nouns): keep the
        # original tiny random vector rather than an exact zero vector.
        document_topic_vector = [
            random.uniform(-0.00001, 0.00001) for _ in range(model_nouns.num_topics)
        ]
    text_vectors_nouns.append(document_topic_vector)

In [16]:
# Dense LSI topic vector for every document of the noun + adjective corpus.
#
# bow_corpus_nouns_adj is encoded with dictionary_nouns_adj, so it has to be
# projected with model_nouns_adj (trained on that dictionary). Projecting it
# with `model`, whose token ids come from dictionary_all, maps ids to
# unrelated words.
#
# model_nouns_adj[doc_bow] yields sparse (topic_id, weight) pairs and may omit
# topics, so each weight is written to the position given by its topic id and
# omitted topics stay at 0.0 instead of the whole document being replaced by
# random noise.
text_vectors_nouns_adj = []
for doc_bow in bow_corpus_nouns_adj:
    document_topics = model_nouns_adj[doc_bow]
    if document_topics:
        document_topic_vector = [0.0] * model_nouns_adj.num_topics
        for topic_id, topic_weight in document_topics:
            document_topic_vector[topic_id] = float(topic_weight)
    else:
        # No projection at all (e.g. a document without nouns or adjectives):
        # keep the original tiny random vector rather than an exact zero vector.
        document_topic_vector = [
            random.uniform(-0.00001, 0.00001) for _ in range(model_nouns_adj.num_topics)
        ]
    text_vectors_nouns_adj.append(document_topic_vector)

In [17]:
# Pairwise document distances, one Euclidean and one cosine matrix per
# vocabulary variant. Each "*_adj_nouns" matrix is built from the
# noun + adjective vectors and each "*_nouns" matrix from the noun-only
# vectors; the two inputs were previously swapped, so the result tables
# reported each variant under the other one's label.
euclidean_distance_matrix = euclidean_distances(text_vectors)
euclidean_distance_matrix_adj_nouns = euclidean_distances(text_vectors_nouns_adj)
euclidean_distance_matrix_nouns = euclidean_distances(text_vectors_nouns)

cosine_distance_matrix = cosine_distances(text_vectors)
cosine_distance_matrix_adj_nouns = cosine_distances(text_vectors_nouns_adj)
cosine_distance_matrix_nouns = cosine_distances(text_vectors_nouns)

In [21]:
# Experiment settings shared by the clustering cells.
n_iterations = 50
n_clusters = len(news_groups.target_names)
true_labels = news_groups.target

# Each group holds [euclidean, cosine] for one vocabulary variant.
matrixes = [
    euclidean_distance_matrix,
    cosine_distance_matrix,
]
matrixes_adj_nouns = [
    euclidean_distance_matrix_adj_nouns,
    cosine_distance_matrix_adj_nouns,
]
matrixes_nouns = [
    euclidean_distance_matrix_nouns,
    cosine_distance_matrix_nouns,
]

# External clustering scores, each called as metric(true_labels, predicted).
metrics = [
    normalized_mutual_info_score,
    adjusted_rand_score,
    v_measure_score,
    homogeneity_score,
]

In [19]:

def cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters):
    """Fit K-Means repeatedly and summarise every metric over the runs.

    Parameters
    ----------
    matrix : array-like
        Data passed unchanged to ``KMeans.fit``.
    metrics : sequence of callables
        Each is called as ``metric(true_labels, predicted_labels)`` and is
        reported under its ``__name__``.
    true_labels : array-like
        Reference labels handed to every metric.
    n_iterations : int
        Number of K-Means fits; run ``i`` (0-based) uses ``random_state=i``.
    n_clusters : int
        Number of clusters requested from K-Means.

    Returns
    -------
    str
        One text block per metric containing the maximum and minimum score
        (each with the 1-based index of the first run that reached it) and
        the mean score. Blocks are joined with newlines.

    Raises
    ------
    ValueError
        If at least one metric was requested but no run was performed
        (``n_iterations`` < 1), since there is nothing to summarise.
    """
    scores = {metric.__name__: [] for metric in metrics}

    for seed in range(n_iterations):
        clusters = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
        clusters.fit(matrix)

        for metric in metrics:
            scores[metric.__name__].append(metric(true_labels, clusters.labels_))

    result = []
    for name, values in scores.items():
        if not values:
            raise ValueError(
                f'no scores collected for {name}: n_iterations must be at least 1'
            )

        # Local names deliberately avoid shadowing the built-ins max/min.
        best = np.max(values)
        worst = np.min(values)
        mean = np.mean(values)
        # argmax/argmin return the first matching position, i.e. the earliest run.
        best_run = int(np.argmax(values)) + 1
        worst_run = int(np.argmin(values)) + 1

        result.append(f'{name}\nMax: {best} iter: {best_run}\nMin: {worst} iter: {worst_run}\nAvg: {mean}\n')
    return '\n'.join(result)

In [21]:
from tabulate import tabulate

headers = ['euclidean', 'cosine']

# (row label, [euclidean matrix, cosine matrix]) for each vocabulary variant.
row_groups = [
    ('all words', matrixes),
    ('adj and nouns', matrixes_adj_nouns),
    ('nouns', matrixes_nouns),
]

# Each row is its label followed by one K-Means summary per distance matrix.
arr = [
    [label] + [
        cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters)
        for matrix in group
    ]
    for label, group in row_groups
]

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+------------------------------------+------------------------------------+
| words         | euclidean                          | cosine                             |
| all words     | normalized_mutual_info_score       | normalized_mutual_info_score       |
|               | Max: 0.012837984040047653 iter: 1  | Max: 0.19386014257970308 iter: 2   |
|               | Min: 0.012837984040047653 iter: 1  | Min: 0.1929527505752893 iter: 1    |
|               | Avg: 0.012837984040047653          | Avg: 0.19354791883337633           |
|               |                                    |                                    |
|               | adjusted_rand_score                | adjusted_rand_score                |
|               | Max: 0.0007923689839791758 iter: 1 | Max: 0.135623113640441 iter: 2     |
|               | Min: 0.0007923689839791758 iter: 1 | Min: 0.1344493683654122 iter: 1    |
|               | Avg: 0.0007923689839791758         | Avg: 0.1351526266167526  

In [20]:
def cluster_hierarchy(matrix, metrics, true_labels, num_clusters):
    """Run agglomerative clustering with several linkages and report scores.

    Parameters
    ----------
    matrix : array-like or sparse matrix with a ``shape`` attribute
        A square input is treated as a precomputed distance matrix. A
        non-square input is treated as a feature matrix, clustered with the
        Euclidean metric, and additionally evaluated with Ward linkage.
    metrics : sequence of callables
        Each is called as ``metric(true_labels, predicted_labels)`` and is
        reported under its ``__name__``.
    true_labels : array-like
        Reference labels handed to every metric.
    num_clusters : int
        Number of clusters requested from ``AgglomerativeClustering``.

    Returns
    -------
    str
        For every linkage: a separator, the linkage name, a dashed rule and
        one ``"<metric name>: <score>"`` line per metric, joined by newlines.
    """
    linkages = ["complete", "average", "single"]

    if matrix.shape[0] != matrix.shape[1]:
        affinity = "euclidean"
        # Only sparse matrices expose toarray(); a dense feature array is used
        # as-is instead of failing with AttributeError.
        if hasattr(matrix, "toarray"):
            matrix = matrix.toarray()
        linkages.append("ward")
    else:
        affinity = "precomputed"

    result = []
    for linkage in linkages:
        result.extend(['\n', linkage, '---------'])

        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)
        agg_clustering.fit(matrix)

        for metric in metrics:
            score = metric(true_labels, agg_clustering.labels_)
            result.append(f"{metric.__name__}: {score}")

    return '\n'.join(result)
        

In [20]:
headers = ['euclidean', 'cosine']

# (row label, [euclidean matrix, cosine matrix]) for each vocabulary variant.
labelled_matrixes = (
    ('all words', matrixes),
    ('adj and nouns', matrixes_adj_nouns),
    ('nouns', matrixes_nouns),
)

# Each row is its label followed by one hierarchical-clustering report per
# distance matrix.
arr = []
for label, group in labelled_matrixes:
    reports = [
        cluster_hierarchy(matrix, metrics, true_labels, n_clusters)
        for matrix in group
    ]
    arr.append([label, *reports])

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+----------------------------------------------------+-----------------------------------------------------+
| words         | euclidean                                          | cosine                                              |
| all words     | complete                                           | complete                                            |
|               | ---------                                          | ---------                                           |
|               | normalized_mutual_info_score: 0.006609383262860259 | normalized_mutual_info_score: 0.0047403127867715195 |
|               | adjusted_rand_score: 5.535400422572383e-05         | adjusted_rand_score: 1.60894136149029e-05           |
|               | v_measure_score: 0.006609383262860259              | v_measure_score: 0.0047403127867715195              |
|               | homogeneity_score: 0.0033660793442008862           | homogeneity_score: 0.0026721489008559446            |


In [3]:
# Second experiment: three closely related talk.politics groups.
# NOTE(review): this cell only rebinds `categories` and `news_groups`. The
# table cells that follow read `matrixes*`, `true_labels` and `n_clusters`,
# which are computed from `news_groups` in other cells -- confirm those cells
# are re-run after this one so the tables describe these categories.
categories = [
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
]
news_groups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=False,
)

In [23]:
from tabulate import tabulate

headers = ['euclidean', 'cosine']

# Row labels paired with their [euclidean, cosine] distance matrices.
variant_labels = ['all words', 'adj and nouns', 'nouns']
variant_matrixes = [matrixes, matrixes_adj_nouns, matrixes_nouns]

# Each row is its label followed by one K-Means summary per distance matrix.
arr = []
for label, group in zip(variant_labels, variant_matrixes):
    row = [label]
    row.extend(
        cluster_kmeans(matrix, metrics, true_labels, n_iterations, n_clusters)
        for matrix in group
    )
    arr.append(row)

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+--------------------------------------+-------------------------------------+
|               | euclidean                            | cosine                              |
| all words     | normalized_mutual_info_score         | normalized_mutual_info_score        |
|               | Max: 0.011588837175748943 iter: 1    | Max: 0.03744183192772247 iter: 3    |
|               | Min: 0.011588837175748943 iter: 1    | Min: 0.03608768375925906 iter: 1    |
|               | Avg: 0.011588837175748943            | Avg: 0.036485809487247195           |
|               |                                      |                                     |
|               | adjusted_rand_score                  | adjusted_rand_score                 |
|               | Max: -0.00019766272151510464 iter: 1 | Max: 0.023167454617220435 iter: 3   |
|               | Min: -0.00019766272151510464 iter: 1 | Min: 0.01986871238952054 iter: 2    |
|               | Avg: -0.0001976627215151046     

In [22]:
headers = ['euclidean', 'cosine']

# Row label -> [euclidean matrix, cosine matrix]; dict order fixes row order.
matrixes_by_label = {
    'all words': matrixes,
    'adj and nouns': matrixes_adj_nouns,
    'nouns': matrixes_nouns,
}

# Each row is its label followed by one hierarchical-clustering report per
# distance matrix.
arr = [
    [label] + [
        cluster_hierarchy(matrix, metrics, true_labels, n_clusters)
        for matrix in group
    ]
    for label, group in matrixes_by_label.items()
]

print(tabulate(arr, headers, tablefmt='grid'))

+---------------+-----------------------------------------------------+------------------------------------------------------+
| words         | euclidean                                           | cosine                                               |
| all words     | complete                                            | complete                                             |
|               | ---------                                           | ---------                                            |
|               | normalized_mutual_info_score: 0.006561090240526184  | normalized_mutual_info_score: 0.0023368840105066064  |
|               | adjusted_rand_score: 0.000646427144291917           | adjusted_rand_score: 0.0012058651643825499           |
|               | v_measure_score: 0.0065610902405261835              | v_measure_score: 0.002336884010506607                |
|               | homogeneity_score: 0.0033459869400902713            | homogeneity_score: 0.001326778722457823