# Experiment Design for Data Science - Clustering
## Group 26

### Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import cluster
import os
import nltk
from nltk.corpus import stopwords
# download all packages (easiest) with:
#nltk.download()

import initialize
import pip

# if you use the notebook for the first time, also run: 
#pip.main(['install', 'spherecluster'])

# in case you get the following error message from the installation of spherecluster: 
# ImportError: cannot import name '_k_means' from 'sklearn.cluster' 
# Open the file spherical_kmeans.py (path specified in the error message) and replace in line 16 '_k_means' with 'KMeans'


# modified data file from the original publication (https://link.springer.com/chapter/10.1007%2F978-3-030-45442-5_7#Bib1):
from utils import *

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml




## Preprocess Data

### Step 1 - Checking the data size

In [2]:
# Fetch the train and test splits of the 20 Newsgroups data via the project helper.
news_train, news_test = initialize.get_20_newsgroups_dataset()

In [3]:
# Count the documents of each split once, then report the sizes.
n_train_docs = len(news_train.filenames)
n_test_docs = len(news_test.filenames)

print("Dataset size:\n")
print("Train set: {}".format(n_train_docs))
print("Test set: {}".format(n_test_docs))
print("Total: {}".format(n_train_docs + n_test_docs))

Dataset size:

Train set: 11314
Test set: 7532
Total: 18846


In [4]:
# Persist the class labels of both splits as .npy files so they are still
# available after the raw dataset objects are deleted further below.
for label_file, split in (('news_test_labels', news_test),
                          ('news_train_labels', news_train)):
    np.save(label_file, split.target)

### Step 2 - Tokenize the data sets
This can take quite some time: 

In [5]:
# Run the project tokenizer over every document of both splits.
tok_twenty_train = list(map(tokenize_text, news_train.data))  # Tokenized train data
tok_twenty_test = list(map(tokenize_text, news_test.data))    # Tokenized test data

# Once a split has been tokenized completely (one token list per document),
# the raw dataset object is no longer needed and is dropped to save RAM.
# Note: because of the `del`, this cell cannot be re-run without reloading the data.
if len(news_train.data) == len(tok_twenty_train):
    print("Tokenization of train set OK; original data removed.")
    del news_train

if len(news_test.data) == len(tok_twenty_test):
    print("Tokenization of test set OK; original data removed.")
    del news_test

Tokenization of train set OK; original data removed.
Tokenization of test set OK; original data removed.


In [14]:
# Cache both tokenized splits on disk (one document per CSV row) so that the
# slow tokenization step does not have to be repeated.
for csv_path, token_rows in (("tok_twenty_train.csv", tok_twenty_train),
                             ("tok_twenty_test.csv", tok_twenty_test)):
    with open(csv_path, mode="w", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file).writerows(token_rows)


In [3]:
# Read the cached tokenized train and test sets back from disk and build the
# combined ("total") corpus from them.
tok_twenty_train = []  # Tokenized train documents (one token list per document).
tok_twenty_test = []   # Tokenized test documents.

# The files were written with encoding="utf-8" and newline="", so they are
# opened the same way here. Relying on the platform default encoding can
# corrupt or reject non-ASCII tokens (e.g. on Windows), and the csv module
# expects newline="" for files it reads.
with open('tok_twenty_train.csv', mode='r', newline='', encoding='utf-8') as csvfile:
    tok_twenty_train.extend(csv.reader(csvfile, delimiter=','))

with open('tok_twenty_test.csv', mode='r', newline='', encoding='utf-8') as csvfile:
    tok_twenty_test.extend(csv.reader(csvfile, delimiter=','))

# Train documents first, then test documents: the same order in which the
# labels of the two splits are concatenated later on.
tok_twenty_tot = tok_twenty_train + tok_twenty_test

To evaluate the different models, the labels of the total (train + test) dataset also have to be created:

In [4]:
# Build the label vector of the combined (train + test) corpus.

# Labels that were stored earlier for each split.
arr1 = np.load('news_train_labels.npy')
arr2 = np.load('news_test_labels.npy')

# Train labels first, then test labels, flattened into a single 1-D array.
arr3 = np.concatenate((arr1, arr2), axis=None)

# Write the combined labels to disk for later usage and read them back.
np.save('twenty_labels_tot', arr3)
twenty_labels_tot = np.load('twenty_labels_tot.npy')

In [5]:
# Check that the combined label vector has the expected length.
n_train_labels = len(arr1)
n_test_labels = len(arr2)
# Measure the combined array itself; summing the two inputs again would
# always "succeed" and verify nothing.
n_total_labels = len(twenty_labels_tot)

print("Train set: {}".format(n_train_labels))
print("Test set: {}".format(n_test_labels))
print("Total: {}".format(n_total_labels))

# Fail loudly if the concatenation went wrong or if labels and documents are
# misaligned; every later ARI/NMI score relies on a one-to-one correspondence.
assert n_total_labels == n_train_labels + n_test_labels, \
    "Combined label array does not contain all train and test labels."
assert n_total_labels == len(tok_twenty_tot), \
    "Number of labels differs from the number of tokenized documents."

Train set: 11314
Test set: 7532
Total: 18846


## Clustering

### Fixed parameters

In [6]:
# --- Feature extraction ---
n_feat = 5000      # Number of features (vocabulary cap of the count vectorizer).
n_top = 50         # Number of topics for LSI and LDA;
                   # vector dimension for cBow and PV.
n_top_low = 20     # For computations with 20-dim. feat. vectors.
n_top_high = 100   # For computations with 100-dim. feat. vectors
                   # (the setting used by the model cells below).

# --- Mixture models and clustering ---
mix_comp = 15      # Mixture components (GMM / moVMF).
K = 20             # Number of clusters for k-means.

### 1. TF-IDF

Repeat clustering 10 times (running the code takes some time).

The `evaluate_cluster()` call in the cell below had to be changed, as no label argument was defined within the `evaluate_cluster()` function.
Unfortunately, no result files from the original publication are available for this part of the code.

In [36]:
### Try several clustering runs and store results.

# Convert the tokenized documents into a matrix of token counts.
# NOTE(review): CountVectorizer produces raw term counts; no TF-IDF weighting
# is applied anywhere in this cell although the section is titled "TF-IDF".
# Confirm whether this matches the original experiment.
count_vectorizer_TFIDF = CountVectorizer(tokenizer=identity_tokenizer,
                                         lowercase=False, max_features=n_feat)

# Matrix of shape len(data) x #words.
twenty_features_TFIDF = count_vectorizer_TFIDF.fit_transform(tok_twenty_tot)

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

# 10 independent k-means fits per initialisation method.
for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_TFIDF = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_TFIDF.fit(twenty_features_TFIDF)

        # Score the clustering against the true newsgroup labels.
        ARI, NMI = evaluate_cluster(km_TFIDF, twenty_labels_tot)

        print("Iteration number: {}".format(km_TFIDF.n_iter_))
        print("Clustering performance for TF-IDF:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run; scores are stored in percent.
df_TFIDF = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                        columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

# to_csv does not create missing directories, so make sure the target exists;
# otherwise the whole (long) run is lost on the final line.
os.makedirs('results', exist_ok=True)
df_TFIDF.round(3).to_csv('results/TFIDF_clust.csv', header=True, index=False)

Processing init = random.
Processing i = 0.
Iteration number: 62
Clustering performance for TF-IDF:
ARI: 0.575
NMI: 3.725
Processing i = 1.
Iteration number: 72
Clustering performance for TF-IDF:
ARI: 0.313
NMI: 3.242
Processing i = 2.
Iteration number: 80
Clustering performance for TF-IDF:
ARI: 0.551
NMI: 3.720
Processing i = 3.
Iteration number: 86
Clustering performance for TF-IDF:
ARI: 0.571
NMI: 3.767
Processing i = 4.
Iteration number: 67
Clustering performance for TF-IDF:
ARI: 0.564
NMI: 3.743
Processing i = 5.
Iteration number: 61
Clustering performance for TF-IDF:
ARI: 0.317
NMI: 3.319
Processing i = 6.
Iteration number: 86
Clustering performance for TF-IDF:
ARI: 0.617
NMI: 3.777
Processing i = 7.
Iteration number: 62
Clustering performance for TF-IDF:
ARI: 0.365
NMI: 3.425
Processing i = 8.
Iteration number: 86
Clustering performance for TF-IDF:
ARI: 0.317
NMI: 3.295
Processing i = 9.
Iteration number: 88
Clustering performance for TF-IDF:
ARI: 0.566
NMI: 3.698
Processing ini

In [37]:
# Show the per-run scores of the TF-IDF section (ARI/NMI in percent, one row per run).
df_TFIDF

Unnamed: 0,ARI_random,NMI_random,ARI_kmeans,NMI_kmeans
0,0.575,3.725,0.011,1.117
1,0.313,3.242,0.011,1.056
2,0.551,3.72,0.011,1.109
3,0.571,3.767,0.011,1.094
4,0.564,3.743,0.011,1.083
5,0.317,3.319,0.018,0.988
6,0.617,3.777,0.011,1.042
7,0.365,3.425,0.011,1.052
8,0.317,3.295,0.011,1.052
9,0.566,3.698,0.011,1.087


### Generate dictionary and corpus

In [40]:
# Build the Gensim dictionary (token <-> id mapping) from the combined corpus.
dct = corpora.Dictionary(tok_twenty_tot)
# Encode every document in the bag-of-words representation Gensim works with.
corpus_twenty = list(map(dct.doc2bow, tok_twenty_tot))

### 2. LSI

In [41]:
# Run LSI model to get topic modelling.
# Requires `dct` and `corpus_twenty` from the "Generate dictionary and corpus" cell.
lsi_model_twenty = LsiModel(corpus=corpus_twenty, num_topics=n_top_high, id2word=dct)

# Converting topics to feature vectors.
# Each document is projected into the LSI topic space; its n_top_high topic
# weights form the feature vector. (LSI weights are signed projections, not a
# probability distribution.) corpus2dense returns a topics x documents array,
# hence the transpose to get one row per document.
feat_vecs_twenty = matutils.corpus2dense(lsi_model_twenty[corpus_twenty], num_terms=n_top_high).T.tolist()

### Try several clustering runs and store results.

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

# 10 independent k-means fits per initialisation method.
for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_LSI = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_LSI.fit(feat_vecs_twenty)

        # Score the clustering against the true newsgroup labels.
        ARI, NMI = evaluate_cluster(km_LSI, twenty_labels_tot)

        print("Iteration number: {}".format(km_LSI.n_iter_))
        print("Clustering performance for LSI:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run; scores are stored in percent.
df_LSI = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                      columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
df_LSI.round(3).to_csv('results/LSI_clust_100topics.csv', header=True, index=False)

Processing init = random.
Processing i = 0.
Iteration number: 63
Clustering performance for LSI:
ARI: 0.697
NMI: 3.854
Processing i = 1.
Iteration number: 81
Clustering performance for LSI:
ARI: 0.608
NMI: 3.802
Processing i = 2.
Iteration number: 64
Clustering performance for LSI:
ARI: 0.716
NMI: 3.916
Processing i = 3.
Iteration number: 69
Clustering performance for LSI:
ARI: 0.544
NMI: 3.731
Processing i = 4.
Iteration number: 78
Clustering performance for LSI:
ARI: 0.610
NMI: 3.806
Processing i = 5.
Iteration number: 78
Clustering performance for LSI:
ARI: 0.311
NMI: 3.291
Processing i = 6.
Iteration number: 73
Clustering performance for LSI:
ARI: 0.721
NMI: 4.080
Processing i = 7.
Iteration number: 60
Clustering performance for LSI:
ARI: 0.591
NMI: 3.847
Processing i = 8.
Iteration number: 64
Clustering performance for LSI:
ARI: 0.589
NMI: 3.840
Processing i = 9.
Iteration number: 75
Clustering performance for LSI:
ARI: 0.617
NMI: 3.753
Processing init = k-means++.
Processing i = 

In [42]:
# Show the per-run LSI scores (ARI/NMI in percent, one row per run).
df_LSI

Unnamed: 0,ARI_random,NMI_random,ARI_kmeans,NMI_kmeans
0,0.697289,3.854219,0.018136,0.956284
1,0.608305,3.801824,0.010717,1.07863
2,0.71568,3.915953,0.010826,1.141271
3,0.543538,3.731247,0.010673,1.083788
4,0.610418,3.806324,0.028034,1.536651
5,0.310731,3.291402,0.010934,1.095184
6,0.721423,4.079949,0.010556,1.110189
7,0.591261,3.847428,0.010708,1.019375
8,0.589166,3.840464,0.010802,1.027931
9,0.6167,3.75297,0.010474,1.051045


### 3. LDA

In [7]:
# Run LDA model to get topic modelling.
# Requires `dct` and `corpus_twenty` from the "Generate dictionary and corpus"
# cell; after a kernel restart that cell has to be run again first, otherwise
# this cell stops with a NameError.
lda_model_twenty = LdaMulticore(corpus=corpus_twenty, num_topics=n_top_high, id2word=dct,
                                passes=60, workers=6)

# Save model
# lda_model_twenty.save("models/LDA/LDA_clustering.model")

# Converting topics to feature vectors
# The probability distribution over the topics for a specific document
# is that document's feature vector.

feat_vecs_twenty = []

for bow in corpus_twenty:
    doc_topics = lda_model_twenty.get_document_topics(bow, minimum_probability=0.0)
    # doc_topics is a sparse list of (topic_id, probability) pairs. Expanding it
    # by topic id (instead of indexing by list position) keeps every vector at
    # length n_top_high and correctly aligned even if a topic is missing from
    # the returned list.
    feat_vecs_twenty.append(matutils.sparse2full(doc_topics, n_top_high))

### Try several clustering runs and store results.

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

# 10 independent k-means fits per initialisation method.
for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_LDA = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_LDA.fit(feat_vecs_twenty)

        # Score the clustering against the true newsgroup labels.
        ARI, NMI = evaluate_cluster(km_LDA, twenty_labels_tot)

        print("Iteration number: {}".format(km_LDA.n_iter_))
        print("Clustering performance for LDA:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run; scores are stored in percent.
df_LDA = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                      columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
df_LDA.round(3).to_csv('results/LDA_clust_100topics.csv', header=True, index=False)

NameError: name 'corpus_twenty' is not defined

In [None]:
# Show the per-run LDA scores (ARI/NMI in percent, one row per run).
df_LDA

### 4. cBow

In [8]:
## Set-up w2v model.

cores = 6      # Threads used for training

# Initialize model.
w2v_model_twenty = Word2Vec(size=n_top_high, window=5, min_count=1, workers=cores)

# Build the vocabulary.
w2v_model_twenty.build_vocab(tok_twenty_tot)

# Train model.
w2v_model_twenty.train(tok_twenty_tot, total_examples=w2v_model_twenty.corpus_count, epochs=60)

# Save model. The target directory is created first so that a fresh checkout
# does not fail here after the long training run.
os.makedirs("models/cBow", exist_ok=True)
w2v_model_twenty.save("models/cBow/cBow_clustering_100topics.model")

# Normalise the word vectors in place. Note that the model file above was
# written before this step.
w2v_model_twenty.init_sims(replace=True)

# Getting feature vectors.
# NOTE(review): word_averaging_list is a project helper; presumably it averages
# the word vectors of each document - confirm in utils.
twenty_w2v_aver = word_averaging_list(w2v_model_twenty.wv, tok_twenty_tot)


### Try several clustering runs and store results.

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

# 10 independent k-means fits per initialisation method.
for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_cBow = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_cBow.fit(twenty_w2v_aver)

        # Score the clustering against the true newsgroup labels.
        ARI, NMI = evaluate_cluster(km_cBow, twenty_labels_tot)

        print("Iteration number: {}".format(km_cBow.n_iter_))
        print("Clustering performance for cBow:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run; scores are stored in percent.
df_cBow = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                       columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
df_cBow.round(3).to_csv('results/cBow_clust_100topics.csv', header=True, index=False)

Processing init = random.
Processing i = 0.
Iteration number: 91
Clustering performance for cBow:
ARI: 22.426
NMI: 41.356
Processing i = 1.
Iteration number: 80
Clustering performance for cBow:
ARI: 24.587
NMI: 43.584
Processing i = 2.
Iteration number: 62
Clustering performance for cBow:
ARI: 24.608
NMI: 43.677
Processing i = 3.
Iteration number: 49
Clustering performance for cBow:
ARI: 24.707
NMI: 43.640
Processing i = 4.
Iteration number: 66
Clustering performance for cBow:
ARI: 24.570
NMI: 43.698
Processing i = 5.
Iteration number: 88
Clustering performance for cBow:
ARI: 24.540
NMI: 43.626
Processing i = 6.
Iteration number: 200
Clustering performance for cBow:
ARI: 24.658
NMI: 43.624
Processing i = 7.
Iteration number: 132
Clustering performance for cBow:
ARI: 24.530
NMI: 43.580
Processing i = 8.
Iteration number: 115
Clustering performance for cBow:
ARI: 24.631
NMI: 43.565
Processing i = 9.
Iteration number: 70
Clustering performance for cBow:
ARI: 24.722
NMI: 43.633
Processing 

In [9]:
# Show the per-run cBow scores (ARI/NMI in percent, one row per run).
df_cBow

Unnamed: 0,ARI_random,NMI_random,ARI_kmeans,NMI_kmeans
0,22.425526,41.35598,24.072269,42.789966
1,24.587096,43.584043,24.689664,43.660655
2,24.608407,43.677179,24.659662,43.625171
3,24.706912,43.640046,24.680484,43.635702
4,24.569937,43.698164,24.332699,42.836343
5,24.539958,43.626031,24.654819,43.58722
6,24.658053,43.624297,23.108998,41.04448
7,24.530405,43.579807,24.651223,43.594205
8,24.630834,43.564513,24.585026,43.560738
9,24.721509,43.632774,24.555304,43.613814


### 5. PV models

In [10]:
# For Doc2Vec data need to be tokenized + tagged: every document becomes a
# TaggedDocument whose tag is its position in the combined corpus.
tagged_tok_twenty_tot = [TaggedDocument(words=sent, tags=[j])
                         for j, sent in enumerate(tok_twenty_tot)]

# Set up d2v model.
cores = 6      # Threads used for training
assert gensim.models.doc2vec.FAST_VERSION > -1, "Too slow otherwise"

# Initialize 2 models: PV-DBOW and PV-DM.
d2v_models = [
    # PV-DBOW (dm=0).
    Doc2Vec(dm=0, vector_size=n_top_high, window=5, min_count=1, sample=0, workers=cores),
    # PV-DM (dm=1) with default averaging.
    Doc2Vec(dm=1, vector_size=n_top_high, window=5, min_count=1, sample=0, workers=cores)
]

# Result file of each model, in the same order as d2v_models.
pv_result_files = [
    'results/PV_DBOW_clust_30ep_100topics.csv',
    'results/PV_DM_clust_30ep_100topics.csv',
]

# Build the vocabulary
for model in d2v_models:
    model.build_vocab(tagged_tok_twenty_tot)
    print("%s vocabulary scanned and state initialized" % model)

# Train the models.
for model in d2v_models:
    print("Training %s" % model)
    model.train(tagged_tok_twenty_tot, total_examples=model.corpus_count, epochs=30)


### Try several clustering runs and store results.

init_methods = ['random', 'k-means++']

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)

for model, result_file in zip(d2v_models, pv_result_files):
    # Make the log say which of the two models the following scores belong to.
    print("Clustering with %s" % model)

    ARIs = [[], []]    # Store ARI values: random, k-means++ init.
    NMIs = [[], []]    # Store NMI values: random, k-means++ init.

    # Infer a vector for every document with the trained model.
    feat_vecs_twenty = [model.infer_vector(doc.words) for doc in tagged_tok_twenty_tot]

    # 10 independent k-means fits per initialisation method.
    for j, ini in enumerate(init_methods):
        print("Processing init = {}.".format(ini))
        for i in range(10):
            print("Processing i = {}.".format(i))

            # Initialize the clustering alg.
            km_PV = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
            km_PV.fit(feat_vecs_twenty)

            # Score the clustering against the true newsgroup labels.
            ARI, NMI = evaluate_cluster(km_PV, twenty_labels_tot)

            print("Iteration number: {}".format(km_PV.n_iter_))
            print("Clustering performance for PV:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

            ARIs[j].append(ARI)
            NMIs[j].append(NMI)

    # Use a dedicated name for the PV results: storing them in `df_cBow` would
    # silently replace the cBow result table of the previous section.
    df_PV = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                         columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])
    df_PV.round(3).to_csv(result_file, header=True, index=False)

Doc2Vec(dbow,d100,n5,t6) vocabulary scanned and state initialized
Doc2Vec(dm/m,d100,n5,w5,t6) vocabulary scanned and state initialized
Training Doc2Vec(dbow,d100,n5,t6)
Training Doc2Vec(dm/m,d100,n5,w5,t6)
Processing init = random.
Processing i = 0.
Iteration number: 50
Clustering performance for PV:
ARI: 44.639
NMI: 62.639
Processing i = 1.
Iteration number: 72
Clustering performance for PV:
ARI: 47.141
NMI: 63.701
Processing i = 2.
Iteration number: 34
Clustering performance for PV:
ARI: 43.807
NMI: 62.142
Processing i = 3.
Iteration number: 103
Clustering performance for PV:
ARI: 45.382
NMI: 63.326
Processing i = 4.
Iteration number: 58
Clustering performance for PV:
ARI: 47.547
NMI: 64.379
Processing i = 5.
Iteration number: 85
Clustering performance for PV:
ARI: 45.722
NMI: 63.586
Processing i = 6.
Iteration number: 69
Clustering performance for PV:
ARI: 45.858
NMI: 64.352
Processing i = 7.
Iteration number: 59
Clustering performance for PV:
ARI: 45.932
NMI: 64.443
Processing i = 

### 6. FV-GMM

In [11]:
# Load the cBow model that the cBow section saved to disk.
# NOTE(review): that file was written before init_sims(replace=True) was called,
# so this presumably restores the un-normalised word vectors - confirm that this
# is what the FV sections below are meant to use.
w2v_model_twenty = Word2Vec.load("models/cBow/cBow_clustering_100topics.model")

In [12]:
# Fit a diagonal-covariance Gaussian mixture with mix_comp components on the
# word vectors of the loaded cBow model (fixed seed for a reproducible fit).
gmm_twenty = mixture.GaussianMixture(n_components=mix_comp, covariance_type='diag', max_iter=200,
                                     n_init=5, random_state=22, reg_covar=1e-05)
print("Fitting GMM...")

gmm_twenty.fit(w2v_model_twenty.wv.vectors)

print("GMM iteration number: {}.".format(gmm_twenty.n_iter_))

# One feature vector per document.
# NOTE(review): BoWE_doc and FV_GMM are project helpers; presumably they build the
# bag of word embeddings of a document and its Fisher vector w.r.t. the GMM - confirm in utils.
FV_twenty = [FV_GMM(BoWE_doc(w2v_model_twenty.wv, doc_tokens), gmm_twenty) for doc_tokens in tok_twenty_tot]

### Try several clustering runs and store results.

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

# 10 independent k-means fits per initialisation method.
for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_FV_GMM = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_FV_GMM.fit(FV_twenty)

        # Score the clustering against the true newsgroup labels.
        ARI, NMI = evaluate_cluster(km_FV_GMM, twenty_labels_tot)

        print("Iteration number: {}".format(km_FV_GMM.n_iter_))
        print("Clustering performance for FV-GMM:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run; scores are stored in percent.
df_FV_GMM = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                         columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
df_FV_GMM.round(3).to_csv('results/FV_GMM_clust_100topics.csv', header=True, index=False)

Fitting GMM...
GMM iteration number: 178.
Processing init = random.
Processing i = 0.
Iteration number: 160
Clustering performance for FV-GMM:
ARI: 0.777
NMI: 3.912
Processing i = 1.
Iteration number: 183
Clustering performance for FV-GMM:
ARI: 0.750
NMI: 3.873
Processing i = 2.
Iteration number: 136
Clustering performance for FV-GMM:
ARI: 0.777
NMI: 3.909
Processing i = 3.
Iteration number: 139
Clustering performance for FV-GMM:
ARI: 0.777
NMI: 3.909
Processing i = 4.
Iteration number: 200
Clustering performance for FV-GMM:
ARI: 0.800
NMI: 3.831
Processing i = 5.
Iteration number: 200
Clustering performance for FV-GMM:
ARI: 0.798
NMI: 3.823
Processing i = 6.
Iteration number: 143
Clustering performance for FV-GMM:
ARI: 0.782
NMI: 3.929
Processing i = 7.
Iteration number: 152
Clustering performance for FV-GMM:
ARI: 0.777
NMI: 3.909
Processing i = 8.
Iteration number: 159
Clustering performance for FV-GMM:
ARI: 0.777
NMI: 3.909
Processing i = 9.
Iteration number: 137
Clustering performa

In [None]:
# Show the per-run FV-GMM scores (ARI/NMI in percent, one row per run).
df_FV_GMM

### 7. FV-moVMF

In [None]:
# Fit a mixture of von Mises-Fisher distributions with mix_comp components on
# the normalised word vectors (fixed seed for a reproducible fit).
# Requires `w2v_model_twenty` from the cell that loads the cBow model.
vmf_twenty = VonMisesFisherMixture(n_clusters=mix_comp, posterior_type='soft', max_iter=300,
                                   n_init=6, n_jobs=6, copy_x=True, normalize=True, random_state=22)

print("Fitting moVMF...")

vmf_twenty.fit(normalize(w2v_model_twenty.wv.vectors))

# One feature vector per document.
# NOTE(review): BoWE_doc and FV_moVMF are project helpers; presumably they build the
# bag of word embeddings of a document and its Fisher vector w.r.t. the moVMF - confirm in utils.
FV_twenty = [FV_moVMF(BoWE_doc(w2v_model_twenty.wv, doc_tokens), vmf_twenty) for doc_tokens in tok_twenty_tot]


### Try several clustering runs and store results.

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

# 10 independent k-means fits per initialisation method.
for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_FV_moVMF = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_FV_moVMF.fit(FV_twenty)

        # Score the clustering against the true newsgroup labels.
        ARI, NMI = evaluate_cluster(km_FV_moVMF, twenty_labels_tot)

        print("Iteration number: {}".format(km_FV_moVMF.n_iter_))
        print("Clustering performance for FV-moVMF:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run; scores are stored in percent.
df_FV_moVMF = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100),
                           columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
df_FV_moVMF.round(3).to_csv('results/FV_moVMF_clust_100topics.csv', header=True, index=False)

Fitting moVMF...


In [None]:
# Show the per-run FV-moVMF scores (ARI/NMI in percent, one row per run).
df_FV_moVMF