In [1]:
# Import a file containing all the needed functions.
# NOTE(review): the cells below use names such as `os`, `np`, `pd`, `csv`,
# `datasets`, `KMeans` and the gensim classes without importing them here, so
# they presumably come in through this wildcard import — confirm in `utils.py`.
from utils import *
import matplotlib.pyplot as plt

# Automatically reload changes made to `utils.py`.
%load_ext autoreload
%autoreload 2



ImportError: cannot import name '_k_means' from 'sklearn.cluster' (C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cluster\__init__.py)

# Import and pre-process the data

In [2]:
# Root folder of the 20 Newsgroups collection.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
DATA_DIR = "/storage/dataCollections/20_newsgroups/"

# Folders of the "by date" train and test splits of the 20 Newsgroups collection.
TWENTY_TRAIN = os.path.join(DATA_DIR, "20news-bydate/20news-bydate-train")
TWENTY_TEST = os.path.join(DATA_DIR, "20news-bydate/20news-bydate-test")

In [None]:
# Encoding used to decode every raw document file.
enc = 'latin-1'

# Load the train split first, then the test split, with the same encoding.
twenty_train, twenty_test = (
    datasets.load_files(split_dir, encoding=enc)
    for split_dir in (TWENTY_TRAIN, TWENTY_TEST)
)

In [5]:
# Count the documents of each split once, then report the sizes.
n_train_docs = len(twenty_train.filenames)
n_test_docs = len(twenty_test.filenames)

print("Dataset size:\n")
print("Train set: {}".format(n_train_docs))
print("Test set: {}".format(n_test_docs))
print("Total: {}".format(n_train_docs + n_test_docs))

Dataset size:

Train set: 11314
Test set: 7532
Total: 18846


In [6]:
# Persist the label arrays of both splits so they survive the later
# deletion of the raw datasets (test split first, then train split).
for labels_path, labels in (
    ('20news/twenty_test_labels', twenty_test.target),
    ('20news/twenty_train_labels', twenty_train.target),
):
    np.save(labels_path, labels)

### Pre-processing

Either pre-process the data from scratch **or** import the already-tokenized data (see the section below).

In [17]:
# Tokenize every raw document of the train and test splits.
tok_twenty_train = list(map(tokenize_text, twenty_train.data))
tok_twenty_test = list(map(tokenize_text, twenty_test.data))

# When the tokenized list has one entry per raw document, drop the raw
# dataset object to save RAM (the labels were saved to disk beforehand).
if len(tok_twenty_train) == len(twenty_train.data):
    print("Tokenization of train set OK; original data removed.")
    del twenty_train

if len(tok_twenty_test) == len(twenty_test.data):
    print("Tokenization of test set OK; original data removed.")
    del twenty_test

Tokenization of train set OK; original data removed.
Tokenization of test set OK; original data removed.


In [18]:
# Cache the tokenized splits on disk (tokenization takes time).
# Each CSV row holds the tokens of one document.
for csv_path, tokenized_docs in (
    ("20news/tok_twenty_train.csv", tok_twenty_train),
    ("20news/tok_twenty_test.csv", tok_twenty_test),
):
    with open(csv_path, mode="w", newline="") as f:
        csv.writer(f).writerows(tokenized_docs)

In [150]:
# Read tokenized train and test sets.
# Each CSV row is read back as the list of tokens of one document.
#
# The files are opened with newline='' because the csv module requires it for
# file objects handed to csv.reader; without it, newlines embedded inside
# quoted fields are not interpreted correctly. This also mirrors how the
# files are opened when they are written.

with open('20news/tok_twenty_train.csv', newline='') as csvfile:
    tok_twenty_train = list(csv.reader(csvfile, delimiter=','))  # Train set

with open('20news/tok_twenty_test.csv', newline='') as csvfile:
    tok_twenty_test = list(csv.reader(csvfile, delimiter=','))   # Test set

## Import pre-processed data and labels

In [2]:
# Read the tokenized full collection and its labels.
# Each CSV row is read back as the list of tokens of one document.
#
# The file is opened with newline='' because the csv module requires it for
# file objects handed to csv.reader; without it, newlines embedded inside
# quoted fields are not interpreted correctly.

with open('../../clustering/20news/tok_twenty_tot.csv', newline='') as csvfile:
    tok_twenty_tot = list(csv.reader(csvfile, delimiter=','))  # Total set

# Label array matching the documents above.
# NOTE(review): assumed to be in the same order as the CSV rows — confirm
# against the code that produced both files.
twenty_labels_tot = np.load('../../clustering/20news/twenty_labels_tot.npy')

#### Fixed parameters

In [3]:
n_top = 50     # Number of topics for LSI and LDA.
               # Vector dimension for cBow and PV.
               # NOTE(review): the cells visible in this notebook use
               # n_top_high instead — confirm whether n_top is still needed.
mix_comp = 15  # Mixture components (GMM n_components / moVMF n_clusters).
K = 20         # Number of clusters requested from k-means.
n_feat = 5000  # Number of features (max_features of the count vectorizer).

n_top_high = 100  # For computations with 100-dim. feat. vectors
                  # (LSI/LDA topics, Word2Vec and Doc2Vec vector size).
n_top_low = 20    # For computations with 20-dim. feat. vectors.

## 1. TF-IDF

Repeat clustering 10 times

In [None]:
### Try several clustering runs and store results.

# Convert a collection of text documents into a matrix of token counts.
# The documents are already tokenized, hence the identity tokenizer and
# lowercase=False.
# NOTE(review): CountVectorizer produces raw term counts, not TF-IDF weights,
# although this section is titled "TF-IDF" — confirm whether a TF-IDF
# weighting step is intended here.
count_vectorizer_TFIDF = CountVectorizer(tokenizer=identity_tokenizer, 
                                         lowercase=False, max_features=n_feat)

# Matrix of shape len(data) x #words.
twenty_features_TFIDF = count_vectorizer_TFIDF.fit_transform(tok_twenty_tot)

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_TFIDF = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_TFIDF.fit(twenty_features_TFIDF)

        # Score the clustering against the document labels; the labels must
        # be passed explicitly, as in every other section of this notebook.
        ARI, NMI = evaluate_cluster(km_TFIDF, twenty_labels_tot)

        print("Iteration number: {}".format(km_TFIDF.n_iter_))
        print("Clustering performance for TF-IDF:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run, scores expressed as percentages.
df_TFIDF = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100), 
                        columns =['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

df_TFIDF.round(3).to_csv('results/TFIDF_clust.csv', header=True, index=False)

## Generate dictionary and corpus

In [4]:
# Build the Gensim dictionary from the tokenized documents.
dct = corpora.Dictionary(tok_twenty_tot)
# Gensim represents each document as a bag of words; convert them all.
corpus_twenty = list(map(dct.doc2bow, tok_twenty_tot))

## 2. LSI

In [5]:
# Fit an LSI model with n_top_high topics on the bag-of-words corpus.
lsi_model_twenty = LsiModel(corpus=corpus_twenty, num_topics=n_top_high, id2word=dct)

# Turn the LSI representation of each document into a feature vector:
# densify the transformed corpus, then transpose so that each row is one
# document, and store the rows as plain lists.
lsi_corpus = lsi_model_twenty[corpus_twenty]
lsi_dense = matutils.corpus2dense(lsi_corpus, num_terms=n_top_high)
feat_vecs_twenty = lsi_dense.T.tolist()

### Try several clustering runs and store results.

init_methods = ['random', 'k-means++']

# One list of scores per init method, in the order of init_methods.
ARIs = [[] for _ in init_methods]
NMIs = [[] for _ in init_methods]

for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Build and fit the clustering alg. in one step.
        km_LSI = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200).fit(feat_vecs_twenty)

        ARI, NMI = evaluate_cluster(km_LSI, twenty_labels_tot)

        print("Iteration number: {}".format(km_LSI.n_iter_))
        print("Clustering performance for LSI:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run, one column per (metric, init) pair, as percentages.
lsi_scores = np.column_stack([ARIs[0], NMIs[0], ARIs[1], NMIs[1]]) * 100
df_LSI = pd.DataFrame(lsi_scores,
                      columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

df_LSI.round(3).to_csv('results/LSI_clust_100topics.csv', header=True, index=False)

Processing init = random.
Processing i = 0.




Iteration number: 49
Clustering performance for LSI:
ARI: 0.548
NMI: 6.163
Processing i = 1.




Iteration number: 58
Clustering performance for LSI:
ARI: 0.729
NMI: 6.492
Processing i = 2.




Iteration number: 54
Clustering performance for LSI:
ARI: 0.704
NMI: 9.767
Processing i = 3.




Iteration number: 50
Clustering performance for LSI:
ARI: 0.833
NMI: 6.104
Processing i = 4.




Iteration number: 83
Clustering performance for LSI:
ARI: 0.496
NMI: 6.285
Processing i = 5.




Iteration number: 51
Clustering performance for LSI:
ARI: 0.769
NMI: 6.497
Processing i = 6.




Iteration number: 74
Clustering performance for LSI:
ARI: 0.529
NMI: 6.292
Processing i = 7.




Iteration number: 70
Clustering performance for LSI:
ARI: 0.459
NMI: 7.154
Processing i = 8.


  return_n_iter=True)


Iteration number: 56
Clustering performance for LSI:
ARI: 0.464
NMI: 5.967
Processing i = 9.




Iteration number: 62
Clustering performance for LSI:
ARI: 0.543
NMI: 6.419
Processing init = k-means++.
Processing i = 0.




Iteration number: 34
Clustering performance for LSI:
ARI: 0.002
NMI: 2.027
Processing i = 1.




Iteration number: 14
Clustering performance for LSI:
ARI: 0.001
NMI: 1.793
Processing i = 2.




Iteration number: 8
Clustering performance for LSI:
ARI: 0.049
NMI: 2.947
Processing i = 3.




Iteration number: 11
Clustering performance for LSI:
ARI: 0.001
NMI: 1.737
Processing i = 4.




Iteration number: 19
Clustering performance for LSI:
ARI: 0.003
NMI: 1.983
Processing i = 5.




Iteration number: 17
Clustering performance for LSI:
ARI: 0.002
NMI: 2.088
Processing i = 6.




Iteration number: 14
Clustering performance for LSI:
ARI: 0.002
NMI: 1.640
Processing i = 7.




Iteration number: 14
Clustering performance for LSI:
ARI: 0.001
NMI: 1.825
Processing i = 8.




Iteration number: 25
Clustering performance for LSI:
ARI: 0.003
NMI: 2.056
Processing i = 9.
Iteration number: 16
Clustering performance for LSI:
ARI: 0.002
NMI: 1.712




## 3. LDA

In [6]:
# Run LDA model to get topic modelling.
lda_model_twenty = LdaMulticore(corpus=corpus_twenty, num_topics=n_top_high, id2word=dct,
                                passes=60, workers=6)

# Save model
# lda_model_twenty.save("models/LDA/LDA_clustering.model")

# Converting topics to feature vectors.
# The probability distribution of the topics for a specific document
# is its feature vector.
#
# get_document_topics returns a sparse list of (topic_id, probability)
# pairs. Each probability is placed at the index given by its topic id
# (absent topics stay 0) instead of relying on list position, so the vector
# always has n_top_high entries and stays aligned even if gensim omits a
# topic from the list.
feat_vecs_twenty = [
    matutils.sparse2full(
        lda_model_twenty.get_document_topics(bow, minimum_probability=0.0),
        n_top_high,
    )
    for bow in corpus_twenty
]

### Try several clustering runs and store results.

ARIs = [[], []]    # Store ARI values: random, k-means++ init.
NMIs = [[], []]    # Store NMI values: random, k-means++ init.

init_methods = ['random', 'k-means++']

for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Initialize the clustering alg.
        km_LDA = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
        km_LDA.fit(feat_vecs_twenty)

        ARI, NMI = evaluate_cluster(km_LDA, twenty_labels_tot)

        print("Iteration number: {}".format(km_LDA.n_iter_))
        print("Clustering performance for LDA:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run, scores expressed as percentages.
df_LDA = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100), 
                      columns =['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

df_LDA.round(3).to_csv('results/LDA_clust_100topics.csv', header=True, index=False)

Processing init = random.
Processing i = 0.




Iteration number: 27
Clustering performance for LDA:
ARI: 10.903
NMI: 33.526
Processing i = 1.




Iteration number: 63
Clustering performance for LDA:
ARI: 11.633
NMI: 33.969
Processing i = 2.




Iteration number: 17
Clustering performance for LDA:
ARI: 11.294
NMI: 34.406
Processing i = 3.




Iteration number: 81
Clustering performance for LDA:
ARI: 10.350
NMI: 33.444
Processing i = 4.




Iteration number: 24
Clustering performance for LDA:
ARI: 11.167
NMI: 34.298
Processing i = 5.




Iteration number: 40
Clustering performance for LDA:
ARI: 10.975
NMI: 33.396
Processing i = 6.




Iteration number: 18
Clustering performance for LDA:
ARI: 11.805
NMI: 35.356
Processing i = 7.




Iteration number: 48
Clustering performance for LDA:
ARI: 12.360
NMI: 34.354
Processing i = 8.




Iteration number: 74
Clustering performance for LDA:
ARI: 11.493
NMI: 33.799
Processing i = 9.




Iteration number: 30
Clustering performance for LDA:
ARI: 11.096
NMI: 32.784
Processing init = k-means++.
Processing i = 0.




Iteration number: 21
Clustering performance for LDA:
ARI: 11.989
NMI: 34.550
Processing i = 1.




Iteration number: 22
Clustering performance for LDA:
ARI: 10.800
NMI: 33.163
Processing i = 2.




Iteration number: 29
Clustering performance for LDA:
ARI: 10.223
NMI: 33.574
Processing i = 3.




Iteration number: 35
Clustering performance for LDA:
ARI: 10.633
NMI: 32.057
Processing i = 4.




Iteration number: 42
Clustering performance for LDA:
ARI: 11.602
NMI: 35.411
Processing i = 5.




Iteration number: 34
Clustering performance for LDA:
ARI: 11.200
NMI: 34.347
Processing i = 6.




Iteration number: 29
Clustering performance for LDA:
ARI: 10.638
NMI: 33.153
Processing i = 7.




Iteration number: 52
Clustering performance for LDA:
ARI: 11.199
NMI: 34.079
Processing i = 8.




Iteration number: 28
Clustering performance for LDA:
ARI: 11.138
NMI: 34.973
Processing i = 9.
Iteration number: 33
Clustering performance for LDA:
ARI: 11.184
NMI: 33.776




## 4. cBow

In [7]:
## Set-up w2v model.

cores = 6      # Threads used for training

# Create the model, scan the vocabulary, then train on the tokenized corpus.
w2v_model_twenty = Word2Vec(size=n_top_high, window=5, min_count=1, workers=cores)
w2v_model_twenty.build_vocab(tok_twenty_tot)
w2v_model_twenty.train(tok_twenty_tot, total_examples=w2v_model_twenty.corpus_count, epochs=60)

# Persist the trained model before init_sims(replace=True) modifies it.
w2v_model_twenty.save("models/cBow/cBow_clustering_100topics.model")

# NOTE(review): presumably normalises the word vectors in place — confirm
# against the gensim version in use.
w2v_model_twenty.init_sims(replace=True)

# One feature vector per document, built from its word vectors.
twenty_w2v_aver = word_averaging_list(w2v_model_twenty.wv, tok_twenty_tot)


### Try several clustering runs and store results.

init_methods = ['random', 'k-means++']

# One list of scores per init method, in the order of init_methods.
ARIs = [[] for _ in init_methods]
NMIs = [[] for _ in init_methods]

for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Build and fit the clustering alg. in one step.
        km_cBow = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200).fit(twenty_w2v_aver)

        ARI, NMI = evaluate_cluster(km_cBow, twenty_labels_tot)

        print("Iteration number: {}".format(km_cBow.n_iter_))
        print("Clustering performance for cBow:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run, one column per (metric, init) pair, as percentages.
cbow_scores = np.column_stack([ARIs[0], NMIs[0], ARIs[1], NMIs[1]]) * 100
df_cBow = pd.DataFrame(cbow_scores,
                       columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

df_cBow.round(3).to_csv('results/cBow_clust_100topics.csv', header=True, index=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Processing init = random.
Processing i = 0.




Iteration number: 82
Clustering performance for cBow:
ARI: 32.105
NMI: 51.542
Processing i = 1.




Iteration number: 64
Clustering performance for cBow:
ARI: 31.621
NMI: 51.341
Processing i = 2.




Iteration number: 68
Clustering performance for cBow:
ARI: 31.448
NMI: 50.895
Processing i = 3.




Iteration number: 68
Clustering performance for cBow:
ARI: 32.515
NMI: 51.609
Processing i = 4.




Iteration number: 71
Clustering performance for cBow:
ARI: 33.288
NMI: 51.995
Processing i = 5.




Iteration number: 67
Clustering performance for cBow:
ARI: 31.525
NMI: 51.080
Processing i = 6.




Iteration number: 88
Clustering performance for cBow:
ARI: 32.543
NMI: 51.560
Processing i = 7.




Iteration number: 89
Clustering performance for cBow:
ARI: 33.113
NMI: 52.090
Processing i = 8.




Iteration number: 39
Clustering performance for cBow:
ARI: 33.025
NMI: 52.047
Processing i = 9.




Iteration number: 83
Clustering performance for cBow:
ARI: 31.461
NMI: 51.185
Processing init = k-means++.
Processing i = 0.




Iteration number: 60
Clustering performance for cBow:
ARI: 31.834
NMI: 51.249
Processing i = 1.




Iteration number: 54
Clustering performance for cBow:
ARI: 32.751
NMI: 51.586
Processing i = 2.




Iteration number: 57
Clustering performance for cBow:
ARI: 33.098
NMI: 52.085
Processing i = 3.




Iteration number: 101
Clustering performance for cBow:
ARI: 32.214
NMI: 51.471
Processing i = 4.




Iteration number: 173
Clustering performance for cBow:
ARI: 30.867
NMI: 50.725
Processing i = 5.




Iteration number: 83
Clustering performance for cBow:
ARI: 32.727
NMI: 51.563
Processing i = 6.




Iteration number: 77
Clustering performance for cBow:
ARI: 33.062
NMI: 51.833
Processing i = 7.




Iteration number: 54
Clustering performance for cBow:
ARI: 33.164
NMI: 52.033
Processing i = 8.




Iteration number: 76
Clustering performance for cBow:
ARI: 31.888
NMI: 51.445
Processing i = 9.
Iteration number: 75
Clustering performance for cBow:
ARI: 32.354
NMI: 51.672




## 5. PV models

In [8]:
# For Doc2Vec data need to be tokenized + tagged.
# Each document is tagged with its position in the corpus.
tagged_tok_twenty_tot = [
    TaggedDocument(words=sent, tags=[j]) for j, sent in enumerate(tok_twenty_tot)
]

# Set up d2v model.
cores = 6      # Threads used for training
assert gensim.models.doc2vec.FAST_VERSION > -1, "Too slow otherwise"

# Initialize 2 models: PV-DBOW and PV-DM.
d2v_models = [
    # PV-DBOW (dm=0).
    Doc2Vec(dm=0, vector_size=n_top_high, window=5, min_count=1, sample=0, workers=cores),
    # PV-DM (dm=1) with default averaging.
    Doc2Vec(dm=1, vector_size=n_top_high, window=5, min_count=1, sample=0, workers=cores)
]

# Result file of each model, in the same order as d2v_models.
pv_result_files = [
    'results/PV_DBOW_clust_30ep_100topics.csv',
    'results/PV_DM_clust_30ep_100topics.csv',
]

# Build the vocabulary
for model in d2v_models:
    model.build_vocab(tagged_tok_twenty_tot)
    print("%s vocabulary scanned and state initialized" % model)

# Train the models.
for model in d2v_models: 
    print("Training %s" % model)
    model.train(tagged_tok_twenty_tot, total_examples=model.corpus_count, epochs=30)


### Try several clustering runs and store results.

init_methods = ['random', 'k-means++']

for model, result_file in zip(d2v_models, pv_result_files):
    ARIs = [[], []]    # Store ARI values: random, k-means++ init.
    NMIs = [[], []]    # Store NMI values: random, k-means++ init.

    # Infer one feature vector per document with the current model.
    feat_vecs_twenty = [model.infer_vector(doc.words) for doc in tagged_tok_twenty_tot]
    for j, ini in enumerate(init_methods):
        print("Processing init = {}.".format(ini))
        for i in range(10):
            print("Processing i = {}.".format(i))

            # Initialize the clustering alg.
            km_PV = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200)
            km_PV.fit(feat_vecs_twenty)

            ARI, NMI = evaluate_cluster(km_PV, twenty_labels_tot)

            print("Iteration number: {}".format(km_PV.n_iter_))
            print("Clustering performance for PV:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

            ARIs[j].append(ARI)
            NMIs[j].append(NMI)

    # Keep the PV results in their own DataFrame so that the cBow results
    # held in `df_cBow` are not overwritten by this section.
    df_PV = pd.DataFrame(np.multiply(list(zip(ARIs[0], NMIs[0], ARIs[1], NMIs[1])), 100), 
                         columns =['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])
    df_PV.round(3).to_csv(result_file, header=True, index=False)

Doc2Vec(dbow,d100,n5,t6) vocabulary scanned and state initialized
Doc2Vec(dm/m,d100,n5,w5,t6) vocabulary scanned and state initialized
Training Doc2Vec(dbow,d100,n5,t6)
Training Doc2Vec(dm/m,d100,n5,w5,t6)
Processing init = random.
Processing i = 0.




Iteration number: 33
Clustering performance for PV:
ARI: 50.103
NMI: 66.618
Processing i = 1.




Iteration number: 182
Clustering performance for PV:
ARI: 50.562
NMI: 66.564
Processing i = 2.




Iteration number: 52
Clustering performance for PV:
ARI: 48.029
NMI: 64.782
Processing i = 3.




Iteration number: 80
Clustering performance for PV:
ARI: 47.967
NMI: 64.746
Processing i = 4.




Iteration number: 53
Clustering performance for PV:
ARI: 49.104
NMI: 65.311
Processing i = 5.




Iteration number: 31
Clustering performance for PV:
ARI: 50.002
NMI: 66.449
Processing i = 6.




Iteration number: 90
Clustering performance for PV:
ARI: 49.674
NMI: 65.687
Processing i = 7.




Iteration number: 39
Clustering performance for PV:
ARI: 51.361
NMI: 66.705
Processing i = 8.




Iteration number: 43
Clustering performance for PV:
ARI: 50.273
NMI: 66.100
Processing i = 9.




Iteration number: 64
Clustering performance for PV:
ARI: 50.948
NMI: 66.375
Processing init = k-means++.
Processing i = 0.




Iteration number: 57
Clustering performance for PV:
ARI: 47.380
NMI: 64.407
Processing i = 1.




Iteration number: 52
Clustering performance for PV:
ARI: 48.126
NMI: 65.194
Processing i = 2.




Iteration number: 52
Clustering performance for PV:
ARI: 49.995
NMI: 65.701
Processing i = 3.




Iteration number: 36
Clustering performance for PV:
ARI: 50.728
NMI: 66.697
Processing i = 4.




Iteration number: 129
Clustering performance for PV:
ARI: 48.132
NMI: 65.169
Processing i = 5.




Iteration number: 70
Clustering performance for PV:
ARI: 50.267
NMI: 66.205
Processing i = 6.




Iteration number: 29
Clustering performance for PV:
ARI: 51.319
NMI: 66.462
Processing i = 7.




Iteration number: 58
Clustering performance for PV:
ARI: 52.462
NMI: 67.607
Processing i = 8.




Iteration number: 116
Clustering performance for PV:
ARI: 48.147
NMI: 65.166
Processing i = 9.




Iteration number: 47
Clustering performance for PV:
ARI: 54.003
NMI: 67.496
Processing init = random.
Processing i = 0.




Iteration number: 35
Clustering performance for PV:
ARI: 9.794
NMI: 40.886
Processing i = 1.




Iteration number: 95
Clustering performance for PV:
ARI: 9.803
NMI: 42.196
Processing i = 2.




Iteration number: 70
Clustering performance for PV:
ARI: 12.506
NMI: 40.188
Processing i = 3.




Iteration number: 73
Clustering performance for PV:
ARI: 9.657
NMI: 41.679
Processing i = 4.




Iteration number: 41
Clustering performance for PV:
ARI: 9.055
NMI: 39.258
Processing i = 5.




Iteration number: 51
Clustering performance for PV:
ARI: 10.309
NMI: 42.769
Processing i = 6.




Iteration number: 39
Clustering performance for PV:
ARI: 11.955
NMI: 38.798
Processing i = 7.




Iteration number: 49
Clustering performance for PV:
ARI: 12.511
NMI: 39.562
Processing i = 8.




Iteration number: 98
Clustering performance for PV:
ARI: 9.949
NMI: 42.287
Processing i = 9.




Iteration number: 44
Clustering performance for PV:
ARI: 9.669
NMI: 41.861
Processing init = k-means++.
Processing i = 0.




Iteration number: 80
Clustering performance for PV:
ARI: 9.480
NMI: 39.817
Processing i = 1.




Iteration number: 65
Clustering performance for PV:
ARI: 8.826
NMI: 39.654
Processing i = 2.




Iteration number: 47
Clustering performance for PV:
ARI: 9.314
NMI: 40.576
Processing i = 3.




Iteration number: 48
Clustering performance for PV:
ARI: 9.133
NMI: 40.035
Processing i = 4.




Iteration number: 57
Clustering performance for PV:
ARI: 9.175
NMI: 40.476
Processing i = 5.




Iteration number: 59
Clustering performance for PV:
ARI: 8.775
NMI: 39.458
Processing i = 6.




Iteration number: 27
Clustering performance for PV:
ARI: 8.299
NMI: 38.368
Processing i = 7.




Iteration number: 77
Clustering performance for PV:
ARI: 9.535
NMI: 40.658
Processing i = 8.




Iteration number: 41
Clustering performance for PV:
ARI: 8.867
NMI: 39.132
Processing i = 9.
Iteration number: 39
Clustering performance for PV:
ARI: 9.758
NMI: 41.577




## 6. FV-GMM

In [9]:
# Load the cBow model from the file it was saved to in the cBow section.
# That file was written before `init_sims(replace=True)` was called on the
# in-memory model, so this reloads the model in its state as saved.
w2v_model_twenty = Word2Vec.load("models/cBow/cBow_clustering_100topics.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [10]:
# Diagonal-covariance Gaussian mixture with mix_comp components, fitted on
# the word vectors of the cBow model (fixed random_state).
gmm_twenty = mixture.GaussianMixture(n_components=mix_comp, covariance_type='diag', max_iter=200,
                                     n_init=5, random_state=22, reg_covar=1e-05)
print("Fitting GMM...")

gmm_twenty.fit(w2v_model_twenty.wv.vectors)

print("GMM iteration number: {}.".format(gmm_twenty.n_iter_))

# One FV_GMM feature vector per document, computed from the BoWE_doc
# representation of its tokens and the fitted mixture.
FV_twenty = [
    FV_GMM(BoWE_doc(w2v_model_twenty.wv, doc_tokens), gmm_twenty)
    for doc_tokens in tok_twenty_tot
]

### Try several clustering runs and store results.

init_methods = ['random', 'k-means++']

# One list of scores per init method, in the order of init_methods.
ARIs = [[] for _ in init_methods]
NMIs = [[] for _ in init_methods]

for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Build and fit the clustering alg. in one step.
        km_FV_GMM = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200).fit(FV_twenty)

        ARI, NMI = evaluate_cluster(km_FV_GMM, twenty_labels_tot)

        print("Iteration number: {}".format(km_FV_GMM.n_iter_))
        print("Clustering performance for FV-GMM:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run, one column per (metric, init) pair, as percentages.
fv_gmm_scores = np.column_stack([ARIs[0], NMIs[0], ARIs[1], NMIs[1]]) * 100
df_FV_GMM = pd.DataFrame(fv_gmm_scores,
                         columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

df_FV_GMM.round(3).to_csv('results/FV_GMM_clust_100topics.csv', header=True, index=False)

Fitting GMM...
GMM iteration number: 189.
Processing init = random.
Processing i = 0.




Iteration number: 128
Clustering performance for FV-GMM:
ARI: 1.326
NMI: 10.684
Processing i = 1.




Iteration number: 188
Clustering performance for FV-GMM:
ARI: 1.591
NMI: 10.351
Processing i = 2.




Iteration number: 181
Clustering performance for FV-GMM:
ARI: 1.231
NMI: 11.242
Processing i = 3.




Iteration number: 200
Clustering performance for FV-GMM:
ARI: 1.613
NMI: 10.371
Processing i = 4.




Iteration number: 117
Clustering performance for FV-GMM:
ARI: 2.533
NMI: 13.622
Processing i = 5.




Iteration number: 190
Clustering performance for FV-GMM:
ARI: 1.591
NMI: 10.351
Processing i = 6.




Iteration number: 186
Clustering performance for FV-GMM:
ARI: 1.591
NMI: 10.351
Processing i = 7.




Iteration number: 198
Clustering performance for FV-GMM:
ARI: 1.588
NMI: 10.350
Processing i = 8.




Iteration number: 200
Clustering performance for FV-GMM:
ARI: 1.615
NMI: 10.371
Processing i = 9.




Iteration number: 112
Clustering performance for FV-GMM:
ARI: 2.697
NMI: 13.626
Processing init = k-means++.
Processing i = 0.




Iteration number: 29
Clustering performance for FV-GMM:
ARI: 0.484
NMI: 3.261
Processing i = 1.




Iteration number: 21
Clustering performance for FV-GMM:
ARI: 0.449
NMI: 3.552
Processing i = 2.




Iteration number: 65
Clustering performance for FV-GMM:
ARI: 0.489
NMI: 3.294
Processing i = 3.




Iteration number: 16
Clustering performance for FV-GMM:
ARI: 0.448
NMI: 3.535
Processing i = 4.




Iteration number: 20
Clustering performance for FV-GMM:
ARI: 0.442
NMI: 3.477
Processing i = 5.




Iteration number: 22
Clustering performance for FV-GMM:
ARI: 0.448
NMI: 3.462
Processing i = 6.




Iteration number: 13
Clustering performance for FV-GMM:
ARI: 0.443
NMI: 3.570
Processing i = 7.




Iteration number: 46
Clustering performance for FV-GMM:
ARI: 0.502
NMI: 3.420
Processing i = 8.




Iteration number: 14
Clustering performance for FV-GMM:
ARI: 0.443
NMI: 3.530
Processing i = 9.
Iteration number: 28
Clustering performance for FV-GMM:
ARI: 0.442
NMI: 3.475




## 7. FV-moVMF

In [11]:
# Soft-assignment von Mises-Fisher mixture with mix_comp components
# (fixed random_state), fitted on the normalized cBow word vectors.
vmf_twenty = VonMisesFisherMixture(n_clusters=mix_comp, posterior_type='soft', max_iter=300, 
                                   n_init=6, n_jobs=6, copy_x=True, normalize=True, random_state=22)

print("Fitting moVMF...")

vmf_twenty.fit(normalize(w2v_model_twenty.wv.vectors))

# One FV_moVMF feature vector per document, computed from the BoWE_doc
# representation of its tokens and the fitted mixture.
FV_twenty = [
    FV_moVMF(BoWE_doc(w2v_model_twenty.wv, doc_tokens), vmf_twenty)
    for doc_tokens in tok_twenty_tot
]


### Try several clustering runs and store results.

init_methods = ['random', 'k-means++']

# One list of scores per init method, in the order of init_methods.
ARIs = [[] for _ in init_methods]
NMIs = [[] for _ in init_methods]

for j, ini in enumerate(init_methods):
    print("Processing init = {}.".format(ini))
    for i in range(10):
        print("Processing i = {}.".format(i))

        # Build and fit the clustering alg. in one step.
        km_FV_moVMF = KMeans(n_clusters=K, n_init=10, init=ini, max_iter=200).fit(FV_twenty)

        ARI, NMI = evaluate_cluster(km_FV_moVMF, twenty_labels_tot)

        print("Iteration number: {}".format(km_FV_moVMF.n_iter_))
        print("Clustering performance for FV-moVMF:\nARI: {0:.3f}\nNMI: {1:.3f}".format(ARI*100, NMI*100))

        ARIs[j].append(ARI)
        NMIs[j].append(NMI)

# One row per run, one column per (metric, init) pair, as percentages.
fv_movmf_scores = np.column_stack([ARIs[0], NMIs[0], ARIs[1], NMIs[1]]) * 100
df_FV_moVMF = pd.DataFrame(fv_movmf_scores,
                           columns=['ARI_random', 'NMI_random', 'ARI_kmeans', 'NMI_kmeans'])

df_FV_moVMF.round(3).to_csv('results/FV_moVMF_clust_100topics.csv', header=True, index=False)

Fitting moVMF...
Processing init = random.
Processing i = 0.




Iteration number: 155
Clustering performance for FV-moVMF:
ARI: 1.497
NMI: 9.290
Processing i = 1.




Iteration number: 92
Clustering performance for FV-moVMF:
ARI: 1.507
NMI: 10.001
Processing i = 2.




Iteration number: 119
Clustering performance for FV-moVMF:
ARI: 1.603
NMI: 11.655
Processing i = 3.




Iteration number: 163
Clustering performance for FV-moVMF:
ARI: 1.444
NMI: 9.109
Processing i = 4.




Iteration number: 153
Clustering performance for FV-moVMF:
ARI: 1.420
NMI: 9.009
Processing i = 5.




Iteration number: 152
Clustering performance for FV-moVMF:
ARI: 1.465
NMI: 9.330
Processing i = 6.




Iteration number: 114
Clustering performance for FV-moVMF:
ARI: 1.481
NMI: 9.414
Processing i = 7.




Iteration number: 132
Clustering performance for FV-moVMF:
ARI: 1.490
NMI: 9.439
Processing i = 8.




Iteration number: 158
Clustering performance for FV-moVMF:
ARI: 1.449
NMI: 9.336
Processing i = 9.




Iteration number: 160
Clustering performance for FV-moVMF:
ARI: 1.307
NMI: 9.039
Processing init = k-means++.
Processing i = 0.




Iteration number: 27
Clustering performance for FV-moVMF:
ARI: 0.553
NMI: 3.263
Processing i = 1.




Iteration number: 24
Clustering performance for FV-moVMF:
ARI: 0.604
NMI: 3.249
Processing i = 2.




Iteration number: 22
Clustering performance for FV-moVMF:
ARI: 0.649
NMI: 3.335
Processing i = 3.




Iteration number: 73
Clustering performance for FV-moVMF:
ARI: 0.610
NMI: 3.255
Processing i = 4.




Iteration number: 29
Clustering performance for FV-moVMF:
ARI: 0.511
NMI: 3.324
Processing i = 5.




Iteration number: 46
Clustering performance for FV-moVMF:
ARI: 0.603
NMI: 3.310
Processing i = 6.




Iteration number: 46
Clustering performance for FV-moVMF:
ARI: 0.592
NMI: 3.279
Processing i = 7.




Iteration number: 39
Clustering performance for FV-moVMF:
ARI: 0.515
NMI: 3.238
Processing i = 8.




Iteration number: 30
Clustering performance for FV-moVMF:
ARI: 0.522
NMI: 3.271
Processing i = 9.
Iteration number: 33
Clustering performance for FV-moVMF:
ARI: 0.593
NMI: 3.284


