# Import modules needed

In [1]:
%%time
import sys
sys.path.append('..')

Wall time: 0 ns


In [2]:
%%time
from source.code.data_downloader import DataDownloader
from source.code.custom_tokenizer import CustomTokenizer
from source.code.word_to_vec_transformer import WordToVecTransformer
from source.code.doc_to_vec_transformer import Doc2VecTransformer



Wall time: 2.19 s


In [3]:
%%time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline

from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation as LDA

from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns

Wall time: 1 s


Use a single fixed random state across all notebook cells so that results are reproducible

In [4]:
%%time
# Single seed passed to every KMeans and LDA estimator created in this notebook.
random_state = 0

Wall time: 0 ns


# Read data

In [5]:
%%time
# Point the project's DataDownloader at the dataset directory (relative path).
data_downloader = DataDownloader('../data/datasets/')


INITIALIZING...
INITIALIZATION HAS BEEN COMPLETED


Wall time: 6 ms


In [6]:
%%time
# Request 100 documents per topic; `labels_true` is used further down as the
# ground truth for the clustering metrics.
documents, labels_true = data_downloader.extract_documents(docs_count_per_topic=100)

Files reading and documents extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:01<00:00, 13.49it/s]


Wall time: 1.52 s


# Describe pipelines

In [12]:
# End-to-end variants (document representation followed by KMeans), keyed by
# the name of the representation. The cells further down build the same
# representation steps one pipeline at a time and cluster them separately.
pipelines = dict(
    tfidf=Pipeline(steps=[
        ('tokenize', CustomTokenizer()),
        ('vectorize', CountVectorizer()),
        ('tfidf', TfidfTransformer(smooth_idf=True)),
        ('predict', KMeans(n_clusters=20, random_state=random_state)),
    ]),
    lda=Pipeline(steps=[
        ('tokenize', CustomTokenizer()),
        ('vectorize', CountVectorizer()),
        ('lda', LDA(
            n_components=100,
            random_state=random_state,
            verbose=10,
            learning_method='batch',
            n_jobs=-1,
        )),
        ('predict', KMeans(n_clusters=20, random_state=random_state)),
    ]),
    word2vec=Pipeline(steps=[
        ('tokenize', CustomTokenizer()),
        ('w2v', WordToVecTransformer(100)),
        ('predict', KMeans(n_clusters=20, random_state=random_state)),
    ]),
)

# TF-IDF documents representation

In [7]:
%%time
# Representation-only pipeline: tokenize -> term counts -> smoothed TF-IDF.
pipeline = Pipeline(steps=[
    ('tokenize', CustomTokenizer()),
    ('vectorize', CountVectorizer()),
    ('inverse', TfidfTransformer(smooth_idf=True)),
])

Wall time: 494 µs


In [8]:
%%time
# Fit all steps on the corpus and keep the TF-IDF document matrix.
# `pipeline` must be the TF-IDF pipeline from the cell directly above: the
# name is reassigned for the other representations later in the notebook.
tf_idf_documents = pipeline.fit_transform(documents)

Documents tokenization: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:33<00:00, 59.69it/s]


Wall time: 34.2 s


In [9]:
%%time
# Sanity check: dimensions of the TF-IDF matrix.
tf_idf_documents.shape

Wall time: 0 ns


(2000, 13800)

# LDA documents representation

In [10]:
%%time
# Representation-only pipeline: tokenize -> term counts -> LDA with 100
# components, batch learning, verbose progress and all cores (n_jobs=-1).
pipeline = Pipeline(steps=[
    ('tokenize', CustomTokenizer()),
    ('vectorize', CountVectorizer()),
    ('lda', LDA(
        n_components=100,
        random_state=random_state,
        verbose=10,
        learning_method='batch',
        n_jobs=-1,
    )),
])

Wall time: 1.5 ms


In [None]:
%%time
# Fit all steps on the corpus and keep the LDA document representation.
# `pipeline` must be the LDA pipeline from the cell directly above (the name
# is shared with the other representations).
lda_documents = pipeline.fit_transform(documents)

In [None]:
%%time
# Sanity check: dimensions of the LDA document representation.
lda_documents.shape

# Word2Vec documents representation

In [None]:
%%time
# Representation-only pipeline: tokenize -> project Word2Vec transformer.
# NOTE(review): 100 is presumably the embedding size - confirm against
# WordToVecTransformer.
pipeline = Pipeline(steps=[
    ('tokenize', CustomTokenizer()),
    ('w2v', WordToVecTransformer(100)),
])

In [None]:
%%time
# Fit all steps on the corpus and keep the Word2Vec document representation.
# `pipeline` must be the Word2Vec pipeline from the cell directly above (the
# name is shared with the other representations).
w2v_documents = pipeline.fit_transform(documents)

In [None]:
%%time
# Sanity check: dimensions of the Word2Vec document representation.
w2v_documents.shape

# K-Means clustering with a known number of clusters

## TF-IDF

In [None]:
%%time
# New, unfitted KMeans for the TF-IDF representation, seeded with the shared
# random_state. n_clusters=20 corresponds to 2000 documents at 100 per topic.
kmeans = KMeans(n_clusters=20, random_state=random_state)

In [None]:
%%time
# Cluster the TF-IDF vectors and keep the cluster id assigned to each document.
# `kmeans` is rebound for every representation, so run the cell above first.
labels_tf_idf_pred = kmeans.fit_predict(tf_idf_documents)

## LDA

In [None]:
%%time
# New, unfitted KMeans for the LDA representation, with the same settings as
# for the other representations.
kmeans = KMeans(n_clusters=20, random_state=random_state)

In [None]:
%%time
# Cluster the LDA vectors and keep the cluster id assigned to each document.
# `kmeans` is rebound for every representation, so run the cell above first.
labels_lda_pred = kmeans.fit_predict(lda_documents)

## W2V

In [None]:
%%time
# New, unfitted KMeans for the Word2Vec representation, with the same settings
# as for the other representations.
kmeans = KMeans(n_clusters=20, random_state=random_state)

In [None]:
%%time
# Cluster the Word2Vec vectors and keep the cluster id assigned to each document.
# `kmeans` is rebound for every representation, so run the cell above first.
labels_w2v_pred = kmeans.fit_predict(w2v_documents)

# Metrics calculation

## With ground-truth labels

### Confusion matrix

In [None]:
%%time
# Confusion matrix of the TF-IDF clustering against the ground-truth labels.
# Rows of `mat` are true labels and columns are predicted clusters, so the
# transpose puts the true label on the x-axis and the cluster on the y-axis.
# Cluster ids are arbitrary: a good clustering shows one dominant cell per
# row/column, not necessarily a diagonal.
mat = metrics.confusion_matrix(labels_true, labels_tf_idf_pred)
fig, ax = plt.subplots(figsize=(10, 10))
# annot=True is needed for fmt='d' to take effect (fmt only formats the
# per-cell annotations); without it the counts are never drawn.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', ax=ax)
# Title distinguishes this figure from the LDA and Word2Vec ones.
ax.set_title('K-Means on TF-IDF: confusion matrix')
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
plt.show()

In [None]:
%%time
# Confusion matrix of the LDA clustering against the ground-truth labels.
# Rows of `mat` are true labels and columns are predicted clusters, so the
# transpose puts the true label on the x-axis and the cluster on the y-axis.
# Cluster ids are arbitrary: a good clustering shows one dominant cell per
# row/column, not necessarily a diagonal.
mat = metrics.confusion_matrix(labels_true, labels_lda_pred)
fig, ax = plt.subplots(figsize=(10, 10))
# annot=True is needed for fmt='d' to take effect (fmt only formats the
# per-cell annotations); without it the counts are never drawn.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', ax=ax)
# Title distinguishes this figure from the TF-IDF and Word2Vec ones.
ax.set_title('K-Means on LDA: confusion matrix')
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
plt.show()

In [None]:
%%time
# Confusion matrix of the Word2Vec clustering against the ground-truth labels.
# Rows of `mat` are true labels and columns are predicted clusters, so the
# transpose puts the true label on the x-axis and the cluster on the y-axis.
# Cluster ids are arbitrary: a good clustering shows one dominant cell per
# row/column, not necessarily a diagonal.
mat = metrics.confusion_matrix(labels_true, labels_w2v_pred)
fig, ax = plt.subplots(figsize=(10, 10))
# annot=True is needed for fmt='d' to take effect (fmt only formats the
# per-cell annotations); without it the counts are never drawn.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', ax=ax)
# Title distinguishes this figure from the TF-IDF and LDA ones.
ax.set_title('K-Means on Word2Vec: confusion matrix')
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
plt.show()

### Adjusted rand score

In [None]:
%%time
# Adjusted Rand index of the TF-IDF clustering against the ground-truth labels.
metrics.adjusted_rand_score(labels_true, labels_tf_idf_pred)

In [None]:
%%time
# Adjusted Rand index of the LDA clustering against the ground-truth labels.
metrics.adjusted_rand_score(labels_true, labels_lda_pred)

In [None]:
%%time
# Adjusted Rand index of the Word2Vec clustering against the ground-truth labels.
metrics.adjusted_rand_score(labels_true, labels_w2v_pred)

### Mutual info score

In [None]:
%%time
# Mutual information between the ground-truth labels and the TF-IDF clusters.
metrics.mutual_info_score(labels_true, labels_tf_idf_pred)

In [None]:
%%time
# Mutual information between the ground-truth labels and the LDA clusters.
metrics.mutual_info_score(labels_true, labels_lda_pred)

In [None]:
%%time
# Mutual information between the ground-truth labels and the Word2Vec clusters.
metrics.mutual_info_score(labels_true, labels_w2v_pred)

### Adjusted mutual info score

In [None]:
%%time
# Adjusted mutual information of the TF-IDF clusters against the ground truth.
metrics.adjusted_mutual_info_score(labels_true, labels_tf_idf_pred) 

In [None]:
%%time
# Adjusted mutual information of the LDA clusters against the ground truth.
metrics.adjusted_mutual_info_score(labels_true, labels_lda_pred) 

In [None]:
%%time
# Adjusted mutual information of the Word2Vec clusters against the ground truth.
metrics.adjusted_mutual_info_score(labels_true, labels_w2v_pred) 

### Normalized mutual info score

In [None]:
%%time
# Normalized mutual information of the TF-IDF clusters against the ground truth.
metrics.normalized_mutual_info_score(labels_true, labels_tf_idf_pred)

In [None]:
%%time
# Normalized mutual information of the LDA clusters against the ground truth.
metrics.normalized_mutual_info_score(labels_true, labels_lda_pred)

In [None]:
%%time
# Normalized mutual information of the Word2Vec clusters against the ground truth.
metrics.normalized_mutual_info_score(labels_true, labels_w2v_pred)

## Without ground-truth labels

### Silhouette Coefficient

In [None]:
%%time
# Silhouette coefficient (Euclidean distance) of the TF-IDF clustering; it is
# computed from the features and predicted clusters only, without ground truth.
metrics.silhouette_score(tf_idf_documents, labels_tf_idf_pred, metric='euclidean')

In [None]:
%%time
# Silhouette coefficient (Euclidean distance) of the LDA clustering; it is
# computed from the features and predicted clusters only, without ground truth.
metrics.silhouette_score(lda_documents, labels_lda_pred, metric='euclidean')

In [None]:
%%time
# Silhouette coefficient (Euclidean distance) of the Word2Vec clustering; it is
# computed from the features and predicted clusters only, without ground truth.
metrics.silhouette_score(w2v_documents, labels_w2v_pred, metric='euclidean')

### Calinski-Harabasz Index

In [None]:
%%time
# Calinski-Harabasz index of the TF-IDF clustering (no ground truth needed).
# `calinski_harabaz_score` was a misspelled alias that scikit-learn deprecated
# and later removed (0.23); `calinski_harabasz_score` is the supported name.
# The TF-IDF matrix is converted to a dense array before scoring.
metrics.calinski_harabasz_score(tf_idf_documents.toarray(), labels_tf_idf_pred)

In [None]:
%%time
# Calinski-Harabasz index of the LDA clustering (no ground truth needed).
# `calinski_harabaz_score` was a misspelled alias that scikit-learn deprecated
# and later removed (0.23); `calinski_harabasz_score` is the supported name.
metrics.calinski_harabasz_score(lda_documents, labels_lda_pred)

In [None]:
%%time
# Calinski-Harabasz index of the Word2Vec clustering (no ground truth needed).
# `calinski_harabaz_score` was a misspelled alias that scikit-learn deprecated
# and later removed (0.23); `calinski_harabasz_score` is the supported name.
metrics.calinski_harabasz_score(w2v_documents, labels_w2v_pred)