# Import modules needed

In [None]:
%%time
import sys

# Put the parent directory on the module search path; the next cell imports
# from the `source` package, which presumably lives there.
parent_dir = '..'
sys.path.append(parent_dir)

In [None]:
%%time
from source.code.clustering_etl import CLUSTERINGETL

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline

from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation as LDA

from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns

Fix a single random state for the whole notebook so that results are reproducible

In [None]:
%%time
# Single seed shared by the stochastic estimators in this notebook: it is
# passed to both the LDA and the KMeans constructors further down.
random_state = 0

# Read data

In [None]:
%%time
# Path handed to the ETL helper, relative to the notebook's working directory.
datasets_dir = '../data/datasets/'
clustering_etl = CLUSTERINGETL(datasets_dir)

In [None]:
%%time
# Per-topic document count requested from the ETL helper.
# NOTE(review): presumably an upper bound per topic - confirm in CLUSTERINGETL.
docs_per_topic = 100
documents, labels_true = clustering_etl.extract_documents(
    docs_count_per_topic=docs_per_topic,
)

# TF-IDF document representation

In [None]:
%%time
# Two-stage text featurizer: term counting followed by TF-IDF re-weighting
# (with smoothed inverse document frequencies).
count_step = ('vectorize', CountVectorizer())
tfidf_step = ('inverse', TfidfTransformer(smooth_idf=True))
pipeline = Pipeline(steps=[count_step, tfidf_step])

In [None]:
%%time
# Fit the count/TF-IDF pipeline on the extracted documents and transform them
# in the same call.
tf_idf_documents = pipeline.fit_transform(documents)

In [None]:
%%time
# Echo the dimensions of the TF-IDF matrix (expected to be
# documents x vocabulary terms).
tf_idf_documents.shape

# Dimensionality reduction (LDA)

In [None]:
%%time
# Latent Dirichlet Allocation used here as a dimensionality reducer: every
# document is re-expressed over 100 components. Seeded with the notebook-wide
# random state.
lda = LDA(
    n_components=100,
    learning_method='batch',
    random_state=random_state,
    n_jobs=-1,
    verbose=10,
)

In [None]:
%%time
# Fit LDA and map every document onto the learned components.
# NOTE(review): the input is the TF-IDF-weighted matrix rather than raw term
# counts - confirm this is intended for LDA.
tf_idf_documents_reduced = lda.fit_transform(tf_idf_documents)

In [None]:
%%time
# Echo the dimensions of the reduced representation (expected to be
# documents x LDA components).
tf_idf_documents_reduced.shape

# K-Means Clustering with known clusters count

In [None]:
%%time
# The number of clusters is fixed up front (this section treats it as known);
# the estimator is seeded with the notebook-wide random state.
n_known_clusters = 20
kmeans = KMeans(n_clusters=n_known_clusters, random_state=random_state)

In [None]:
%%time
# Fit K-Means on the LDA-reduced documents and keep the cluster index assigned
# to each document.
labels_pred = kmeans.fit_predict(tf_idf_documents_reduced)

# Metrics calculation

## With ground-truth labels

### Confusion matrix

In [None]:
# Cross-tabulate ground-truth labels (rows) against predicted clusters (columns).
mat = metrics.confusion_matrix(labels_true, labels_pred)

# Plot the transpose so true labels run along x and predicted labels along y.
# NOTE(review): `fmt` only formats cell annotations, and annotations are not
# enabled here (no `annot=True`) - confirm whether they were meant to be shown.
ax = sns.heatmap(mat.T, square=True, fmt='d')
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
plt.show()

### Adjusted Rand score

In [None]:
# Adjusted Rand index between the ground-truth labels and the K-Means
# assignments; the cell echoes the resulting score.
metrics.adjusted_rand_score(labels_true, labels_pred)

### Mutual info score

In [None]:
# Mutual information between the ground-truth labels and the K-Means
# assignments (this variant is neither adjusted nor normalized).
metrics.mutual_info_score(labels_true, labels_pred)

### Adjusted mutual info score

In [None]:
# Adjusted mutual information between the ground-truth labels and the K-Means
# assignments.
metrics.adjusted_mutual_info_score(labels_true, labels_pred)

### Normalized mutual info score

In [None]:
# Normalized mutual information between the ground-truth labels and the
# K-Means assignments.
metrics.normalized_mutual_info_score(labels_true, labels_pred)

## Without ground-truth labels

### Silhouette Coefficient

In [None]:
# Silhouette coefficient of the predicted clustering, measured with Euclidean
# distances in the LDA-reduced space; no ground-truth labels are involved.
metrics.silhouette_score(
    X=tf_idf_documents_reduced,
    labels=labels_pred,
    metric='euclidean',
)

### Calinski-Harabasz Index

In [None]:
# Calinski-Harabasz index of the predicted clustering in the LDA-reduced space;
# no ground-truth labels are involved.
#
# scikit-learn originally exposed this metric under the misspelled name
# `calinski_harabaz_score`; that name was deprecated in 0.20 and removed in
# 0.23. Prefer the corrected name and fall back to the old one only on
# releases that predate the rename, so the cell runs on both.
try:
    calinski_harabasz_score = metrics.calinski_harabasz_score
except AttributeError:  # scikit-learn < 0.20
    calinski_harabasz_score = metrics.calinski_harabaz_score

calinski_harabasz_score(tf_idf_documents_reduced, labels_pred)