In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from get_nice_text import *

import pandas as pd
import re
import numpy as np

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering, DBSCAN, MiniBatchKMeans, MeanShift, SpectralClustering
from sklearn.mixture import GaussianMixture

In [2]:
def train_fast(df, Model, labels, **kwargs):
    """
    Cluster a sparse count matrix and score the result against known labels.

    The input is re-weighted with TF-IDF, an instance of ``Model`` built from
    ``**kwargs`` is fitted on the weighted matrix via ``fit_predict``, and the
    predicted assignments are compared with ``labels``.

    Parameters
    ----------
    df : sparse matrix
        Count matrix to cluster (e.g. the output of ``CountVectorizer``).
    Model : type
        Clustering estimator class; it must provide ``fit_predict``.
    labels : array-like
        Reference labels, passed as ``labels_true`` to the scoring function.
    **kwargs
        Forwarded unchanged to the ``Model`` constructor.

    Returns
    -------
    lab
        Cluster assignments returned by ``Model(**kwargs).fit_predict``.
    ret : dict
        Scores under the keys ``homogeneity_score``, ``completeness_score``
        and ``v_measure_score``.

    Example
    -------
    lab, ret = train_fast(df_count, KMeans, labels, n_clusters=8)
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    from sklearn.metrics.cluster import homogeneity_completeness_v_measure

    tfidf_matrix = TfidfTransformer().fit_transform(df)
    lab = Model(**kwargs).fit_predict(tfidf_matrix)

    # One call yields all three metrics; the individual *_score functions
    # each recompute the same comparison of the two labelings.
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(labels, lab)

    ret = {
        "homogeneity_score": homogeneity,
        "completeness_score": completeness,
        "v_measure_score": v_measure,
    }

    return lab, ret

In [5]:
# Load the corpus and its reference labels through the project helper module
# (both names arrive via `from get_nice_text import *`).
# NOTE(review): the return types are not visible here; `df` is later fed to
# CountVectorizer.fit_transform, so it is presumably an iterable of document
# strings rather than a DataFrame -- confirm.
df = get_nice_text()
# NOTE(review): the positional True is presumably the `merge_Bible` flag that
# later cells pass by keyword -- confirm against get_labels' signature.
labels = get_labels(True)
# Seed NumPy's global RNG. This runs after the data is loaded, so it can only
# affect code executed from this point on.
np.random.seed(123)

In [6]:
# Build the raw term-count matrix; English stop words are dropped from the
# vocabulary. `cv` is kept so the fitted vocabulary stays available.
cv = CountVectorizer(stop_words="english")
df_count = cv.fit_transform(df)

In [54]:
from sklearn.feature_selection import VarianceThreshold

# Feature filter that drops columns whose variance is not above
# 0.3 * (1 - 0.8) (about 0.06).
# NOTE(review): the p * (1 - p) shape resembles the Bernoulli-variance example
# in the scikit-learn feature-selection guide, which uses the same p on both
# sides (.8 * (1 - .8)); confirm that 0.3 is intentional.
sel = VarianceThreshold(threshold=0.3 * (1 - 0.8))

In [55]:
# Shape of the raw count matrix, before any feature selection.
print(df_count.shape)

# Fit the variance filter on the raw counts and keep only the columns it
# retains. The filter sees counts, not TF-IDF weights: train_fast applies
# TF-IDF later, on whatever matrix it is given.
df_high_var = sel.fit_transform(df_count)
print(df_high_var.shape)

(590, 8067)
(590, 494)


In [58]:
# Baseline: KMeans with 8 clusters on the full count matrix, scored against
# the labels returned by get_labels(merge_Bible=False).
lab, ret = train_fast(
    df_count,
    KMeans,
    labels=get_labels(merge_Bible=False),
    n_clusters=8,
    random_state=12,
)
ret

{'homogeneity_score': 0.481820054050061,
 'completeness_score': 0.4714029951506243,
 'v_measure_score': 0.4765546044932359}

In [59]:
# Same KMeans configuration as the baseline run, but on the variance-filtered
# matrix, so the two score dictionaries are directly comparable.
lab, ret = train_fast(
    df_high_var,
    KMeans,
    labels=get_labels(merge_Bible=False),
    n_clusters=8,
    random_state=12,
)
ret

{'homogeneity_score': 0.4798126853100102,
 'completeness_score': 0.4291710509743038,
 'v_measure_score': 0.45308118551615384}

In [42]:
# Sweep DBSCAN's `eps` from 0.1 to 9.9 in steps of 0.1 (Manhattan metric) on
# the variance-filtered matrix and print the scores for every setting.
# The recorded output shows homogeneity ~0 with completeness 1.0 for most
# values, which is what these metrics give when every sample receives the
# same cluster label.
#
# The reference labels do not depend on the loop variable, so fetch them once
# instead of on each of the 99 iterations.
# NOTE(review): this assumes get_labels(True) returns the same labels on every
# call -- confirm.
dbscan_labels = get_labels(True)
for i in range(1, 100):
    lab, scores = train_fast(df_high_var, DBSCAN, dbscan_labels, eps=i / 10, metric='manhattan')
    print(scores)

{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': 0.00591670181481978, 'completeness_score': 0.18240283455802805, 'v_measure_score': 0.01146161681410512}
{'homogeneity_score': 0.00591670181481978, 'completeness_score': 0.18240283455802805, 'v_measure_score': 0.01146161681410512}
{'homogeneity_score': 0.00591670181481978, 'co

{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_score': 1.0, 'v_measure_score': -8.474825258779785e-16}
{'homogeneity_score': -4.2374126293898904e-16, 'completeness_sco