In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from get_nice_text import *

import pandas as pd
import re
import numpy as np

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering, DBSCAN, MiniBatchKMeans, MeanShift, SpectralClustering
from sklearn.mixture import GaussianMixture
import sys

In [2]:
df = get_nice_text()
labels = get_labels(True)

In [3]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)


In [4]:
def train_fast(df, model, labels):
    """
    Changed train_fast, now takes model as initiated object
    
    
    Trenuje Model na sparse matrix df
    @example
    train fast(df, KMeans, n_clusters=8)
    """
    trans = TfidfTransformer()
    df_trans = trans.fit_transform(df)
    lab = model.fit_predict(df_trans.toarray())
    
    ret = {}
    
    ret["homogeneity_score"] = homogeneity_score(labels, lab)
    ret["completeness_score"] = completeness_score(labels, lab)
    ret["v_measure_score"] = v_measure_score(labels, lab)
    
    return lab, ret

In [5]:
results_table = pd.DataFrame(columns= ["name","homogeneity_score","completeness_score","v_measure_score"])

def fast_GridSearch(method, parameter_dict, df, labels, name_of_method, results_table, **kwargs):
    
    """
    Parameters
    ============================================================
    Method: 
        initiated method object
    
    perameter_dict: 
        dictionary containing name of parameters as keys and as values list of values to check. Number of keys should be 2. 
    
    df:
        DataFrame with data
    
    labels: 
        labels array
    
    name_of_method: 
        string, name of method
    
    results_table: 
        results table is meant to be dataframe where results will be added

    Return
    ===============================================================
    
    Output: 
        DataFrame
    """
    
    keys = list(parameter_dict.keys())
    assert(len(keys) == 2)
    
    results = pd.DataFrame(columns= ["name","homogeneity_score","completeness_score","v_measure_score"])
    
    bound = len(parameter_dict.get(keys[0])) * len(parameter_dict.get(keys[1])) 
    i = 1
    
    print(f'Starting {name_of_method}' )
    
    for par1 in parameter_dict.get(keys[0]):
        for par2 in parameter_dict.get(keys[1]):
            
            try : 
                sys.stdout.write('\r')
                print(f'progres: {i}/{bound}', end = '')


                tmp_params = {keys[0] : par1, 
                              keys[1] : par2}

                # setting additional parameters
                met =  method(**tmp_params, **kwargs)
                _ , ret = train_fast(df, met, labels)

                row = {"name" : [name_of_method],
                       "param1" : [par1],
                       "param2" : [par2],
                       "homogeneity_score"  : [ret.get("homogeneity_score")],
                       "completeness_score" : [ret.get("completeness_score")],
                       "v_measure_score"    : [ret.get("v_measure_score")]}

                row = pd.DataFrame(data = row)
                results_table = pd.concat([results_table, row])
            
            except :
                pass
            i += 1
       
            
            results_table = results_table.reset_index(drop = True)
    
    print(f'\nEnded {name_of_method}')
    return results_table

In [7]:
%%time

results_table = pd.DataFrame(columns= ["name","homogeneity_score","completeness_score","v_measure_score"])


spec_d = {'degree': [1, 2, 3, 4, 5],
          'gamma': [0.5, 0.8, 1, 1.3, 1.6]}

results_table = fast_GridSearch(SpectralClustering,
                                spec_d, df_count,
                                get_labels(True),
                                "SpectralClustering",
                                results_table,
                                random_state = 42,
                                n_init = 20)


kmeans_d = {'init'  : ['k-means++', 'random'], 
            'n_init': [20]} # not many to tune

results_table = fast_GridSearch(KMeans,
                                kmeans_d,
                                df_count,
                                get_labels(True),
                                "KMeans",
                                results_table, 
                                random_state = 42,
                                n_clusters = 5)

agg_d = {'affinity' : ["euclidean", "l1", "l2", "manhattan", "cosine"],
         'linkage'  : ["ward", "complete", "average", "single"]}

results_table = fast_GridSearch(AgglomerativeClustering,
                                agg_d,
                                df_count,
                                get_labels(True),
                                "AgglomerativeClustering",
                                results_table,
                                n_clusters = 5)

mbkmeans_d = {'init'  : ['k-means++', 'random'], 
            'reassignment_ratio': [0.001, 0.005,0.01,0.05,0.1,0.2,0.3] } 

results_table = fast_GridSearch(MiniBatchKMeans,
                                mbkmeans_d,
                                df_count,
                                get_labels(True),
                                "MiniBatchKMeans",
                                results_table,
                                random_state = 42,
                                n_clusters = 5)

gaussm_d = {'covariance_type' : ['full', 'tied', 'diag', 'spherical'], 
            'warm_start'      : [True, False]}

results_table = fast_GridSearch(GaussianMixture,
                                gaussm_d,
                                df_count,
                                get_labels(True),
                                "GaussianMixture", 
                                results_table, 
                                random_state = 42,
                                n_components = 5)


results_table

Starting SpectralClustering
progres: 25/25
Ended SpectralClustering
Starting KMeans
progres: 2/2
Ended KMeans
Starting AgglomerativeClustering
progres: 20/20
Ended AgglomerativeClustering
Starting MiniBatchKMeans
progres: 14/14
Ended MiniBatchKMeans
Starting GaussianMixture
progres: 8/8
Ended GaussianMixture
CPU times: user 20min 14s, sys: 19.9 s, total: 20min 34s
Wall time: 10min 47s


Unnamed: 0,name,homogeneity_score,completeness_score,v_measure_score,param1,param2
0,SpectralClustering,0.517659,0.440622,0.476044,1,0.5
1,SpectralClustering,0.523090,0.444914,0.480845,1,0.8
2,SpectralClustering,0.522113,0.469818,0.494587,1,1
3,SpectralClustering,0.505981,0.472994,0.488932,1,1.3
4,SpectralClustering,0.497955,0.474987,0.486200,1,1.6
...,...,...,...,...,...,...
57,GaussianMixture,0.482083,0.460225,0.470900,tied,False
58,GaussianMixture,0.485382,0.462491,0.473660,diag,True
59,GaussianMixture,0.485382,0.462491,0.473660,diag,False
60,GaussianMixture,0.501183,0.481117,0.490945,spherical,True
