# Pierwsze modele

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from get_nice_text import *

import pandas as pd
import re
import numpy as np

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering, DBSCAN, MiniBatchKMeans, MeanShift, SpectralClustering
from sklearn.mixture import GaussianMixture

In [2]:
def train_fast(df, Model, labels, **kwargs):
    """
    Trenuje Model na sparse matrix df
    @example
    train fast(df, KMeans, n_clusters=8)
    """
    trans = TfidfTransformer()
    df_trans = trans.fit_transform(df)
    model = Model(**kwargs)
    lab = model.fit_predict(df_trans)
    
    ret = {}
    
    ret["homogeneity_score"] = homogeneity_score(labels, lab)
    ret["completeness_score"] = completeness_score(labels, lab)
    ret["v_measure_score"] = v_measure_score(labels, lab)
    
    return lab, ret

In [3]:
df = get_nice_text()
labels = get_labels(True)
np.random.seed(123)

Naiwne podejście - sam wordbag dajmy do pogrupowania, Biblię mergujemy

In [37]:
cv = CountVectorizer()
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.3320610459866629,
 'completeness_score': 0.4986041642194423,
 'v_measure_score': 0.39863718443904317}

Bardzo słabo. Dodajmy TfidfTransformer.

In [38]:
cv = CountVectorizer()
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(True), n_clusters=5, random_state = 123)
scores

{'homogeneity_score': 0.5504130721916395,
 'completeness_score': 0.5575716186373115,
 'v_measure_score': 0.5539692201909778}

Lepiej, przekroczyliśmy losowość 0.5. Weżmy pod uwagę stopwords

In [41]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(True), n_clusters=5, random_state = 123)
scores

{'homogeneity_score': 0.5499682967296581,
 'completeness_score': 0.6450909053251002,
 'v_measure_score': 0.5937438845329811}

Zobaczmy jeszcze czy mergowanie Bibli coś daje

In [28]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(False), n_clusters=8, random_state = 123)
scores

{'homogeneity_score': 0.5073841435249417,
 'completeness_score': 0.4465988909388149,
 'v_measure_score': 0.475054980208431}

Jak widać tak

# Entities
Jako vocabulary dodajmy tylko enitities

In [71]:
def get_entities(data):

    entities_list = []
    
    for i in range(len(data)):
        
        chapter = str(data[i])
        doc = nlp(chapter) 

        for ent in doc.ents: 
            entities_list.append(ent.text)
    
    return entities_list

In [87]:
import spacy 
nlp = spacy.load('en_core_web_sm') 
ent = get_entities(df)

In [96]:
from collections import defaultdict 
  
LABELS_DICT = defaultdict(set) 

for word in df:
    doc = nlp(str(word))

    for e in doc.ents:
        LABELS_DICT[e.label_].add(e.text)

In [110]:
LABELS_DICT.keys()
all_nlp = []
for key in LABELS_DICT.keys():
    all_nlp += list(LABELS_DICT[key])
    
#remove duplicates
all_nlp = list(set(all_nlp))

In [111]:
cv = CountVectorizer(vocabulary=all_nlp)
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.11897558584368266,
 'completeness_score': 0.22483275054993732,
 'v_measure_score': 0.15560767661491615}

Słabo, dodajmy stop_words

In [112]:
cv = CountVectorizer(vocabulary=all_nlp, stop_words='english')
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.14612903898906285,
 'completeness_score': 0.39871032617470714,
 'v_measure_score': 0.21387278718898287}

Niewiele lepiej

## Inne modele
### AgglomerativeClustering
z argumentem linkage{“ward”, “complete”, “average”, “single”}

In [6]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

In [8]:
def train_fast(df, Model, labels, **kwargs):
    """
    Trenuje Model na sparse matrix df
    @example
    train fast(df, KMeans, n_clusters=8)
    """
    trans = TfidfTransformer()
    df_trans = trans.fit_transform(df)
    model = Model(**kwargs)
    lab = model.fit_predict(df_trans.toarray())
    
    ret = {}
    
    ret["homogeneity_score"] = homogeneity_score(labels, lab)
    ret["completeness_score"] = completeness_score(labels, lab)
    ret["v_measure_score"] = v_measure_score(labels, lab)
    
    return lab, ret

In [12]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5)
scores

{'homogeneity_score': 0.572923993295591,
 'completeness_score': 0.6047451991605944,
 'v_measure_score': 0.5884046838430234}

In [11]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5, linkage =  "complete")
scores

{'homogeneity_score': 0.3970246607100051,
 'completeness_score': 0.4687726141023961,
 'v_measure_score': 0.42992578858479946}

In [10]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5, linkage =  "average")
scores

{'homogeneity_score': 0.20293471924776588,
 'completeness_score': 0.6412603791279626,
 'v_measure_score': 0.3083031286332579}

In [9]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(True), n_clusters=5, linkage =  "single")
scores

{'homogeneity_score': 0.007543664124885699,
 'completeness_score': 0.22724173007689247,
 'v_measure_score': 0.014602571788471418}

Coraz gorzej

### DBSCAN

In [51]:
lab, scores = train_fast(df_count, DBSCAN, get_labels(True), eps = 0.4, metric = 'manhattan')
scores

{'homogeneity_score': -4.2374126293898904e-16,
 'completeness_score': 1.0,
 'v_measure_score': -8.474825258779785e-16}

In [48]:
lab

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.

## no dobra, a jak będziemy grupować po 8?
### AgglomerativeClustering

In [33]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8)
scores

{'homogeneity_score': 0.5594447643120913,
 'completeness_score': 0.5330019854326977,
 'v_measure_score': 0.5459033498665863}

In [34]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8, linkage =  "complete")
scores

{'homogeneity_score': 0.40259838528734326,
 'completeness_score': 0.35210201942854397,
 'v_measure_score': 0.37566086778954255}

In [35]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8, linkage =  "average")
scores

{'homogeneity_score': 0.2370086420254117,
 'completeness_score': 0.5045610676796211,
 'v_measure_score': 0.32251946622039124}

In [36]:
lab, scores = train_fast(df_count, AgglomerativeClustering, get_labels(False), n_clusters=8, linkage =  "single")
scores

{'homogeneity_score': 0.02146117023707067,
 'completeness_score': 0.4280269739509705,
 'v_measure_score': 0.040872979066504986}

Still bad, ale minimalnie lepiej niż po 5

### GaussianMixture

In [15]:
lab, scores = train_fast(df_count, GaussianMixture, get_labels(False), n_components=8)
scores

KeyboardInterrupt: 

Niestety nie zbiega

### MiniBatchKMeans

In [41]:
lab, scores = train_fast(df_count, MiniBatchKMeans, get_labels(False), n_clusters=8)
scores

{'homogeneity_score': 0.5986593612799581,
 'completeness_score': 0.5945854116019115,
 'v_measure_score': 0.5966154318468916}

In [37]:
lab, scores = train_fast(df_count, MiniBatchKMeans, get_labels(True), n_clusters=5)
scores

{'homogeneity_score': 0.5265192377286467,
 'completeness_score': 0.5382273166930658,
 'v_measure_score': 0.5323089055007681}