In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from get_nice_text import *

import pandas as pd
import re
import numpy as np

from sklearn.cluster import KMeans

In [16]:
def train_fast(df, Model, labels, **kwargs):
    """
    Trenuje Model na sparse matrix df
    @example
    train fast(df, KMeans, n_clusters=8)
    """
    trans = TfidfTransformer()
    df_trans = trans.fit_transform(df)
    model = Model(**kwargs)
    lab = model.fit_predict(df_trans)
    
    ret = {}
    
    ret["homogeneity_score"] = homogeneity_score(labels, lab)
    ret["completeness_score"] = completeness_score(labels, lab)
    ret["v_measure_score"] = v_measure_score(labels, lab)
    
    return lab, ret

In [25]:
df = get_nice_text()
labels = get_labels(True)
np.random.seed(123)

Naiwne podejście - sam wordbag dajmy do pogrupowania, Biblię mergujemy

In [37]:
cv = CountVectorizer()
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.3320610459866629,
 'completeness_score': 0.4986041642194423,
 'v_measure_score': 0.39863718443904317}

Bardzo słabo. Dodajmy TfidfTransformer.

In [38]:
cv = CountVectorizer()
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(True), n_clusters=5, random_state = 123)
scores

{'homogeneity_score': 0.5504130721916395,
 'completeness_score': 0.5575716186373115,
 'v_measure_score': 0.5539692201909778}

Lepiej, przekroczyliśmy losowość 0.5. Weżmy pod uwagę stopwords

In [41]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(df)

lab, scores = train_fast(df_count, KMeans, get_labels(True), n_clusters=5, random_state = 123)
scores

{'homogeneity_score': 0.5499682967296581,
 'completeness_score': 0.6450909053251002,
 'v_measure_score': 0.5937438845329811}

# Entities

In [71]:
def get_entities(data):

    entities_list = []
    
    for i in range(len(data)):
        
        chapter = str(data[i])
        doc = nlp(chapter) 

        for ent in doc.ents: 
            entities_list.append(ent.text)
    
    return entities_list

In [87]:
import spacy 
nlp = spacy.load('en_core_web_sm') 
ent = get_entities(df)

In [61]:
vocab = [e[0] for e in ent]

In [66]:
vocab = list(set(vocab))

In [95]:
nlp(df[0]

TypeError: Argument 'string' has incorrect type (expected str, got numpy.str_)

In [96]:
from collections import defaultdict 
  
LABELS_DICT = defaultdict(set) 

for word in df:
    doc = nlp(str(word))

    for e in doc.ents:
        LABELS_DICT[e.label_].add(e.text)

In [110]:
LABELS_DICT.keys()
all_nlp = []
for key in LABELS_DICT.keys():
    all_nlp += list(LABELS_DICT[key])
    
#remove duplicates
all_nlp = list(set(all_nlp))

In [111]:
cv = CountVectorizer(vocabulary=all_nlp)
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.11897558584368266,
 'completeness_score': 0.22483275054993732,
 'v_measure_score': 0.15560767661491615}

Słabo, dodajmy stop_words

In [112]:
cv = CountVectorizer(vocabulary=all_nlp, stop_words='english')
df_count = cv.fit_transform(df)

model = KMeans(n_clusters=5, random_state=123)

lab = model.fit_predict(df_count)
ret = {}
    
ret["homogeneity_score"] = homogeneity_score(labels, lab)
ret["completeness_score"] = completeness_score(labels, lab)
ret["v_measure_score"] = v_measure_score(labels, lab)
ret

{'homogeneity_score': 0.14612903898906285,
 'completeness_score': 0.39871032617470714,
 'v_measure_score': 0.21387278718898287}

In [47]:
#pd.Series(labels).astype('category').cat.codes.values