In [92]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from get_nice_text import *
import pandas as pd
import re
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from textblob import Word
import spacy 
nlp = spacy.load('en_core_web_sm') 
  

In [3]:
data = get_nice_text()

In [53]:
stemmed = [0 for i in range(len(data))]
for i in range(len(data)):
    tmp = " ".join([Word(word).stem() for word in str(data[i]).split()])
    stemmed[i] = tmp

In [55]:
len(stemmed)

590

In [56]:
cv = CountVectorizer()
df_count = cv.fit_transform(stemmed)

In [57]:
count_vect_df = pd.DataFrame(df_count.todense(), columns=cv.get_feature_names())

In [58]:
count_vect_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,yoga,yoke,you,young,your,yourself,yourselv,youth,zeal,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,27,0,0,2,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
def get_ending(base, stemmed): 
    """
    returns list of endings, so it might be different length from original vectors (when there is no ending)
    """


    out = []

    for i in range(len(base)): 
        st = stemmed[i]
        bs = base[i]
        diff = len(bs) - len(st)

        if diff > 0 : 
            out.append(base[i][-diff:])

    return(out)

def count_endings(data):
    
    n = len(data)
    
    dicts = [0 for i in range(n)]
    for i in range(n):
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()
        
        endings = get_ending(base, stemmed)
        
        # makes dictionary
        counted = Counter(endings)
        dicts[i] = counted
        
    return(dicts)

In [215]:
def endings(data):
    n = len(data)

    endings = [0 for i in range(n)] 

    for i in range(n): 
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()

        ending = get_ending(base, stemmed)
        tmp = " ".join(ending)

        endings[i] = tmp
    
    return endings


In [216]:
endings = endings(data)

In [217]:
cv = CountVectorizer()
endings_count = cv.fit_transform(endings)
count_vect_endings = pd.DataFrame(endings_count.todense(), columns=cv.get_feature_names())

In [218]:
count_vect_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,yoga,yoke,you,young,your,yourself,yourselv,youth,zeal,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,27,0,0,2,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
whole_df = pd.concat([count_vect_df ,count_vect_endings], axis = 1) 
whole_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,ous,ously,ousness,ped,ping,pings,red,ring,ted,ting
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
586,0,0,0,0,0,0,1,0,0,0,...,3,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [64]:
from scipy import sparse

In [193]:
df = sparse.csr_matrix(whole_df)

In [194]:
trans = TfidfTransformer()
x = trans.fit_transform(df)

In [203]:
model = KMeans(n_clusters=8)
lab = model.fit_predict(x)

In [204]:
lab

array([0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, 0, 0, 4, 4, 4, 4,
       0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4,
       6, 0, 0, 6, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 6, 2, 3, 0,
       3, 3, 3, 7, 3, 3, 7, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 4, 7, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 4, 4,
       3, 4, 3, 3, 3, 4, 4, 3, 3, 4, 2, 3, 3, 0, 0, 3, 3, 3, 3, 3, 2, 4,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 4, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 7, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3,
       3, 3, 2, 2, 2, 2, 2, 2, 2, 6, 2, 3, 2, 2, 2,

In [205]:
df_lab = pd.read_csv('./AllBooks_baseline_DTM_Labelled.csv')
names = list(df_lab.iloc[:,0])
labels = [re.match("^[a-zA-Z]+", n).group(0) for n in names]

In [206]:
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
print("homogeneity_score = {}".format(homogeneity_score(labels, lab)))
print("completeness_score = {}".format(completeness_score(labels, lab)))
print("v_measure_score = {}".format(v_measure_score(labels, lab)))

homogeneity_score = 0.6580398188788985
completeness_score = 0.6159162262816389
v_measure_score = 0.6362816103845573


In [111]:
def get_entities(data):

    def unique(arr): 
    
        un = []

        for elem in arr: 
            if  elem not in un : 
                un.append(elem)

        return un

    entities_list = [[] for i in range(590)] # number of chapters
    for i in range(len(data)):
        chapter = str(data[i])
        doc = nlp(chapter) 

        entities_in_chapter = []
        for ent in doc.ents: 
            entities_in_chapter.append(ent.text)
            
        for j in range(len(entities_in_chapter)) : 
            ent = entities_in_chapter[j]
            ent = "".join(ent.split())
            entities_in_chapter[j] = ent
        
        
        entities_list[i] =  " ".join(entities_in_chapter)
        
            
    out = entities_list
    
    return out
        

In [112]:
ge = get_entities(data)

In [115]:
entities_count = cv.fit_transform(ge)
count_vect_entities = pd.DataFrame(entities_count.todense(), columns=cv.get_feature_names())

if some entity already is in matrix we won't add it 

In [132]:
nice_cols = []
for i in range(len(count_vect_entities.columns)):
    nice_cols.append(count_vect_entities.columns[i] not in list(whole_df.columns))

In [137]:
count_vect_entities.loc[:,nice_cols]

Unnamed: 0,absolute,absolutetruth,achiketas,aday,adorethee,afewyears,ages,ahundred,ahundredyears,allday,...,vinepower,virtue,vomens,whatsoever,windy,wisdomchapter,workmaster,works,yamas,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
586,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


words are glued together, because words are just numbers!!!

In [138]:
absolute_whole_df = pd.concat([whole_df ,count_vect_entities], axis = 1) 
absolute_whole_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,works,xi,xii,yama,yamas,yea,years,yesterday,yoga,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [213]:
df = sparse.csr_matrix(absolute_whole_df)
x = trans.fit_transform(df)
model = KMeans(n_clusters=8)
lab2 = model.fit_predict(x)
lab2

array([5, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 7, 7, 4, 2, 4, 4,
       4, 2, 4, 4, 4, 4, 7, 7, 4, 4, 7, 7, 4, 5, 2, 4, 2, 4, 2, 2, 4, 2,
       7, 4, 5, 2, 5, 5, 5, 5, 5, 2, 5, 5, 4, 2, 5, 5, 5, 5, 5, 4, 4, 5,
       5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4,
       2, 5, 4, 5, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 2, 5, 2, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 4, 5, 5, 2, 5, 5, 2, 4, 2, 4, 2,
       2, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 2, 4, 4, 2, 5, 5, 4, 4, 4, 6, 6,
       6, 6, 6, 2, 4, 6, 4, 4, 2, 6, 6, 6, 4, 4, 4, 6, 4, 4, 4, 4, 4, 2,
       4, 2, 2, 4, 2, 4, 2, 2, 2, 6, 2, 4, 2, 5, 5, 2, 5, 2, 2, 4, 2, 2,
       2, 2, 4, 4, 5, 5, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 4,
       4, 4, 4, 2, 2, 4, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 2, 5, 2, 4,
       2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5,
       2, 5, 5, 2, 2, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 2, 2, 4, 4,
       4, 4, 3, 3, 3, 3, 3, 3, 2, 4, 2, 4, 3, 3, 3,

In [214]:
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
print("homogeneity_score = {}".format(homogeneity_score(labels, lab2)))
print("completeness_score = {}".format(completeness_score(labels, lab2)))
print("v_measure_score = {}".format(v_measure_score(labels, lab2)))

homogeneity_score = 0.5160734645653857
completeness_score = 0.4769885439429387
v_measure_score = 0.4957618523749369
