In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from get_nice_text import *
import pandas as pd
import re
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from textblob import Word
import spacy 
nlp = spacy.load('en_core_web_sm') 
  

In [2]:
data = get_nice_text()

In [3]:
stemmed = [0 for i in range(len(data))]
for i in range(len(data)):
    tmp = " ".join([Word(word).stem() for word in str(data[i]).split()])
    stemmed[i] = tmp

In [4]:
len(stemmed)

590

In [6]:
cv = CountVectorizer(stop_words = 'english')
df_count = cv.fit_transform(stemmed)

In [7]:
count_vect_df = pd.DataFrame(df_count.todense(), columns=cv.get_feature_names())

In [8]:
count_vect_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,yesterday,yield,yieldeth,yoga,yoke,young,yourselv,youth,zeal,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def get_ending(base, stemmed): 
    """
    returns list of endings, so it might be different length from original vectors (when there is no ending)
    """


    out = []

    for i in range(len(base)): 
        st = stemmed[i]
        bs = base[i]
        diff = len(bs) - len(st)

        if diff > 0 : 
            out.append(base[i][-diff:])

    return(out)

def count_endings(data):
    
    n = len(data)
    
    dicts = [0 for i in range(n)]
    for i in range(n):
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()
        
        endings = get_ending(base, stemmed)
        
        # makes dictionary
        counted = Counter(endings)
        dicts[i] = counted
        
    return(dicts)

In [10]:
def endings(data):
    n = len(data)

    endings = [0 for i in range(n)] 

    for i in range(n): 
        chapter = data[i]
        stemmed = [Word(word).stem() for word in str(chapter).split()]
        base = str(chapter).split()

        ending = get_ending(base, stemmed)
        tmp = " ".join(ending)

        endings[i] = tmp
    
    return endings


In [11]:
endings = endings(data)

In [12]:
cv = CountVectorizer(stop_words='english')
endings_count = cv.fit_transform(endings)
count_vect_endings = pd.DataFrame(endings_count.todense(), columns=cv.get_feature_names())

In [13]:
count_vect_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,yesterday,yield,yieldeth,yoga,yoke,young,yourselv,youth,zeal,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
whole_df = pd.concat([count_vect_df ,count_vect_endings], axis = 1) 
whole_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,ous,ously,ousness,ped,ping,pings,red,ring,ted,ting
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
586,0,0,0,0,0,0,1,0,0,0,...,3,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
from scipy import sparse

In [16]:
df = sparse.csr_matrix(whole_df)

In [17]:
trans = TfidfTransformer()
x = trans.fit_transform(df)

In [18]:
model = KMeans(n_clusters=8)
lab = model.fit_predict(x)

In [19]:
lab

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1,
       1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 4, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 2, 4, 6, 2, 4, 4, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2, 3, 2,
       3, 2, 2, 6, 4, 6, 6, 6, 2, 2, 2, 5, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0,
       0, 4, 4, 0, 4, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 4, 4, 0, 4,
       4, 4, 2, 3, 3, 5, 4, 2, 2, 1, 4, 6, 4, 2, 2, 3, 2, 3, 2, 3, 3, 6,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 4, 6, 3, 2, 6, 3, 1, 4, 2, 4,
       2, 2, 2, 2, 2, 2, 2, 4, 3, 3, 2, 6, 2, 6, 3, 3, 2, 2, 2, 6, 6, 2,
       4, 4, 4, 3, 3, 3, 3, 3, 2, 4, 2, 6, 2, 4, 2, 4, 5, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6,
       3, 6, 5, 5, 5, 5, 2, 5, 2, 2, 2, 2, 5, 5, 5,

In [21]:
#df_lab = pd.read_csv('./AllBooks_baseline_DTM_Labelled.csv')
#names = list(df_lab.iloc[:,0])
#labels = [re.match("^[a-zA-Z]+", n).group(0) for n in names]
labels = get_labels(merge_Bible=False)

In [22]:
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
print("homogeneity_score = {}".format(homogeneity_score(labels, lab)))
print("completeness_score = {}".format(completeness_score(labels, lab)))
print("v_measure_score = {}".format(v_measure_score(labels, lab)))

homogeneity_score = 0.5138321529830332
completeness_score = 0.46371963904316915
v_measure_score = 0.48749143002681933


In [23]:
def get_entities(data):

    def unique(arr): 
    
        un = []

        for elem in arr: 
            if  elem not in un : 
                un.append(elem)

        return un

    entities_list = [[] for i in range(590)] # number of chapters
    for i in range(len(data)):
        chapter = str(data[i])
        doc = nlp(chapter) 

        entities_in_chapter = []
        for ent in doc.ents: 
            entities_in_chapter.append(ent.text)
            
        for j in range(len(entities_in_chapter)) : 
            ent = entities_in_chapter[j]
            ent = "".join(ent.split())
            entities_in_chapter[j] = ent
        
        
        entities_list[i] =  " ".join(entities_in_chapter)
        
            
    out = entities_list
    
    return out
        

In [24]:
ge = get_entities(data)

In [25]:
entities_count = cv.fit_transform(ge)
count_vect_entities = pd.DataFrame(entities_count.todense(), columns=cv.get_feature_names())

if some entity already is in matrix we won't add it 

In [26]:
nice_cols = []
for i in range(len(count_vect_entities.columns)):
    nice_cols.append(count_vect_entities.columns[i] not in list(whole_df.columns))

In [27]:
count_vect_entities.loc[:,nice_cols]

Unnamed: 0,absolute,absolutetruth,achiketas,aday,adorethee,afewyears,ages,ahundred,ahundredyears,allday,...,vinepower,virtue,vomens,whatsoever,windy,wisdomchapter,workmaster,works,yamas,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
586,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


words are glued together, because words are just numbers!!!

In [28]:
absolute_whole_df = pd.concat([whole_df ,count_vect_entities], axis = 1) 
absolute_whole_df

Unnamed: 0,aac,aaron,abandon,abas,abash,abat,abateth,aberr,abhor,abhorreth,...,works,xi,xii,yama,yamas,yea,years,yesterday,yoga,zorobabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
586,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
df = sparse.csr_matrix(absolute_whole_df)
x = trans.fit_transform(df)
model = KMeans(n_clusters=8)
lab2 = model.fit_predict(x)
lab2

array([6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 3, 6, 6, 0, 6, 3, 6, 6, 0, 0, 6, 6,
       6, 3, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 4, 0, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 1, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 1, 1, 4, 1,
       4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 1,
       0, 0, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 0, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 7, 7, 0, 7, 7, 7, 7, 7, 7, 1, 1, 1, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 7, 0, 7, 0, 0, 0, 7, 7,
       1, 0, 0, 0, 0, 7, 7, 0, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 1, 0, 7,
       0, 0, 0, 0, 7, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 3, 3, 3, 3, 0, 3, 3, 0, 0, 0, 3, 3, 3,

In [30]:
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
print("homogeneity_score = {}".format(homogeneity_score(labels, lab2)))
print("completeness_score = {}".format(completeness_score(labels, lab2)))
print("v_measure_score = {}".format(v_measure_score(labels, lab2)))

homogeneity_score = 0.6200980023387109
completeness_score = 0.5676547551045505
v_measure_score = 0.5927186065492539
