In [93]:
from collections import Counter
from textblob import Word

from get_nice_text import *

In [6]:
# Load the corpus through the project helper pulled in by the star-import above.
# NOTE(review): later cells index `data` by integer and wrap each item in str();
# presumably one entry per chapter — confirm against get_nice_text.
data = get_nice_text()

`pip install spacy
python -m spacy download en_core_web_sm`

Example:

In [18]:
import spacy 
  
# Load spaCy's small English pipeline; `nlp` is read as a global by get_entities.
nlp = spacy.load('en_core_web_sm') 
  
# Only the first item of the corpus is used for this demonstration.
sentence = str(data[0])
  
doc = nlp(sentence) 
  
# Print each detected entity's text together with its label.
for ent in doc.ents: 
    print(ent.text, ent.label_) 

Buddha PERSON
Rahula PERSON
The Buddha Rahula GPE
Rahula The Buddha PERSON
Rahula GPE
The Buddha WORK_OF_ART
Rahula PERSON
Rahula PERSON
Gratified PERSON
Rahula PERSON


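Ultimately we want only persons (entities labelled `PERSON`); the function below first collects every entity type, de-duplicated.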

In [37]:
def get_entities(data, labels=None, nlp_model=None):
    """Collect the unique named entities found across all items of ``data``.

    Parameters
    ----------
    data : indexable sequence
        Items are read as ``data[i]`` for ``i in range(len(data))`` and
        converted with ``str()`` before being handed to the pipeline.
    labels : str or iterable of str, optional
        Entity label(s) to keep, e.g. ``"PERSON"``. ``None`` (the default)
        keeps every label, which is the original behaviour.
    nlp_model : callable, optional
        Pipeline called as ``nlp_model(text)``; the result must expose
        ``.ents`` whose elements have ``.text`` and ``.label_``. Defaults to
        the notebook-level ``nlp`` object.

    Returns
    -------
    list of [text, label] lists
        One entry per distinct (text, label) pair, in order of first
        appearance.
    """
    # Defer the global lookup so that empty input never touches `nlp`.
    run_pipeline = nlp_model if nlp_model is not None else (lambda text: nlp(text))

    if labels is None:
        wanted = None
    elif isinstance(labels, str):
        # A bare string would otherwise be treated as an iterable of characters.
        wanted = {labels}
    else:
        wanted = set(labels)

    seen = set()        # (text, label) pairs already emitted: O(1) membership test
    entities_list = []  # keeps first-appearance order and the list-of-lists shape

    # Index-based access mirrors how the rest of the notebook reads `data`.
    for i in range(len(data)):
        doc = run_pipeline(str(data[i]))

        for ent in doc.ents:
            if wanted is not None and ent.label_ not in wanted:
                continue
            key = (ent.text, ent.label_)
            if key in seen:
                continue
            seen.add(key)
            entities_list.append([ent.text, ent.label_])

    return entities_list
        

In [39]:
# Extract unique entities from the whole corpus (one pipeline call per item).
gp = get_entities(data)

In [106]:
# Preview the first 40 unique [text, label] pairs.
gp[:40]

[['Buddha', 'PERSON'],
 ['Rahula', 'PERSON'],
 ['The Buddha Rahula', 'GPE'],
 ['Rahula The Buddha', 'PERSON'],
 ['Rahula', 'GPE'],
 ['The Buddha', 'WORK_OF_ART'],
 ['Gratified', 'PERSON'],
 ['Kosambi', 'GPE'],
 ['Simsapa', 'PERSON'],
 ['Birth', 'PERSON'],
 ['five', 'CARDINAL'],
 ['one', 'CARDINAL'],
 ['two', 'CARDINAL'],
 ['three', 'CARDINAL'],
 ['twelve', 'CARDINAL'],
 ['four', 'CARDINAL'],
 ['Unprovoked', 'GPE'],
 ['Savatthi', 'ORG'],
 ['Monks', 'NORP'],
 ['The Buddha  Clinging', 'WORK_OF_ART'],
 ['MahaKotthita', 'ORG'],
 ['MahaKotthita Sariputta', 'PRODUCT'],
 ['earth', 'LOC'],
 ['Sariputta', 'PRODUCT'],
 ['the internal water property', 'ORG'],
 ['phlegm', 'PERSON'],
 ['the external water property', 'ORG'],
 ['windy', 'PERSON'],
 ['Sister Dhammadinna', 'PERSON'],
 ['six', 'CARDINAL'],
 ['Consciousness   Consciousness', 'ORG'],
 ['Dhamma', 'ORG'],
 ['First', 'ORDINAL'],
 ['Release', 'PRODUCT'],
 ['Dispassion', 'ORG'],
 ['Consciousness    Fabrications    ', 'ORG'],
 ['One', 'CARDINAL'

Sometimes it makes sense, sometimes it doesn't, but it is more or less accurate.

## Lemmatization - converting to root word

In [83]:
# First 500 characters of item 300, shown raw for comparison with the lemmatized text.
str(data[300])[:500]

'ing   We have thought of ourselves  perhaps  as creatures moving upon this earth  rather helpless  at the mercy of storm and hunger and our enemies  We are to think of ourselves as immortals  dwelling in the Light  encompassed and sustained by spiritual powers  The steady effort to hold this thought will awaken dormant and unrealized powers  which will unveil to us the nearness of the Eternal   '

In [84]:
# Lemmatize every whitespace-separated token of item 300 and re-join with single spaces.
# NOTE(review): the output turns 'as' into 'a' and 'us' into 'u'; lemmatize() is called
# without a part-of-speech tag, which presumably makes it treat every token as a noun —
# confirm against the TextBlob documentation.
tmp = " ".join([Word(word).lemmatize() for word in str(data[300]).split()])
tmp[:500]

'ing We have thought of ourselves perhaps a creature moving upon this earth rather helpless at the mercy of storm and hunger and our enemy We are to think of ourselves a immortal dwelling in the Light encompassed and sustained by spiritual power The steady effort to hold this thought will awaken dormant and unrealized power which will unveil to u the nearness of the Eternal'

## Stemming - getting "base" of word

In [62]:
# First 500 characters of item 500, shown raw for comparison with the stemmed text.
str(data[500])[:500]

' set before thy face      And put a knife to thy throat  if it be so that thou have thy soul in thy own power      Be not desirous of his meats  in which is the bread of deceit      Labour not to be rich  but set bounds to thy prudence      Lift not up thy eyes to riches which thou canst not have  because they shall make themselves wings like those of an eagle  and shall fly towards heaven      Eat not with an envious man  and desire not his meats      Because  like a soothsayer  and diviner  he'

In [61]:
# Stem every whitespace-separated token of item 500 and re-join with single spaces.
tmp = " ".join([Word(word).stem() for word in str(data[500]).split()])
tmp[:499]

'set befor thi face and put a knife to thi throat if it be so that thou have thi soul in thi own power Be not desir of hi meat in which is the bread of deceit labour not to be rich but set bound to thi prudenc lift not up thi eye to rich which thou canst not have becaus they shall make themselv wing like those of an eagl and shall fli toward heaven eat not with an enviou man and desir not hi meat becaus like a soothsay and divin he thinketh that which he knoweth not eat and drink will he say to '

In [66]:
# Same stemming, kept as a list of tokens instead of a joined string.
# NOTE: this rebinds `tmp` from a str to a list; a later cell reads `tmp` and
# relies on this cell having been the last one to assign it.
tmp = [Word(word).stem() for word in str(data[500]).split()]
tmp[:10]

['set', 'befor', 'thi', 'face', 'and', 'put', 'a', 'knife', 'to', 'thi']

### Why not extract the ending?

In [67]:
# Tokenise item 500 and stem those very tokens, so `base` and `stemmed` are
# guaranteed to line up index by index. `stemmed` is computed here instead of
# aliasing the scratch variable `tmp`, so the cell stays correct no matter
# which earlier cell last assigned `tmp` (it has held both a str and a list).
base = str(data[500]).split()
stemmed = [Word(word).stem() for word in base]
base[:10], stemmed[:10]

(['set', 'before', 'thy', 'face', 'And', 'put', 'a', 'knife', 'to', 'thy'],
 ['set', 'befor', 'thi', 'face', 'and', 'put', 'a', 'knife', 'to', 'thi'])

In [90]:
def get_ending(base, stemmed):
    """Return the trailing characters each stem lost relative to its word.

    ``base[i]`` and ``stemmed[i]`` are compared by length only: when the
    original word is longer than its stem, the surplus characters at the end
    of the word are recorded as the ending. Words whose stem is not shorter
    contribute nothing, so the result may be shorter than the inputs.
    """
    endings = []

    for position, word in enumerate(base):
        stem = stemmed[position]

        # Slicing from len(stem) keeps exactly the last
        # len(word) - len(stem) characters of the word.
        if len(word) > len(stem):
            endings.append(word[len(stem):])

    return endings

In [91]:
# First ten endings; words whose stem is not shorter than the word are skipped.
get_ending(base, stemmed)[:10]

['e', 'ous', 's', 's', 's', 'e', 's', 'es', 'e', 'es']

It returns just the endings; words with no ending are skipped, so the result can be shorter than the input.

In [100]:
def count_endings(data):
    """Count stemming endings separately for every item of ``data``.

    Each item is converted with ``str()``, split on whitespace, stemmed token
    by token, and the endings reported by ``get_ending`` are tallied.

    Returns a list with one ``Counter`` (ending -> occurrences) per item, in
    the same order as ``data``.
    """

    def tally(chapter):
        # Tokens and stems come from the same split, so they align by index.
        tokens = str(chapter).split()
        stems = [Word(token).stem() for token in tokens]
        return Counter(get_ending(tokens, stems))

    return [tally(data[position]) for position in range(len(data))]

In [105]:
# One Counter of endings per corpus item; preview the first five.
list_of_endings = count_endings(data)
list_of_endings[:5]

[Counter({'ion': 30,
          's': 51,
          'ed': 19,
          'er': 1,
          'lful': 5,
          'ful': 17,
          'ences': 8,
          'ely': 1,
          'e': 9,
          'ing': 7,
          'ng': 3,
          'eable': 1,
          'ly': 2,
          'es': 2,
          'ated': 1,
          'atives': 3}),
 Counter({'e': 11,
          'ed': 5,
          's': 20,
          'ing': 8,
          'es': 3,
          'ous': 3,
          'ment': 2,
          'ion': 8,
          'ation': 3}),
 Counter({'e': 16,
          's': 36,
          'ity': 4,
          'ion': 11,
          'ful': 8,
          'ng': 5,
          'ation': 4,
          'd': 2,
          'ting': 1,
          'ed': 5,
          'ing': 3,
          'ates': 1,
          'es': 2,
          'ered': 1,
          'ment': 1,
          'fulness': 1,
          'ative': 1,
          'ions': 1}),
 Counter({'e': 31,
          'ing': 9,
          'ment': 1,
          'ation': 4,
          's': 30,
          'ed': 11,
   