In [76]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from spacy import displacy
from spacy.pipeline import EntityRuler
import spacy
import joblib
import pandas as pd
import mysql.connector as sql

from leventis.bhl.page import BHLPage
from leventis.preprocess import text_preprocessor
from leventis.helpers import nlp_add_or_replace_pipe

from leventis.components.entity_matcher import BiGramEntityMatcher
from leventis.components.sentenizer import Sentenizer
from leventis.components.expand_trait_entities import ExpandTraitEntities
from leventis.components.normalise_taxon_entities import NormaliseTaxonEntities
from leventis.components.abbreviated_names import AbbreviatedNames

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
# Root of the data directory, relative to the current working directory.
DATA_PATH = Path('../data')

# Directory holding the serialised model loaded further down.
MODEL_PATH = DATA_PATH.joinpath('models')

# Pattern file handed to the EntityRuler when the trait matcher is built.
trait_patterns_file = DATA_PATH.joinpath('trait_patterns.jsonl')

In [153]:
# Deserialise the model that is later handed to BiGramEntityMatcher.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model = joblib.load(MODEL_PATH.joinpath('bi-gram-bernoulli-naive-bayes-model.pkl'))

In [144]:
# Load the `en_core_sci_sm` model with its 'ner' component disabled, then
# register the project's custom components on it. Statement order matters:
# each registration positions itself relative to a pipe added before it.
# NOTE(review): nlp_add_or_replace_pipe presumably replaces an existing pipe of
# the same name so this cell can be re-run — verify in leventis.helpers.
nlp = spacy.load("en_core_sci_sm", disable=['ner'])

# Sentenizer is registered as 'sentenizer', positioned before the 'parser' pipe.
nlp_add_or_replace_pipe(nlp, Sentenizer(), 'sentenizer', before='parser')

# Entity matcher driven by the previously loaded `model`; no position is given.
nlp_add_or_replace_pipe(nlp, BiGramEntityMatcher(model), 'entity_matcher')

# Positioned after 'entity_matcher'.
nlp_add_or_replace_pipe(nlp, NormaliseTaxonEntities(), 'normalise_taxon_entities', after='entity_matcher')

# AbbreviatedNames receives the pipeline itself and is registered under its own
# `name` attribute, positioned after 'normalise_taxon_entities'.
nlp_add_or_replace_pipe(nlp, AbbreviatedNames(nlp), AbbreviatedNames.name, after='normalise_taxon_entities')

In [145]:
# Open a connection to the `pup` MySQL database. `sql` is the
# `mysql.connector` alias already imported in the notebook's first cell, so it
# is not re-imported here.
# NOTE(review): connects as `root` with no password argument — acceptable only
# for a local scratch database.
db_connection = sql.connect(database='pup', user='root')

In [146]:
# Read the whole `pup_traits` table over the open connection.
df = pd.read_sql('SELECT * FROM pup_traits', db_connection)

# Preview the first five rows.
df.head(5)

Unnamed: 0,trait_id,trait_term,trait_character,trait_category,pup_term,comment
0,2537,abaxial,tree,position,Growth form,
1,2538,absent,,quantity,Stem Pubescence,
2,2539,absent,,quantity,Leaf pubescence,
3,2540,acaulescent,caespitose,architecture,Growth form,
4,2541,acauline,caespitose,architecture,Growth form,


In [147]:
# Build an EntityRuler from the pattern file at `trait_patterns_file` and
# register it on the pipeline under the name 'trait_ner' (no position given).
# NOTE(review): presumably these patterns produce the 'TRAIT' entities that are
# filtered on further down — confirm against trait_patterns.jsonl.
trait_ner = EntityRuler(nlp).from_disk(trait_patterns_file)

nlp_add_or_replace_pipe(nlp, trait_ner, 'trait_ner')

In [148]:
# Register an ExpandTraitEntities component as 'expand_trait_entities',
# passing last=True to the helper.
# NOTE(review): presumably last=True places it at the end of the pipeline so it
# runs after 'trait_ner' — verify in leventis.helpers.
expand_trait_entities = ExpandTraitEntities()

nlp_add_or_replace_pipe(nlp, expand_trait_entities, 'expand_trait_entities', last=True)    


In [149]:
# NOTE(review): this opens a second connection and rebinds `db_connection`;
# the connection opened in an earlier cell is never explicitly closed.
db_connection = sql.connect(database='pup', user='root')

# One row per (page_id, trait_term, taxon): citations joined to their taxon
# name and to the traits linked to the cited page, restricted to taxa whose
# higher group is 'Dicot' and to items whose language is 'English'.
query = '''
SELECT bc.page_id, pt.trait_term, pn.pup_name AS taxon 
FROM bhl_citations bc 
    INNER JOIN pup_names_sp pn ON pn.pup_name_id = bc.pup_id 
    INNER JOIN trait_page tp ON tp.page_id = bc.page_id 
    INNER JOIN x_pup_traits pt ON tp.trait_id=pt.trait_id 
WHERE pn.pup_higher_group = 'Dicot' AND bc.item_language = 'English';
'''

# Rebinds `df`, replacing the `pup_traits` frame loaded earlier in the notebook.
df = pd.read_sql(query, con=db_connection)

# Preview the first rows.
df.head()

Unnamed: 0,page_id,trait_term,taxon
0,15987715,leaves,Ricinus communis
1,15989542,tree,Morisonia americana
2,15989542,shrub,Morisonia americana
3,15989551,leaf,Ricinus communis
4,15989579,tree,Melampyrum sylvaticum


In [150]:
# df.groupby('taxon')['page_id', 'trait_term'].describe()

In [151]:
# Fetch the text of a single page and run it through the project's text
# preprocessor before handing it to the pipeline.
# Page 15989551 is the page_id shown against "Ricinus communis" / "leaf" in the
# query preview earlier in the notebook.
bhl_page = BHLPage(15989551)

text = bhl_page.get_text()

normalised_text = text_preprocessor(text)

In [152]:
# Run the full pipeline over the preprocessed page text.
doc = nlp(normalised_text)

# Per-label highlight gradients for the entity visualiser.
colors = dict(
    TRAIT="linear-gradient(90deg, gold, yellow)",
    TAXON="linear-gradient(90deg, lime, aquamarine)",
)
options = dict(colors=colors)

# Render the recognised entities inline with the custom colours.
displacy.render(doc, style="ent", options=options)

# for ent in doc.ents:
#     print(ent._.get("taxon_name")) 


In [88]:
# for page_id in df[df.taxon == 'Acacia globulifera']['page_id'].unique():
    
#     bhl_page = BHLPage(page_id)

#     text = bhl_page.get_text()

#     normalised_text = text_preprocessor(text)
    
#     doc = nlp(normalised_text)
    
#     displacy.render(doc, style="ent", options=options)
    
#     break
    


In [154]:


# for page_id in df[df.taxon == 'Acacia globulifera']['page_id']:
#     print(page_id)

    



# Taxon entity currently treated as the subject. The sentence loop further down
# only reassigns it when a sentence contains TAXON entities, so its value
# carries over into sentences that mention no taxon.
taxon_subject = None   

def get_taxon_subject(sent):
    """Return the sentence's taxon entity when it is unambiguous.

    Args:
        sent: Object exposing ``.ents``; each entity must expose ``.label_``.

    Returns:
        The only entity labelled ``'TAXON'`` when exactly one is present,
        otherwise ``None``. When several are present, each candidate is
        printed followed by ``'Y'`` (debug output) and ``None`` is returned.
    """
    taxa = []
    for ent in sent.ents:
        if ent.label_ == 'TAXON':
            taxa.append(ent)

    taxon_count = len(taxa)
    if taxon_count == 1:
        return taxa[0]

    if taxon_count > 1:
        # Ambiguous sentence: no subject is chosen yet, the candidates are
        # only echoed for inspection.
        for candidate in taxa:
            print(candidate, 'Y', sep='\n')
#         # Try and identify the noun subjects in the sentence and 
#         # match to the entity taxa             
#         noun_subjects = [token for token in sent if token.dep_ == 'nsubj' and token.ent_type_ == 'TAXON']

#         # If we only have one noun subject, find the matching taxon entity and assign
#         # to the taxon subject             
#         if len(noun_subjects) == 1:
#             for taxon in taxa:
#                 if taxon.label == noun_subjects[0].ent_id:
#                     taxon_subject = taxon
#                     break        
    

    
    
    

        
# Walk the document sentence by sentence, tracking which taxon is currently the
# subject and printing the traits mentioned alongside it.
for sent in doc.sents:

    # Distinct TAXON entities mentioned in this sentence.
    taxa_ents = {ent for ent in sent.ents if ent.label_ == 'TAXON'}

    # If we have taxa in this sentence, update the subject: exactly one taxon
    # becomes the subject, several are ambiguous so the subject is cleared.
    # A sentence with no taxa leaves the previous subject in place.
    if taxa_ents:
        if len(taxa_ents) == 1:
            # A set cannot be indexed (`taxa_ents[0]` raises TypeError), so
            # take its sole member by iteration instead.
            taxon_subject = next(iter(taxa_ents))
        else:
            taxon_subject = None

    if taxon_subject:
        # Distinct TRAIT entities in this sentence, attributed to the subject.
        trait_ents = {ent for ent in sent.ents if ent.label_ == 'TRAIT'}

        print(taxon_subject)
        print(trait_ents)
            
            
        
#             print(len(tokens))
#             print(tokens)
#             print(list(ent.noun_chunks))
            
#             print(dir(ent))
    
#     taxon_subject = get_taxon_subject()
    
    

#     taxa = [ent for ent in sent.ents if ent.label_ == 'TAXON']
#     traits = [ent for ent in sent.ents if ent.label_ == 'TRAIT']

#     if taxa:
        
#         taxon_subject = None  
        
#         # Do we have more than one taxon in the sentence         
#         if len(taxa) > 1:
            
#             # Try and identify the noun subjects in the sentence and 
#             # match to the entity taxa             
#             noun_subjects = [token for token in sent if token.dep_ == 'nsubj' and token.ent_type_ == 'TAXON']
            
#             # If we only have one noun subject, find the matching taxon entity and assign
#             # to the taxon subject             
#             if len(noun_subjects) == 1:
#                 for taxon in taxa:
#                     if taxon.label == noun_subjects[0].ent_id:
#                         taxon_subject = taxon
#                         break

#         else:
#             taxon_subject = taxa[0]
            
#     if taxon_subject:
#         print(taxon_subject)
#     else:
#         print(taxa)
#         print('NOPE')

#     if traits:
        
#         print(traits)

#         for token in sent:

#             if token.ent_type_ == 'TRAIT':
                
#                 phrase = list(expand_trait_phrase(token, doc, -1))                
#                 phrase.append(token.text)
#                 phrase += expand_trait_phrase(token, doc, +1)
                
#                 print(phrase)
                
                
                
#                 while True:                
#                     next_token = doc[token.i + 1]
                    
                    
                    
#                 next_token = doc[token.i + 2]
#                 print(next_token.pos_)
#                 print(next_token.dep_)
                
                
                
#                 while True
                
                
                
#                 print(token.i)
#                 print(list(token.conjuncts))
#                 print(dir(token))

# #                 print(list(token.subtree))
                
#                 break
        
        
#         print(sent.text) 
        
#         for chunk in sent.noun_chunks:
#             print(chunk)
#             print('--')
            
            
# # # # #         for chunk in sent.noun_chunks:
# # # # #             print('--')
# # # # #             print(chunk)
# # # # #             print(list(chunk.rights))
            
            
#         break



In [130]:
# Render the document's entities again, this time without passing any custom
# colour options.
displacy.render(doc, style="ent")

In [991]:
# Scratch check: tuples support indexing.
x = 1, 2, 3
print(x[1])

2


In [992]:
# Tuples are immutable, so item assignment raises TypeError. The error is the
# point of this cell; catch and display it so the notebook still runs to
# completion under "Restart & Run All".
try:
    x[1] = 4
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'tuple' object does not support item assignment