In [None]:
# import re
# import csv
# import sys
# import json
# import math
# import spacy
# import textacy
# import numpy as np
# import pandas as pd
# import matplotlib.pylab as plt
# from taxonerd import TaxoNERD
# from fastcoref import spacy_component
# from spacy.matcher import Matcher, DependencyMatcher, PhraseMatcher
%run "Helper.ipynb"
%run "Base.ipynb"
%run "Entity.ipynb"
%run "Species.ipynb"
%run "Keywords.ipynb"

In [None]:
# Notebook-wide verbosity setting.
# NOTE(review): nothing in this notebook's visible code reads this value;
# presumably the notebooks loaded via %run consult it — confirm before removing.
VERBOSE_LEVEL = 0

In [18]:
class Main(Base):
    """Owns the spaCy pipeline and the per-document lookup tables.

    A document is parsed with ``en_core_web_trf`` plus the ``fastcoref``
    component.  After each parse, several maps are rebuilt so that the helper
    objects (``parts``, ``species``, ``trait``, ``cause``, ``change``) can
    translate between character offsets, tokens, entities, noun chunks and
    coreference antecedents.
    """

    def __init__(self):
        """Load the spaCy pipeline and create the helper objects."""
        self.sp_nlp = spacy.load("en_core_web_trf")
        self.sp_nlp.add_pipe(
            "fastcoref",
            config={
                'model_architecture': 'LingMessCoref',
                'model_path': 'biu-nlp/lingmess-coref',
                'device': 'cpu',
            },
        )
        self.sp_doc = None
        # NOTE(review): this instance is passed explicitly to Base.__init__,
        # mirroring how the helpers below receive ``self``; presumably Base
        # expects a "main" back-reference — confirm against Base.ipynb.
        super().__init__(self)

        # Maps Character Position to Token in Document
        # Used to handle differences between different
        # pipelines and tools.
        self.index_map = None
        self.ent_map = None
        self.noun_chunk_map = None
        self.coref_map = None

        # Helpers
        self.parts = Parts(self)
        self.species = Species(self)
        self.trait = TraitKeywords(self)
        self.cause = CauseKeywords(self)
        self.change = ChangeKeywords(self)

    def update_doc(self, doc, verbose=False):
        """Adopt an already-parsed document and refresh every derived structure.

        Args:
            doc: The parsed spaCy document to work on.
            verbose: Forwarded to the helper ``update`` calls that accept it.
        """
        self.sp_doc = doc

        # The coref map resolves character offsets through the index map, so
        # the index map must be rebuilt first.
        self.index_map = self.load_index_map()
        self.ent_map = self.load_ent_map()
        self.noun_chunk_map = self.load_noun_chunk_map()
        self.coref_map = self.load_coref_map()

        self.parts.update()
        # Forward the caller's flag; it was previously ignored in favour of a
        # hard-coded False.
        self.species.update(doc.text, verbose=verbose)
        self.trait.update(verbose=verbose)
        self.cause.update(verbose=verbose)
        self.change.update(verbose=verbose)

    def update_text(self, text, verbose=False):
        """Parse ``text`` with the pipeline and load the result via ``update_doc``.

        Args:
            text: Raw text to run through ``self.sp_nlp``.
            verbose: Passed through to ``update_doc``.
        """
        self.update_doc(self.sp_nlp(text), verbose=verbose)

    def load_index_map(self):
        """Build a map from every character offset inside a token to that token.

        Offsets that fall between tokens (e.g. whitespace) get no entry.

        Returns:
            dict mapping int character offset -> token.

        Raises:
            Exception: If no document has been loaded.
        """
        if self.sp_doc is None:
            raise Exception("DNE")

        return {
            char_index: token
            for token in self.sp_doc
            for char_index in range(token.idx, token.idx + len(token))
        }

    def load_noun_chunk_map(self):
        """Build a map from each token to the noun chunk that contains it.

        Returns:
            dict mapping token -> noun-chunk span.

        Raises:
            Exception: If no document has been loaded.
        """
        if self.sp_doc is None:
            raise Exception("DNE")

        return {
            token: noun_chunk
            for noun_chunk in self.sp_doc.noun_chunks
            for token in noun_chunk
        }

    def load_ent_map(self):
        """Build a map from each token to the named entity that contains it.

        Returns:
            dict mapping token -> entity span.

        Raises:
            Exception: If no document has been loaded.
        """
        if self.sp_doc is None:
            raise Exception("DNE")

        return {token: ent for ent in self.sp_doc.ents for token in ent}

    def load_coref_map(self):
        """Build a map from referring tokens to the span they refer back to.

        Two sources feed the map:

        * Each cluster in ``doc._.coref_clusters`` is treated as a list of
          ``(start, end)`` character offsets with ``end`` exclusive.  The
          first mention is the reference span; every token of the remaining
          mentions maps to it.
        * Each token "which" (case-insensitive) maps to the closest preceding
          NOUN/PROPN token, and a noun is consumed by at most one "which".
          These entries are written last and override cluster entries.

        Requires ``self.index_map`` to be current for the loaded document.

        Returns:
            dict mapping token -> antecedent span.

        Raises:
            Exception: If no document has been loaded.
        """
        if self.sp_doc is None:
            raise Exception("DNE")

        doc = self.sp_doc

        def span_between(start_char, end_char):
            # Convert an end-exclusive character range into a token span.
            first = self.index_map[start_char].i
            last = self.index_map[end_char - 1].i
            return doc[first:last + 1]

        coref_map = {}
        for cluster in doc._.coref_clusters:
            antecedent = span_between(cluster[0][0], cluster[0][1])
            for start, end in cluster[1:]:
                for token in span_between(start, end):
                    coref_map[token] = antecedent

        last_noun = None
        for token in doc:
            if token.pos_ in ("NOUN", "PROPN"):
                last_noun = doc[token.i:token.i + 1]

            if token.lower_ == "which" and last_noun:
                coref_map[token] = last_noun
                # Consume the noun so a later "which" needs a fresh one.
                last_noun = None

        return coref_map

    def token_at_char(self, char_index):
        """Return the token that covers the given character offset.

        Args:
            char_index: Character offset into the loaded document's text.

        Returns:
            The token stored in ``self.index_map`` for that offset.

        Raises:
            Exception: ``"DNE"`` if no document or index map has been loaded,
                or a "Not Found" message if the offset is not inside a token.
        """
        # Check for None explicitly, matching the load_* methods: an empty
        # document or empty map is falsy but still loaded.
        if self.sp_doc is None or self.index_map is None:
            raise Exception("DNE")

        try:
            return self.index_map[char_index]
        except KeyError:
            raise Exception(f"Token at Index {char_index} Not Found") from None