In [31]:
import pandas as pd
import spacy
# Shared spaCy pipeline used by every text-processing function in this notebook.
nlp = spacy.load('en_core_web_sm') 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

In [32]:
# QUERY API: https://krr.triply.cc/annadg/-/queries/Abstract-Data-Query/1
# Load the query results from a local CSV file (presumably exported from the
# query above -- confirm provenance before re-running on new data).
data = pd.read_csv('entityQueryResults.csv')

In [33]:
# Display the loaded frame; pandas elides the middle rows in the rendered output.
data

Unnamed: 0,paper,new_id,article_types,pm_central_id,abstract,value,abstract_entities
0,http://ns.inria.fr/covid19/2b244041ab6f2ab167b...,http://example.org/our_ontology/PMC4451521#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC4451521,http://ns.inria.fr/covid19/2b244041ab6f2ab167b...,BACKGROUND: The current standard care therapy ...,https://krr.triply.cc/.well-known/genid/1a8e54...
1,http://ns.inria.fr/covid19/ee091049684ecee3ab3...,http://example.org/our_ontology/PMC4171353#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC4171353,http://ns.inria.fr/covid19/ee091049684ecee3ab3...,Abstract Rabies virus (RABV) is a neurotropic ...,https://krr.triply.cc/.well-known/genid/0c5272...
2,http://ns.inria.fr/covid19/52c813a9ed6582f44ae...,http://example.org/our_ontology/PMC3653767#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC3653767,http://ns.inria.fr/covid19/52c813a9ed6582f44ae...,BACKGROUND: The genetic element s2m seems to r...,https://krr.triply.cc/.well-known/genid/04f2a6...
3,http://ns.inria.fr/covid19/0b22db40e9e78fb29f6...,http://example.org/our_ontology/PMC3937728#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC3937728,http://ns.inria.fr/covid19/0b22db40e9e78fb29f6...,Prior studies of clay–virus interactions have ...,https://krr.triply.cc/.well-known/genid/17395a...
4,http://ns.inria.fr/covid19/71788b8e64a665f6208...,http://example.org/our_ontology/PMC7087777#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC7087777,http://ns.inria.fr/covid19/71788b8e64a665f6208...,"Following the SARS outbreak, extensive surveil...",https://krr.triply.cc/.well-known/genid/012736...
...,...,...,...,...,...,...,...
547,http://ns.inria.fr/covid19/097b9b4ada3e89942ea...,http://example.org/our_ontology/PMC6884901#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC6884901,http://ns.inria.fr/covid19/097b9b4ada3e89942ea...,Vitamin A (VA) has pleiotropic effects on the ...,https://krr.triply.cc/.well-known/genid/093a08...
548,http://ns.inria.fr/covid19/5f8c204d73feaf62ba6...,http://example.org/our_ontology/PMC4464075#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC4464075,http://ns.inria.fr/covid19/5f8c204d73feaf62ba6...,Dietary fats and sodium are both palatable and...,https://krr.triply.cc/.well-known/genid/003a90...
549,http://ns.inria.fr/covid19/845b2c4662c9d7db919...,http://example.org/our_ontology/PMC6211748#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC6211748,http://ns.inria.fr/covid19/845b2c4662c9d7db919...,RNA viruses are the only known RNA-protein (RN...,https://krr.triply.cc/.well-known/genid/055146...
550,http://ns.inria.fr/covid19/87b2f2205b9dea38eea...,http://example.org/our_ontology/PMC4871871#Hyp...,"http://purl.org/ontology/bibo/AcademicArticle,...",PMC4871871,http://ns.inria.fr/covid19/87b2f2205b9dea38eea...,Rabies remains a major public health concern i...,https://krr.triply.cc/.well-known/genid/00457e...


In [34]:
def pattern_match(text):
    """Return the first sentence of ``text`` that mentions a hypothesis.

    A sentence qualifies when one of its tokens has a lemma in the list
    below (hypothesis / hypothesize / hypothesise and their past forms).

    Parameters
    ----------
    text : str
        Text to search (here: the abstract stored in the ``value`` column).

    Returns
    -------
    str or None
        Text of the sentence containing the first match, or ``None`` when
        nothing matches or ``text`` is not a string.
    """
    # Non-string cells (e.g. NaN for a missing CSV value) cannot be parsed by
    # spaCy; treat them the same way as "no hypothesis sentence found".
    if not isinstance(text, str):
        return None

    matcher = Matcher(nlp.vocab)
    # Add match ID "HypothesisIs" with no callback and one pattern
    # (spaCy v2 call signature, consistent with nlp.create_pipe used below).
    pattern = [{'LEMMA':{"IN":["hypothesis","hypothesize","hypothesise", "hypothesized", "hypothesised"]}}]
    matcher.add("HypothesisIs", None, pattern)

    doc = nlp(text)
    matches = matcher(doc)
    if not matches:
        return None

    # Only the first match is used; later hypothesis sentences are ignored.
    _, start, end = matches[0]
    return doc[start:end].sent.text

In [35]:
# Extract one hypothesis sentence per row from the text in the 'value' column;
# rows without a match receive None (pattern_match's fall-through return value).
data['pattern_match'] = data['value'].apply(pattern_match)

In [36]:
# This sentence has to be removed, otherwise the noun-chunk merging step below does not work.
# Look up the index label of its first occurrence here; the next cell drops that row.
data[data['pattern_match']=="While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"].index[0]

220

In [37]:
# Remove the sentence that breaks noun-chunk merging. Rows are selected by
# content instead of the hard-coded position 220, so that:
#   * re-running this cell is a no-op (the positional version silently dropped
#     a different, valid row on every re-run), and
#   * the cell keeps working if the CSV is regenerated and rows move.
# Every row carrying this sentence is removed, since any of them would break
# the merging step.
problem_sentence = "While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"
data.drop(data[data['pattern_match'] == problem_sentence].index, inplace=True)

In [38]:
# Register spaCy's built-in "merge_noun_chunks" component so that each noun
# chunk comes out of the pipeline as a single token.
# Guarded so the cell can be re-run: adding a component whose name is already
# in the pipeline raises an error.
if "merge_noun_chunks" not in nlp.pipe_names:
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    nlp.add_pipe(merge_nps)

def merge_noun_chunks(text):
    """Run ``text`` through the shared pipeline and return its token texts.

    The result contains every token of the document, in order, as a list of
    strings; it relies on the pipeline's noun-chunk merging component to turn
    multi-word noun chunks into single (space-containing) tokens.
    """
    return [token.text for token in nlp(text)]

In [39]:
# Tokenise every hypothesis sentence into a list of token texts.
data['merged_noun_chunks'] = data['pattern_match'].apply(merge_noun_chunks)

In [40]:
def combine_chunks(list_of_chunks):
    """Join chunks into one sentence, gluing multi-word chunks with underscores.

    Spaces inside a chunk are replaced by ``_`` so that the chunk survives as
    one whitespace-delimited word; the chunks are then joined with single
    spaces.

    Parameters
    ----------
    list_of_chunks : list of str
        Token/chunk texts in sentence order. The list is NOT modified (the
        previous implementation overwrote its elements in place, which also
        altered the lists stored in the source DataFrame column).

    Returns
    -------
    str
        The re-assembled sentence; ``""`` for an empty list.
    """
    # str.replace is a no-op for chunks without spaces, so no length check is needed.
    return ' '.join(chunk.replace(' ', '_') for chunk in list_of_chunks)

In [41]:
# Rebuild each sentence as a string in which multi-word chunks are underscore-joined.
data['merged_sent'] = data['merged_noun_chunks'].apply(combine_chunks)

In [42]:
# solution from https://stackoverflow.com/questions/59993683/how-can-i-get-spacy-to-stop-splitting-both-hyphenated-numbers-and-words-into-sep 
def custom_tokenizer(nlp):
    """
    Build a tokenizer for ``nlp`` that keeps intra-hyphenated words as single tokens.

    The default infix rules are adjusted and compiled into a new infix regex;
    prefix/suffix handling and ``token_match`` are taken from the current
    tokenizer, and the exception rules from ``nlp.Defaults``.
    """
    generic_number_op = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    letter_hyphen_marker = '-|–|—|--|---|——|~'

    # Start from the default infixes, minus the generic operator rule that
    # applies between numbers or between a number and a '-'.
    base_rules = list(nlp.Defaults.infixes)
    base_rules.remove(generic_number_op)

    # Re-add that rule without the (?<=[0-9])-(?=[0-9]) case.
    candidate_rules = base_rules + [r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"]

    # Discard the rule that splits on '-' between letters.
    kept_rules = [rule for rule in candidate_rules if letter_hyphen_marker not in rule]
    infix_pattern = compile_infix_regex(kept_rules)

    current = nlp.tokenizer
    return Tokenizer(
        nlp.vocab,
        prefix_search=current.prefix_search,
        suffix_search=current.suffix_search,
        infix_finditer=infix_pattern.finditer,
        token_match=current.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Replace the tokenizer of the shared pipeline; every nlp(...) call from here
# on uses it, whereas the cells above ran with the previous tokenizer.
nlp.tokenizer = custom_tokenizer(nlp)

In [43]:
def get_keywords(hypothesis_sentence):
    """Return the texts of all tokens tagged as noun or proper noun.

    The sentence is parsed with the shared pipeline; tokens whose coarse
    part-of-speech tag is ``NOUN`` or ``PROPN`` are collected in document order.
    """
    wanted_tags = ("PROPN", "NOUN")
    return [token.text for token in nlp(hypothesis_sentence) if token.pos_ in wanted_tags]

In [44]:
# Collect the noun / proper-noun tokens of each underscore-joined sentence.
data['hypothesis_entities'] = data['merged_sent'].apply(get_keywords)

In [46]:
# Remove the columns that are not exported and give the extracted sentence
# its final column name. Both calls modify `data` in place, so re-running this
# cell on the already-modified frame fails on the missing columns.
data.drop(columns = ['abstract_entities', 'merged_noun_chunks', 'merged_sent'] , inplace=True)
data.rename(columns={"pattern_match":"hypothesis_sentence"}, inplace=True)

In [47]:
# Write the final table to disk without the DataFrame index column.
data.to_csv('paper_hyp_entity_data.csv', index=False)