In [1]:
import pandas as pd
import spacy
#nlp = spacy.load('en_core_web_sm') 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# Load the small English spaCy pipeline. Resolve it by package name first so the
# notebook is not tied to one machine's directory layout; fall back to the
# original author's local model directory only when the name cannot be resolved.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:  # spaCy raises OSError when a model name or path cannot be found
    nlp = spacy.load(r'C:\\Users\\Marta\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages\\en_core_web_sm\\en_core_web_sm-2.2.0')

In [None]:
# QUERY API: https://krr.triply.cc/annadg/-/queries/Abstract-Data-Query/1
# Load the query results (presumably exported from the query above — TODO confirm).
# Later cells read the 'value' column and drop the 'abstract_entities' column.
data = pd.read_csv('entityQueryResults.csv')

In [None]:
def pattern_match(text):
    """Return the first sentence of ``text`` that mentions a hypothesis.

    A sentence qualifies when one of its tokens has a lemma in the list below
    (hypothesis / hypothesize / hypothesise and their past forms).

    Parameters
    ----------
    text : str
        Text to search. Anything that is not a non-empty string (for example
        the NaN that pandas uses for an empty CSV cell) is treated as
        "no match" instead of being passed to spaCy, which would raise.

    Returns
    -------
    str or None
        Text of the sentence containing the first match, or None when there is
        no match or no usable input.
    """
    if not isinstance(text, str) or not text:
        return None

    matcher = Matcher(nlp.vocab)
    # Add match ID "HypothesisIs" with no callback and one single-token pattern
    pattern = [{'LEMMA':{"IN":["hypothesis","hypothesize","hypothesise", "hypothesized", "hypothesised"]}}]

    matcher.add("HypothesisIs", None, pattern)

    doc = nlp(text)
    matches = matcher(doc)
    if not matches:
        return None

    # Only the first match is used; any further matches are ignored.
    _, start, end = matches[0]
    return doc[start:end].sent.text

In [None]:
# For every abstract, store the first sentence that mentions a hypothesis
# (pattern_match yields None when there is no such sentence).
data['pattern_match'] = data['value'].map(pattern_match)

In [None]:
# The sentence below makes the later merge-noun-chunks step fail, so its row has to be removed.
# This cell looks up the index label of the first row holding it; the following cell drops the row.
data.loc[data['pattern_match'].eq("While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)")].index[0]

In [None]:
# Remove the abstract whose hypothesis sentence breaks the merge-noun-chunks step.
# The row is selected by its sentence instead of the hard-coded position 220, so
# re-running this cell is a no-op rather than deleting whichever row has moved
# into position 220.
problem_sentence = "While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"
data.drop(data[data['pattern_match'] == problem_sentence].index, inplace=True)

In [None]:
# Boolean mask that is True where 'pattern_match' is missing (no hypothesis sentence found);
# these rows need attention because the subsequent steps will break on missing values
bool_series = data["pattern_match"].isna()

# Show the rows without a match
data.loc[bool_series]

In [None]:
# Spot-check abstracts that did not match the lemma pattern, to verify the pattern itself is sound

for abstract in data['value'][881:882]:  # the slice yields at most one abstract
    print(abstract, '\n')

In [None]:
# Keep only the rows for which a hypothesis sentence was found
# (rows whose 'pattern_match' is missing are removed from `data` in place)
data.dropna(axis='index', how='any', subset=['pattern_match'], inplace=True)

In [None]:
# Register spaCy's built-in "merge_noun_chunks" component on the pipeline.
# Adding a component whose name is already present raises, so the registration is
# guarded: re-running this cell reuses the existing component instead of failing.
if "merge_noun_chunks" in nlp.pipe_names:
    merge_nps = nlp.get_pipe("merge_noun_chunks")
else:
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    nlp.add_pipe(merge_nps)

def merge_noun_chunks(text):
    """Run ``text`` through the pipeline and return the text of every token.

    The pipeline has the "merge_noun_chunks" component added at the top of this
    cell, so a noun chunk is expected to come back as one (possibly multi-word)
    token rather than several.
    """
    return [token.text for token in nlp(text)]

In [None]:
# Tokenise each hypothesis sentence into a list of token texts (noun chunks merged)
data['merged_noun_chunks'] = data['pattern_match'].map(merge_noun_chunks)

In [None]:
def combine_chunks(list_of_chunks):
    """Join token texts into one sentence, gluing multi-word chunks with underscores.

    Every space inside an element is replaced by '_' so that the element stays a
    single whitespace-delimited unit; the elements are then joined with single
    spaces.

    Parameters
    ----------
    list_of_chunks : list of str
        Token texts; an element may contain spaces.

    Returns
    -------
    str
        The joined sentence ('' for an empty list).
    """
    # Build new strings rather than assigning back into ``list_of_chunks``, so the
    # caller's list is left untouched. ``replace`` is a no-op for elements without
    # spaces, so no separate multi-word check is needed.
    return ' '.join(chunk.replace(' ', '_') for chunk in list_of_chunks)

In [None]:
# Rebuild each sentence as one string in which multi-word chunks are underscore-joined
data['merged_sent'] = data['merged_noun_chunks'].map(combine_chunks)

In [None]:
# solution from https://stackoverflow.com/questions/59993683/how-can-i-get-spacy-to-stop-splitting-both-hyphenated-numbers-and-words-into-sep 
def custom_tokenizer(nlp):
    """
    Build a Tokenizer that keeps intra-hyphenated words as single tokens.

    The default infix rules are reused, except that the generic operator rule
    between numbers is swapped for two narrower rules that no longer cover a
    plain hyphen between digits, and every rule containing the hyphen/dash
    alternation is dropped. Prefix, suffix, token_match and the tokenizer
    exceptions are taken over unchanged.
    """
    generic_number_op = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    narrowed_number_ops = (r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)")
    dash_alternation = '-|–|—|--|---|——|~'

    default_rules = list(nlp.Defaults.infixes)
    # list.remove raises ValueError when the rule is not among the defaults
    default_rules.remove(generic_number_op)

    candidate_rules = tuple(default_rules) + narrowed_number_ops
    # Dropping the rules that contain the dash alternation removes the "- between letters" split
    infixes = [rule for rule in candidate_rules if dash_alternation not in rule]
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Install the hyphen-preserving tokenizer on the shared pipeline
nlp.tokenizer = custom_tokenizer(nlp)

In [None]:
def get_keywords(hypothesis_sentence):
    """Return the texts of all tokens tagged as noun or proper noun.

    The sentence is parsed with the shared ``nlp`` pipeline; tokens are returned
    in sentence order.
    """
    wanted_pos = ("PROPN", "NOUN")
    return [token.text for token in nlp(hypothesis_sentence) if token.pos_ in wanted_pos]

In [None]:
# Extract the noun / proper-noun tokens of every underscore-joined sentence
data['hypothesis_entities'] = data['merged_sent'].map(get_keywords)

In [None]:
# Remove 'abstract_entities' and the two intermediate columns built above ...
data.drop(['abstract_entities', 'merged_noun_chunks', 'merged_sent'], axis='columns', inplace=True)
# ... and give the matched sentence its final column name
data.rename({"pattern_match": "hypothesis_sentence"}, axis='columns', inplace=True)

In [None]:
def clean_hypothesis_entities(text):
    """Turn underscore-joined entities back into space-separated phrases.

    Parameters
    ----------
    text : iterable of str
        Entity strings in which underscores stand for spaces. (The parameter is
        named ``text`` but is iterated element by element.)

    Returns
    -------
    list of str
        A new list with every '_' replaced by ' ', in the original order.
    """
    # The former ``word.split(' ')`` call discarded its result and has been removed.
    return [word.replace('_', ' ') for word in text]

In [None]:
# Restore the spaces inside multi-word entities
data["clean_hypothesis_entities"] = data["hypothesis_entities"].map(clean_hypothesis_entities)

In [None]:
def f(x):
    """Name an entity column from its 0-based position: 0 -> 'entity_1', 1 -> 'entity_2', ..."""
    return 'entity_{}'.format(x + 1)

# Spread every row's entity list over separate columns (one entity per column),
# keeping data's index; rows with fewer entities are padded with ''.
entity_df = (
    pd.DataFrame(data.clean_hypothesis_entities.values.tolist(), index=data.index, dtype=object)
    .fillna('')
    .rename(columns=f)
)

In [None]:
# Replace the index of both frames with a fresh 0..n-1 range; the old labels are discarded
data, entity_df = data.reset_index(drop=True), entity_df.reset_index(drop=True)

In [None]:
# Place the entity_N columns next to the original columns, row by row
merged_data = pd.concat([data, entity_df], axis='columns')

In [None]:
# Display the combined frame (original columns plus the entity_N columns) for a visual check
merged_data

In [None]:
# Write the combined frame to CSV; index=False leaves the row index out of the file
merged_data.to_csv('paper_hyp_entity_data.csv', index=False)