In [1]:
import pandas as pd
import spacy
# Shared spaCy pipeline for the whole notebook. Later cells change it in place:
# a merge_noun_chunks component is added and the tokenizer is replaced.
nlp = spacy.load('en_core_web_sm') 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# nlp = spacy.load(r'C:\\Users\\Marta\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages\\en_core_web_sm\\en_core_web_sm-2.2.0')

## Observations / Notes

1. Ran the new query that Anna built; it returns 931 results
2. There are 91 results which do not have the lemma 'hypothesis' in them - these are removed from the dataframe
3. The final result currently has 840 abstracts
4. Once the 'grouped' entities are separated out, an abstract can have up to 31 of them (although that maximum comes from a single outlier paper)
5. I separated each entity out into its own column. I am not sure whether this is the correct way to go about it, but my thinking is that in order to create triples we would want to work per entity. We can tweak this as required.


In [2]:
# QUERY API: https://api.krr.triply.cc/datasets/annadg/covid19/services/covid19-1/sparql
# Load the saved results (presumably exported from the query endpoint above - confirm).
# The path is relative to the notebook's working directory.
data = pd.read_csv('entityQueryResults.csv')

In [3]:
def pattern_match(text):
    """Return the first sentence of ``text`` that contains a hypothesis lemma.

    Args:
        text: The text to search (the notebook passes one abstract per call).
            Anything that is not a non-empty string - e.g. the NaN that pandas
            uses for a missing cell - is treated as "no match".

    Returns:
        The text of the sentence containing the first match reported by the
        matcher, or None when there is no match or ``text`` is not a
        non-empty string.
    """
    # Guard first: nlp() cannot process NaN/None, and an empty string can
    # never match, so skip the pipeline entirely in those cases.
    if not isinstance(text, str) or not text:
        return None

    matcher = Matcher(nlp.vocab)
    # Single-token pattern: any token whose lemma is one of these forms
    pattern = [{'LEMMA':{"IN":["hypothesis","hypothesize","hypothesise", "hypothesized", "hypothesised"]}}]

    # Add match ID "HypothesisIs" with no callback and one pattern
    # (spaCy v2 call signature: key, on_match callback, *patterns)
    matcher.add("HypothesisIs", None, pattern)

    doc = nlp(text)
    for _match_id, start, end in matcher(doc):
        # Only the first match is used; return the sentence it belongs to
        return doc[start:end].sent.text

    return None

In [4]:
# For every abstract in column 'value', store the first sentence that matches a
# hypothesis lemma; rows without a match end up with a missing value.
data['pattern_match'] = data['value'].apply(pattern_match)

In [5]:
# This sentence has to be removed, otherwise the later merge-noun-chunks step does not work.
# Look up the index label of its row here; the row itself is dropped in the following cell.
# Note: .index[0] raises IndexError if the sentence is not present in the data.
data[data['pattern_match']=="While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"].index[0]

1173

In [6]:
# Drop the problematic sentence by its index *label* (1173, as reported by the
# lookup cell above). Dropping by label rather than by position keeps this cell
# idempotent: a positional lookup would pick a different row on every re-run,
# whereas errors="ignore" turns a re-run into a no-op once the row is gone.
data.drop(index=1173, inplace=True, errors="ignore")

In [7]:
# Boolean mask that is True where no hypothesis sentence was found (missing
# value); the subsequent processing steps break if such rows are left in.
bool_series = data["pattern_match"].isna()

# Show the rows that did not produce a match
data.loc[bool_series]

Unnamed: 0,paper,article_types,new_id,pm_central_id,abstract,value,abstract_entities,pattern_match
7,http://ns.inria.fr/covid19/1eb3f3f0aafd8b2741a...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/1eb3f3f...,,http://ns.inria.fr/covid19/1eb3f3f0aafd8b2741a...,Summary Objectives To measure the spatial cont...,https://krr.triply.cc/.well-known/genid/1683bd...,
19,http://ns.inria.fr/covid19/PMC4747015,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4747...,PMC4747015,http://ns.inria.fr/covid19/PMC4747015#abstract,Information regarding effective anesthetic reg...,,
45,http://ns.inria.fr/covid19/4b278bf04e245866207...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/4b278bf...,,http://ns.inria.fr/covid19/4b278bf04e245866207...,Abstract Global re-emergence of Chikungunya vi...,https://krr.triply.cc/.well-known/genid/0e1b11...,
52,http://ns.inria.fr/covid19/84317229254324b8c2d...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7106...,PMC7106413,http://ns.inria.fr/covid19/84317229254324b8c2d...,Summary The Coalition for Epidemic Preparednes...,https://krr.triply.cc/.well-known/genid/0000a5...,
54,http://ns.inria.fr/covid19/700cf45c09aadb04452...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/700cf45...,,http://ns.inria.fr/covid19/700cf45c09aadb04452...,"As of March 23, 2020 there have been over 354,...",,
...,...,...,...,...,...,...,...,...
1181,http://ns.inria.fr/covid19/7461fe0adbb9a865f8a...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5661...,PMC5661933,http://ns.inria.fr/covid19/7461fe0adbb9a865f8a...,BACKGROUND: The detection of wild poliovirus i...,https://krr.triply.cc/.well-known/genid/027f07...,
1188,http://ns.inria.fr/covid19/93df1925c1aa0cf7e72...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3282...,PMC3282974,http://ns.inria.fr/covid19/93df1925c1aa0cf7e72...,Kawasaki disease (KD) is a self-limited system...,,
1212,http://ns.inria.fr/covid19/PMC4187631,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4187...,PMC4187631,http://ns.inria.fr/covid19/PMC4187631#abstract,SUMMARY: The pathogenicity and clinical pertin...,,
1213,http://ns.inria.fr/covid19/8d07b32de2cd9460999...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/8d07b32...,,http://ns.inria.fr/covid19/8d07b32de2cd9460999...,A SEIR simulation model for the COVID-19 pande...,https://krr.triply.cc/.well-known/genid/04dd52...,


In [None]:
# Spot-check abstracts that did not match the lemma pattern, to verify that the
# pattern itself is sound. The slice selects which abstract(s) to print.

for abstract_text in data['value'][881:882]:  # one abstract per iteration
    print(abstract_text, '\n')

In [8]:
# Drop rows where 'pattern match' does not return a match.
# The frame is modified in place, so from here on `data` only holds abstracts
# for which a hypothesis sentence was found.
data.dropna(subset=['pattern_match'], inplace = True)

In [9]:
# Inspect the frame after removing the rows without a hypothesis sentence
data

Unnamed: 0,paper,article_types,new_id,pm_central_id,abstract,value,abstract_entities,pattern_match
0,http://ns.inria.fr/covid19/PMC3243747,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3243...,PMC3243747,http://ns.inria.fr/covid19/PMC3243747#abstract,"Amiodarone [2-butyl-3-(3′,5′-diiodo-4’α-diethy...",,One hypothesis for the etiology of idiosyncrat...
1,http://ns.inria.fr/covid19/58432d2ad49266df441...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5970...,PMC5970503,http://ns.inria.fr/covid19/58432d2ad49266df441...,BACKGROUND: Emergence and re-emergence of porc...,https://krr.triply.cc/.well-known/genid/00ffa3...,The objective of this study was to characteriz...
2,http://ns.inria.fr/covid19/02272eaecb2fa1b98f5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/02272ea...,,http://ns.inria.fr/covid19/02272eaecb2fa1b98f5...,Summary Interactions between swine influenza v...,https://krr.triply.cc/.well-known/genid/3576b7...,The present in vitro studies support the hypot...
3,http://ns.inria.fr/covid19/PMC3457171,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3457...,PMC3457171,http://ns.inria.fr/covid19/PMC3457171#abstract,In contrast to seasonal influenza virus infect...,,"However, this hypothesis could not be empirica..."
4,http://ns.inria.fr/covid19/a263f1ec26e982b0f07...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4692...,PMC4692406,http://ns.inria.fr/covid19/a263f1ec26e982b0f07...,This study describes a spring 2013 outbreak of...,https://krr.triply.cc/.well-known/genid/0034d3...,Meteorological data were used to investigate t...
...,...,...,...,...,...,...,...,...
1210,http://ns.inria.fr/covid19/PMC2741394,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC2741...,PMC2741394,http://ns.inria.fr/covid19/PMC2741394#abstract,OBJECTIVES: To test the hypothesis that respir...,,To test the hypothesis that respiratory muscle...
1211,http://ns.inria.fr/covid19/be99996c561fe43928f...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5328...,PMC5328330,http://ns.inria.fr/covid19/be99996c561fe43928f...,Background: Healthcare personnel often use inc...,https://krr.triply.cc/.well-known/genid/1a64a5...,We tested the hypothesis that medical students...
1214,http://ns.inria.fr/covid19/e23d353f82a3c060a65...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7104...,PMC7104067,http://ns.inria.fr/covid19/e23d353f82a3c060a65...,Abstract Background Patients with slowly progr...,https://krr.triply.cc/.well-known/genid/0bca65...,"In this study, we tested the hypothesis that a..."
1215,http://ns.inria.fr/covid19/dca8a40bcecec6511c5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4627...,PMC4627423,http://ns.inria.fr/covid19/dca8a40bcecec6511c5...,Sarcoidosis is a systemic disease characterize...,https://krr.triply.cc/.well-known/genid/04eec3...,The study addresses the hypothesis that distin...


In [10]:
# Add spaCy's built-in noun-chunk merger to the shared pipeline.
# add_pipe raises a ValueError when a component with the same name is already
# registered, so only add it once; this keeps the cell safe to re-run.
merge_nps = nlp.create_pipe("merge_noun_chunks")
if "merge_noun_chunks" not in nlp.pipe_names:
    nlp.add_pipe(merge_nps)

def merge_noun_chunks(text):
    """Run ``text`` through the shared ``nlp`` pipeline and list its token texts.

    The function itself only collects ``token.text`` for every token. Noun
    chunks appear as single multi-word entries because the ``merge_noun_chunks``
    component is added to ``nlp`` before this function is called.

    Args:
        text: The sentence to process.

    Returns:
        A list with the text of each token, in document order.
    """
    return [token.text for token in nlp(text)]

In [11]:
# Split every hypothesis sentence into a list of token texts (noun chunks merged
# into single entries by the pipeline component added above).
data['merged_noun_chunks'] = data['pattern_match'].apply(merge_noun_chunks)

In [12]:
def combine_chunks(list_of_chunks):
    """Join token texts into one sentence, gluing multi-word chunks with underscores.

    Every space inside a chunk is replaced by ``_`` so that the chunk survives
    as a single whitespace-delimited word; the chunks are then joined with
    single spaces.

    The input is left untouched: a new string is built instead of rewriting
    the list in place, so the list stored in the source column keeps its
    original values.

    Args:
        list_of_chunks: Iterable of token/chunk strings.

    Returns:
        The joined sentence as a single string ('' for an empty input).
    """
    return ' '.join(chunk.replace(' ', '_') for chunk in list_of_chunks)

In [13]:
# Rebuild each sentence as one string in which multi-word chunks are joined by underscores
data['merged_sent'] = data['merged_noun_chunks'].apply(combine_chunks)

In [14]:
# solution from https://stackoverflow.com/questions/59993683/how-can-i-get-spacy-to-stop-splitting-both-hyphenated-numbers-and-words-into-sep 
def custom_tokenizer(nlp):
    """
    Build a tokenizer for ``nlp`` that keeps intra-hyphenated words as single tokens.

    The infix rules are derived from ``nlp.Defaults.infixes``:
      * the generic "operator between numbers" rule is removed
        (``list.remove`` raises ValueError if that rule is not among the defaults),
      * two narrower replacements are appended (operators without the hyphen,
        and a hyphen only when followed by another hyphen),
      * every rule containing the hyphen/dash alternation used between letters
        is filtered out.

    Prefix search, suffix search and token_match are taken from the current
    ``nlp.tokenizer``; the exception rules come from ``nlp.Defaults``.

    Returns a new ``Tokenizer``; ``nlp`` itself is not modified here.
    """
    numeric_operator_rule = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    letter_hyphen_marker = '-|–|—|--|---|——|~'

    # Start from the defaults minus the generic numeric-operator rule
    default_rules = list(nlp.Defaults.infixes)
    default_rules.remove(numeric_operator_rule)

    # Re-add the removed rule without its (?<=[0-9])-(?=[0-9]) part
    candidate_rules = default_rules + [
        r"(?<=[0-9])[+*^](?=[0-9-])",
        r"(?<=[0-9])-(?=-)",
    ]

    # Discard the rule that splits on hyphens/dashes between letters
    kept_rules = [rule for rule in candidate_rules if letter_hyphen_marker not in rule]
    infix_pattern = compile_infix_regex(kept_rules)

    current = nlp.tokenizer
    return Tokenizer(
        nlp.vocab,
        prefix_search=current.prefix_search,
        suffix_search=current.suffix_search,
        infix_finditer=infix_pattern.finditer,
        token_match=current.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Install the hyphen-preserving tokenizer on the shared pipeline. Every nlp(...)
# call made after this cell uses it; cells run earlier used the previous tokenizer.
nlp.tokenizer = custom_tokenizer(nlp)

In [15]:
def get_keywords(hypothesis_sentence):
    """Return the noun-like tokens of a sentence.

    The sentence is processed with the shared ``nlp`` pipeline and the text of
    every token whose coarse part-of-speech tag is ``PROPN`` or ``NOUN`` is kept.

    Args:
        hypothesis_sentence: The sentence to analyse.

    Returns:
        A list of token texts in document order (empty if nothing qualifies).
    """
    wanted_tags = ("PROPN", "NOUN")
    return [token.text for token in nlp(hypothesis_sentence) if token.pos_ in wanted_tags]

In [16]:
# Extract the NOUN/PROPN tokens from each underscore-joined sentence
data['hypothesis_entities'] = data['merged_sent'].apply(get_keywords)

In [17]:
# Remove the helper columns that are not wanted in the final output and give the
# matched sentence its final column name. Both calls modify `data` in place;
# re-running this cell makes drop() fail because the columns are already gone.
data.drop(columns = ['abstract_entities', 'merged_noun_chunks', 'merged_sent'] , inplace=True)
data.rename(columns={"pattern_match":"hypothesis_sentence"}, inplace=True)

In [18]:
def clean_hypothesis_entities(text):
    """Turn underscore-joined entity strings back into space-separated phrases.

    Args:
        text: Iterable of entity strings (the notebook passes the list produced
            by ``get_keywords``). Despite the name it is iterated element by
            element, not treated as a single string.

    Returns:
        A new list in which every ``_`` of every entity has been replaced by a
        space; the input is not modified.
    """
    # The former bare ``word.split(' ')`` call discarded its result and had no
    # effect, so only the underscore replacement remains.
    return [word.replace('_', ' ') for word in text]

In [19]:
# Turn the underscores back into spaces to get the final, readable entity lists
data["clean_hypothesis_entities"] = data["hypothesis_entities"].apply(clean_hypothesis_entities)

In [20]:
# Inspect the frame with the new hypothesis/entity columns
data

Unnamed: 0,paper,article_types,new_id,pm_central_id,abstract,value,abstract_entities,hypothesis_sentence,hypothesis_entities,clean_hypothesis_entities
0,http://ns.inria.fr/covid19/PMC3243747,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3243...,PMC3243747,http://ns.inria.fr/covid19/PMC3243747#abstract,"Amiodarone [2-butyl-3-(3′,5′-diiodo-4’α-diethy...",,One hypothesis for the etiology of idiosyncrat...,"[One_hypothesis, the_etiology, idiosyncratic_a...","[One hypothesis, the etiology, idiosyncratic a..."
1,http://ns.inria.fr/covid19/58432d2ad49266df441...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5970...,PMC5970503,http://ns.inria.fr/covid19/58432d2ad49266df441...,BACKGROUND: Emergence and re-emergence of porc...,https://krr.triply.cc/.well-known/genid/00ffa3...,The objective of this study was to characteriz...,"[The_objective, this_study, PEDV-positive_pigs]","[The objective, this study, PEDV-positive pigs]"
2,http://ns.inria.fr/covid19/02272eaecb2fa1b98f5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/02272ea...,,http://ns.inria.fr/covid19/02272eaecb2fa1b98f5...,Summary Interactions between swine influenza v...,https://krr.triply.cc/.well-known/genid/3576b7...,The present in vitro studies support the hypot...,"[The_present, vitro_studies, the_hypothesis, p...","[The present, vitro studies, the hypothesis, p..."
3,http://ns.inria.fr/covid19/PMC3457171,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3457...,PMC3457171,http://ns.inria.fr/covid19/PMC3457171#abstract,In contrast to seasonal influenza virus infect...,,"However, this hypothesis could not be empirica...",[],[]
4,http://ns.inria.fr/covid19/a263f1ec26e982b0f07...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4692...,PMC4692406,http://ns.inria.fr/covid19/a263f1ec26e982b0f07...,This study describes a spring 2013 outbreak of...,https://krr.triply.cc/.well-known/genid/0034d3...,Meteorological data were used to investigate t...,"[Meteorological_data, the_hypothesis, PEDv, air]","[Meteorological data, the hypothesis, PEDv, air]"
...,...,...,...,...,...,...,...,...,...,...
1210,http://ns.inria.fr/covid19/PMC2741394,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC2741...,PMC2741394,http://ns.inria.fr/covid19/PMC2741394#abstract,OBJECTIVES: To test the hypothesis that respir...,,To test the hypothesis that respiratory muscle...,"[the_hypothesis, respiratory_muscle_strength, ...","[the hypothesis, respiratory muscle strength, ..."
1211,http://ns.inria.fr/covid19/be99996c561fe43928f...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5328...,PMC5328330,http://ns.inria.fr/covid19/be99996c561fe43928f...,Background: Healthcare personnel often use inc...,https://krr.triply.cc/.well-known/genid/1a64a5...,We tested the hypothesis that medical students...,"[the_hypothesis, medical_students, insufficien...","[the hypothesis, medical students, insufficien..."
1214,http://ns.inria.fr/covid19/e23d353f82a3c060a65...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7104...,PMC7104067,http://ns.inria.fr/covid19/e23d353f82a3c060a65...,Abstract Background Patients with slowly progr...,https://krr.triply.cc/.well-known/genid/0bca65...,"In this study, we tested the hypothesis that a...","[this_study, the_hypothesis, an_acute_exacerba...","[this study, the hypothesis, an acute exacerba..."
1215,http://ns.inria.fr/covid19/dca8a40bcecec6511c5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4627...,PMC4627423,http://ns.inria.fr/covid19/dca8a40bcecec6511c5...,Sarcoidosis is a systemic disease characterize...,https://krr.triply.cc/.well-known/genid/04eec3...,The study addresses the hypothesis that distin...,"[The_study, the_hypothesis, distinct_patterns,...","[The study, the hypothesis, distinct patterns,..."


In [21]:
def f(x):
    """Map a 0-based column position to its 1-based 'entity_N' column name."""
    return 'entity_{}'.format(x + 1)


# Spread each row's entity list over its own columns (entity_1, entity_2, ...),
# keeping the index of `data`; rows with fewer entities are padded with ''.
entity_df = (
    pd.DataFrame(data["clean_hypothesis_entities"].tolist(), index=data.index, dtype=object)
    .fillna('')
    .rename(columns=f)
)

In [22]:
# Give both frames a fresh 0..n-1 index (drop=True discards the old labels, which
# have gaps after the row drops) so that they line up row for row.
data = data.reset_index(drop=True)
entity_df = entity_df.reset_index(drop=True)

In [23]:
# Put the entity_N columns to the right of the paper columns (axis=1 concatenates
# column-wise; rows are matched on the index reset in the previous cell).
merged_data = (pd.concat([data,entity_df],axis=1))

In [24]:
# Inspect the combined frame
merged_data

Unnamed: 0,paper,article_types,new_id,pm_central_id,abstract,value,abstract_entities,hypothesis_sentence,hypothesis_entities,clean_hypothesis_entities,...,entity_21,entity_22,entity_23,entity_24,entity_25,entity_26,entity_27,entity_28,entity_29,entity_30
0,http://ns.inria.fr/covid19/PMC3243747,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3243...,PMC3243747,http://ns.inria.fr/covid19/PMC3243747#abstract,"Amiodarone [2-butyl-3-(3′,5′-diiodo-4’α-diethy...",,One hypothesis for the etiology of idiosyncrat...,"[One_hypothesis, the_etiology, idiosyncratic_a...","[One hypothesis, the etiology, idiosyncratic a...",...,,,,,,,,,,
1,http://ns.inria.fr/covid19/58432d2ad49266df441...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5970...,PMC5970503,http://ns.inria.fr/covid19/58432d2ad49266df441...,BACKGROUND: Emergence and re-emergence of porc...,https://krr.triply.cc/.well-known/genid/00ffa3...,The objective of this study was to characteriz...,"[The_objective, this_study, PEDV-positive_pigs]","[The objective, this study, PEDV-positive pigs]",...,,,,,,,,,,
2,http://ns.inria.fr/covid19/02272eaecb2fa1b98f5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/02272ea...,,http://ns.inria.fr/covid19/02272eaecb2fa1b98f5...,Summary Interactions between swine influenza v...,https://krr.triply.cc/.well-known/genid/3576b7...,The present in vitro studies support the hypot...,"[The_present, vitro_studies, the_hypothesis, p...","[The present, vitro studies, the hypothesis, p...",...,,,,,,,,,,
3,http://ns.inria.fr/covid19/PMC3457171,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC3457...,PMC3457171,http://ns.inria.fr/covid19/PMC3457171#abstract,In contrast to seasonal influenza virus infect...,,"However, this hypothesis could not be empirica...",[],[],...,,,,,,,,,,
4,http://ns.inria.fr/covid19/a263f1ec26e982b0f07...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4692...,PMC4692406,http://ns.inria.fr/covid19/a263f1ec26e982b0f07...,This study describes a spring 2013 outbreak of...,https://krr.triply.cc/.well-known/genid/0034d3...,Meteorological data were used to investigate t...,"[Meteorological_data, the_hypothesis, PEDv, air]","[Meteorological data, the hypothesis, PEDv, air]",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,http://ns.inria.fr/covid19/PMC2741394,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC2741...,PMC2741394,http://ns.inria.fr/covid19/PMC2741394#abstract,OBJECTIVES: To test the hypothesis that respir...,,To test the hypothesis that respiratory muscle...,"[the_hypothesis, respiratory_muscle_strength, ...","[the hypothesis, respiratory muscle strength, ...",...,,,,,,,,,,
1097,http://ns.inria.fr/covid19/be99996c561fe43928f...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC5328...,PMC5328330,http://ns.inria.fr/covid19/be99996c561fe43928f...,Background: Healthcare personnel often use inc...,https://krr.triply.cc/.well-known/genid/1a64a5...,We tested the hypothesis that medical students...,"[the_hypothesis, medical_students, insufficien...","[the hypothesis, medical students, insufficien...",...,,,,,,,,,,
1098,http://ns.inria.fr/covid19/e23d353f82a3c060a65...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC7104...,PMC7104067,http://ns.inria.fr/covid19/e23d353f82a3c060a65...,Abstract Background Patients with slowly progr...,https://krr.triply.cc/.well-known/genid/0bca65...,"In this study, we tested the hypothesis that a...","[this_study, the_hypothesis, an_acute_exacerba...","[this study, the hypothesis, an acute exacerba...",...,,,,,,,,,,
1099,http://ns.inria.fr/covid19/dca8a40bcecec6511c5...,"http://purl.org/ontology/bibo/AcademicArticle,...",http://example.org/hypothesis_ontology/PMC4627...,PMC4627423,http://ns.inria.fr/covid19/dca8a40bcecec6511c5...,Sarcoidosis is a systemic disease characterize...,https://krr.triply.cc/.well-known/genid/04eec3...,The study addresses the hypothesis that distin...,"[The_study, the_hypothesis, distinct_patterns,...","[The study, the hypothesis, distinct patterns,...",...,,,,,,,,,,


In [25]:
# Write the final table to the working directory; index=False leaves out the row index.
merged_data.to_csv('paper_hyp_entity_data_adg.csv', index=False)