In [1]:
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')  # loading the model, which by default has pipeline ['tagger', 'parser', 'ner']

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

## Load data and create new dataframe

In [2]:
hyps = pd.read_csv('hyps3.csv')

## Get hypothesis sentences and noun chunks

In [3]:
def pattern_match(text):
    """function to find sentences that contain the lemma of hypothesis"""
    matcher = Matcher(nlp.vocab)
    # Add match ID "HypothesisIs" with no callback and one pattern
    pattern = [{'LEMMA':{"IN":["hypothesis","hypothesize","hypothesise", "hypothesized", "hypothesised"]}}]
    
    matcher.add("HypothesisIs", None, pattern)

    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        sent = span.sent
        return sent.text

In [4]:
df = pd.DataFrame(hyps['object'])
df['pattern_match'] = df['object'].apply(pattern_match)
# display(df)

In [5]:
# # creating bool series True for NaN values - as the subsequent formula will break if there are 
# bool_series = pd.isnull(df["pattern_match"])  
    
# # filtering data  
# df[bool_series]

In [6]:
# SKIP IF NO NaN values found
# Take a look at the abstracts where the match was not found
# Determine if it is a matching error or if there is in fact no match

# for row in data_df.text[460:461]:  # iterating through the rows of the object column
#     print(row, '\n')

In [7]:
# Drop NaN rows from the dataframe (or do something smarter to filter them out so they don't break subsequent functions)
#data_df = data_df.dropna()

** comment the next cell if the merged noun chunks normally works for you

In [8]:
df.drop([df.index[455]], inplace=True)

In [9]:
merge_nps = nlp.create_pipe("merge_noun_chunks")
nlp.add_pipe(merge_nps)

def merge_noun_chunks(text):
    """function to merge noun chunks in texts"""
    noun_chunks = []
    for t in nlp(text):
        noun_chunks.append(t.text)
        
    return noun_chunks

In [10]:
# was helpful for finding the sentence causing the error
# docs = []
# error_lst = []
# error_text_lst = []
# for text in df['pattern_match']: 
#     try: 
#         doc = nlp(text)
#         docs.append(doc)
#     except ValueError as e: 
#         error_lst.append(e)
#         error_text_lst.append(text)
#         print("Error found in sentence: " + text)

In [11]:
# df[df['pattern_match']=="While the hypothesis that dromedary camels are the likely major source of MERS-CoV infection in humans is gaining acceptance, conjecture continues over the original natural reservoir host(s)"].index[0]

In [13]:
# Run formula on the 'pattern_match' column
df['merged_noun_chunks'] = df['pattern_match'].apply(merge_noun_chunks)

In [14]:
def combine_chunks(list_of_chunks):
    for index, word in enumerate(list_of_chunks):
        if len(word.split(' ')) > 1:
            new_word = word.replace(' ', '_')
            list_of_chunks[index] = new_word
    sentence = ' '.join(list_of_chunks)
    return sentence

In [15]:
# merged_nouns_list = data['merged_noun_chunks'])
df['merged_sent'] = df['merged_noun_chunks'].apply(combine_chunks)

In [16]:
df

Unnamed: 0,object,pattern_match,merged_noun_chunks,merged_sent
0,MUC1 variable number tandem repeats (VNTRs) co...,"Therefore, we hypothesize that a MUC1 VNTR TAC...","[Therefore, ,, we, hypothesize, that, a_MUC1_V...","Therefore , we hypothesize that a_MUC1_VNTR_TA..."
1,BACKGROUND: Mounting evidence suggests that ho...,We conducted a scoping review of the literatur...,"[We, conducted, a_scoping_review, of, the_lite...",We conducted a_scoping_review of the_literatur...
2,Abstract Enterohemorrhagic Escherichia coli (E...,"Thus, we hypothesize that the expression of si...","[Thus, ,, we, hypothesize, that, the_expressio...","Thus , we hypothesize that the_expression of s..."
3,The digestive tract is the entry site for tran...,"Therefore, we hypothesized that RA could induc...","[Therefore, ,, we, hypothesized, that, RA, cou...","Therefore , we hypothesized that RA could indu..."
4,BACKGROUND: FIV infection frequently compromis...,We hypothesized that FIV infection may cause d...,"[We, hypothesized, that, FIV_infection, may, c...",We hypothesized that FIV_infection may cause d...
...,...,...,...,...
547,Abstract P/V gene substitutions convert the no...,"Here, we used two distinct animal model system...","[Here, ,, we, used, two_distinct_animal_model_...","Here , we used two_distinct_animal_model_syste..."
548,Alcoholic liver disease (ALD) is characterized...,We hypothesized that TREM‐1 signaling contribu...,"[We, hypothesized, that, TREM‐1, signaling, co...",We hypothesized that TREM‐1 signaling contribu...
549,The unique ornamental features and extreme sex...,Innate and adaptive immune genes involved in c...,"[Innate_and_adaptive_immune_genes, involved, i...",Innate_and_adaptive_immune_genes involved in c...
550,"In the recent years, it has been demonstrated ...",Based on an extensive bibliography where the i...,"[Based, on, an_extensive_bibliography, where, ...",Based on an_extensive_bibliography where the_i...


In [18]:
df.to_csv("hyps_merged_sents.csv", index=False)

## Get specific Noun Chunks

In [20]:
hyps = pd.read_csv('hyps_merged_sents.csv')
sent2parse = hyps["merged_sent"]

In [21]:
# solution from https://stackoverflow.com/questions/59993683/how-can-i-get-spacy-to-stop-splitting-both-hyphenated-numbers-and-words-into-sep 
def custom_tokenizer(nlp):
    """
    Function that keeps intra-hyphenated words as single tokens.
    """
    inf = list(nlp.Defaults.infixes)               # Default infixes
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")    # Remove the generic op between numbers or between a number and a -
    inf = tuple(inf)                               # Convert inf to tuple
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])  # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x] # Remove - between letters rule
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)

nlp.tokenizer = custom_tokenizer(nlp)

In [22]:
# def get_keywords(hypothesis_sentence):
#     keywords = []
#     for hypothesis in series:
#         hypothesis_keywords = []
#         doc = nlp(hypothesis)
#         for tok in doc:
#             if tok.pos_ == "PROPN" or tok.pos_ == "NOUN":
#                 hypothesis_keywords.append(tok.text)
#         keywords.append(hypothesis_keywords)
#     return keywords

In [27]:
def get_keywords(hypothesis_sentence):
    hypothesis_keywords = []
    doc = nlp(hypothesis_sentence)
    for tok in doc:
        if tok.pos_ == "PROPN" or tok.pos_ == "NOUN":
            hypothesis_keywords.append(tok.text)
    return hypothesis_keywords

In [28]:
hyps['keywords'] = hyps['merged_sent'].apply(get_keywords)

In [29]:
hyps.to_csv("abstract_data.csv", index=False)