In [11]:
import csv
import pandas as pd
from tqdm import tqdm
from requests.compat import urljoin
import requests
import glob
import os
from bs4 import BeautifulSoup

In [40]:
# !python -m spacy download en_core_web_sm

In [39]:
# !pip install spacy
import spacy

nlp = spacy.load("en_core_web_sm")

In [13]:
# Open types.txt and read the types
with open('../data/gold_standard_corpus.tsv', 'r') as f:
    pmcids = f.read().splitlines()

In [14]:
pmcids

['PMC4792959',
 'PMC4556948',
 'PMC5993813',
 'PMC3174205',
 'PMC5962829',
 'PMC3874094',
 'PMC3792120',
 'PMC4901335',
 'PMC3581133',
 'PMC2935479',
 'PMC5225553',
 'PMC5744400',
 'PMC3281816',
 'PMC3583137',
 'PMC4022742',
 'PMC3542345',
 'PMC4452330',
 'PMC4464872',
 'PMC4872455',
 'PMC3651197',
 'PMC3362782',
 'PMC5817132',
 'PMC4313693',
 'PMC4489904',
 'PMC4552872',
 'PMC5376652',
 'PMC5070310',
 'PMC5921292',
 'PMC5641157',
 'PMC3751948',
 'PMC5472290',
 'PMC4649626',
 'PMC5502978',
 'PMC4767726',
 'PMC3897916',
 'PMC5087830',
 'PMC3585192',
 'PMC5484670',
 'PMC5259676',
 'PMC3024232',
 'PMC3097211',
 'PMC5317055',
 'PMC3648400',
 'PMC5750880',
 'PMC5100220',
 'PMC4749753',
 'PMC5344356',
 'PMC5110973',
 'PMC5708618',
 'PMC3598673',
 'PMC3751959',
 'PMC5131611',
 'PMC5891595',
 'PMC3950279',
 'PMC5972578',
 'PMC5082793',
 'PMC5487420',
 'PMC5106849',
 'PMC3858553',
 'PMC3613406',
 'PMC4167147',
 'PMC3599585',
 'PMC2761781',
 'PMC3899050',
 'PMC2481430',
 'PMC5006041',
 'PMC47908

In [28]:
def get_Json_through_PMCID(pmcid):
    base_url = "https://www.ebi.ac.uk/europepmc/annotations_api/"
    article_url = urljoin(base_url,
                          "annotationsByArticleIds?articleIds=PMC%3A" + pmcid + "&provider=Europe%20PMC&format=JSON")
    r = requests.get(article_url)

    if r.status_code == 200:
        return r
    else:
        return False


In [32]:
def get_epmc_annotations_to_file(PMCids):
    with open('../data/annotations_api.csv', 'w', newline='\n') as f1:
        test_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

        # count = 0
        for each_id in tqdm(PMCids):
            # count = count+1
            # print(each_test_pmc_id + '\t' + str(count))
            json_annotations = get_Json_through_PMCID(each_id[3:])  # Just the number is needed. So remove the PMC from the front
            if json_annotations:
                json_results = json_annotations.json()
                try:
                    pmc_id = json_results[0]['pmcid']
                    # print(pmc_id)
                    for each_annotation in json_results[0]['annotations']:
                        exact = each_annotation['prefix'] + each_annotation['exact'] + each_annotation['postfix']
                        token = each_annotation['tags'][0]['name']
                        ner = each_annotation['type']
                        row = [pmc_id, exact, token, ner]
                        test_writer.writerow(row)
                except(IndexError):
                    print('no annotations found!! '+str(each_id))
            else:
                print('no annotations! '+str(each_id))
                continue


In [33]:
get_epmc_annotations_to_file(pmcids)

100%|█████████████████████████████████████████| 300/300 [00:52<00:00,  5.76it/s]


In [15]:
annotations_df = pd.read_csv('../data/annotations_api.csv', sep='\t', names=['pmcid', 'exact', 'token', 'ner'])
annotations_df

Unnamed: 0,pmcid,exact,token,ner
0,PMC4792959,"Plant Biology, 260 Panama Street, Stanford, C",Panama,Organisms
1,PMC4792959,"In plants, such barriers can either act before",plants,Organisms
2,PMC4792959,her act before (pre-pollination barriers) or a...,pollination,Gene Ontology
3,PMC4792959,barriers) or after pollination (post-pollinat...,pollination,Gene Ontology
4,PMC4792959,or after pollination (post-pollination barrie...,pollination,Gene Ontology
...,...,...,...,...
88185,PMC3458065,f erythrocytes from HIV-positive individual,HIV,Organisms
88186,PMC3458065,lts suggest that in HIV-positive individual,HIV,Organisms
88187,PMC3458065,ing high amounts of HIV by the presence of,HIV,Organisms
88188,PMC3458065,-gp160/120 on their membranes and this may produc,membranes,Gene Ontology


In [16]:
annotations_df['ner'].unique()

array(['Organisms', 'Gene Ontology', 'Diseases', 'Gene_Proteins',
       'Experimental Methods', 'Chemicals', 'Accession Numbers',
       'Resources'], dtype=object)

In [23]:
annotations_df_other = annotations_df[annotations_df['ner'].isin(['Gene Ontology', 'Experimental Methods', 'Accession Numbers', 'Resources'])]
annotations_df_other

Unnamed: 0,pmcid,exact,token,ner
2,PMC4792959,her act before (pre-pollination barriers) or a...,pollination,Gene Ontology
3,PMC4792959,barriers) or after pollination (post-pollinat...,pollination,Gene Ontology
4,PMC4792959,or after pollination (post-pollination barrie...,pollination,Gene Ontology
5,PMC4792959,Pre-pollination barriers can be spatial or tem...,pollination,Gene Ontology
7,PMC4792959,"ecies, whereas post-pollination barriers come ...",pollination,Gene Ontology
...,...,...,...,...
88126,PMC3458065,ated for 7 to 10 days and assayed for syncytiu...,syncytium formation,Gene Ontology
88127,PMC3458065,is able to inhibit syncytium formation (loss ...,syncytium formation,Gene Ontology
88134,PMC3458065,end-point dilution assay in MT-2 cells cultu,assay,Experimental Methods
88156,PMC3458065,e IgG anti-HIV binding to erythrocyte membrane.,membrane,Gene Ontology


In [17]:
# # Get unique PMCIDs
# unique_pmcids = annotations_df_other['pmcid'].unique()url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
# unique_pmcids

# import requests
# from bs4 import BeautifulSoup
# import re

# def get_full_text_xml(pmcid):
#     url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
#     response = requests.get(url)
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.content, 'xml')
#         p_tags = soup.find_all('p')
#         p_texts = [tag.get_text() for tag in p_tags]
#         return p_texts
#     else:
#         return None

In [24]:
unique_pmcids = annotations_df_other['pmcid'].unique()

In [25]:
def get_full_text_xml(pmcid):
    path = f"../data/300_articles_source_files/{pmcid}.xml"
    try:
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        soup = BeautifulSoup(content, 'lxml-xml')  # Use lxml-xml parser
        plain_tags = soup.find_all('plain')
        plain_texts = [tag.get_text() for tag in plain_tags]
        return plain_texts
    except FileNotFoundError:
        print(f"File {pmcid}.xml not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [48]:
def find_sentence_with_substring(string_list, substring):
    for text in string_list:
        sentences = re.split(r'(?<=[.!?])\s+', text)
        for sentence in sentences:
            if substring in sentence:
                return sentence
    return None

# def adjust_token_for_context(sentence, token, ner_):
#     words = sentence.split()
#     punctuation_marks = [',', '.', '?', '!']

#     # Adjust token for larger term if token is a substring and does not contain specific punctuation
#     if ner_ in ['Gene Ontology', 'Experimental Methods']:
#         for word in words:
#             if token in word and word != token and not any(mark in token for mark in punctuation_marks):
#                 token = word  # Update the token to the larger term
#                 break

#     # Extend the token if 'assay', 'assays', 'insertion', or 'insertions' follows the token and the preceding word is in upper or mixed case
#     for key_word in ['assay', 'insertion', 'insertions', 'assays']:
#         if key_word in words:
#             key_word_index = words.index(key_word)
#             if key_word_index > 0 and not words[key_word_index - 1].islower():
#                 if words[key_word_index - 1].lower() not in ['the', 'an', 'a']:
#                     token = words[key_word_index - 1] + ' ' + token  # Update the token
#                     token = token  # Remove parentheses

#     return token.replace('(', '').replace(')', '')

def adjust_token_for_context(sentence, token, ner_):
    words = sentence.split()
    punctuation_marks = [',', '.', '?', '!']
    extended_tokens = []

    # Adjust token for larger term if token is a substring and does not contain specific punctuation
    if ner_ in ['Gene Ontology', 'Experimental Methods']:
        for word in words:
            if token in word and word != token and not any(mark in token for mark in punctuation_marks):
                extended_tokens.append(word)  # Add the matched larger term to the list

    # Extend the token if 'assay', 'assays', 'insertion', or 'insertions' follows the token and the preceding word is in upper or mixed case
    for key_word in ['assay', 'insertion', 'insertions', 'assays']:
        if key_word in words:
            key_word_index = words.index(key_word)
            if key_word_index > 0 and not words[key_word_index - 1].islower():
                if words[key_word_index - 1].lower() not in ['the', 'an', 'a']:
                    extended_token = words[key_word_index - 1] + ' ' + token  # Form the extended token
                    extended_token = extended_token.replace('(', '').replace(')', '')
                    extended_tokens.append(extended_token)  # Add the extended token to the list

    return extended_tokens if extended_tokens else [token]




def process_pmcid(df, pmcid, p_texts):
    sentences_data = {}
    for _, row in df[df['pmcid'] == pmcid].iterrows():
        sentence = find_sentence_with_substring(p_texts, row['exact'])
        if sentence:
            if sentence not in sentences_data:
                sentences_data[sentence] = set()

            updated_tokens = adjust_token_for_context(sentence, row['token'], row['ner'])
            for updated_token in updated_tokens:
                sentences_data[sentence].add((updated_token, row['ner']))

    return [[pmcid, sentence, list(ner_tags)] for sentence, ner_tags in sentences_data.items()]

# # The rest of the script remains the same
# def process_pmcid(df, pmcid, p_texts):
#     sentences_data = {}
#     for _, row in df[df['pmcid'] == pmcid].iterrows():
#         sentence = find_sentence_with_substring(p_texts, row['exact'])
#         if sentence:
#             if sentence not in sentences_data:
#                 sentences_data[sentence] = set()

#             updated_token = adjust_token_for_context(sentence, row['token'],row['ner'])
#             sentences_data[sentence].add((updated_token, row['ner']))

#     return [[pmcid, sentence, list(ner_tags)] for sentence, ner_tags in sentences_data.items()]

final_data = []

for pmcid in tqdm(unique_pmcids):
    p_texts = get_full_text_xml(pmcid)
    if p_texts:
        processed_data = process_pmcid(annotations_df_other, pmcid, p_texts)
        final_data.extend(processed_data)

# Convert to DataFrame
final_df = pd.DataFrame(final_data, columns=['pmcid', 'sentence', 'ner'])

# Save as TSV
final_df.to_csv('../data/xxx6.tsv', sep='\t', index=False)

100%|█████████████████████████████████████████| 297/297 [00:31<00:00,  9.34it/s]


In [18]:
viral capture assay,
interspecific pollination,
N-glycosylation.
flow cytometry assay

SyntaxError: invalid syntax (1924422297.py, line 1)

In [15]:
def merge_overlapping_tags(ner_tags):
    # Sort by start index
    sorted_tags = sorted(ner_tags, key=lambda x: x[0])
    merged_tags = []
    current_tag = None

    for tag in sorted_tags:
        if current_tag is None:
            current_tag = tag
        else:
            # Check for overlap or adjacency and same entity type
            if (tag[0] <= current_tag[1] or tag[0] == current_tag[1] + 1) and tag[3] == current_tag[3]:
                # Extend the current tag if the new tag ends later
                current_tag[1] = max(current_tag[1], tag[1])
                current_tag[2] = current_tag[2] if current_tag[2] in tag[2] else current_tag[2] + ' ' + tag[2]  # Merge tokens
            else:
                merged_tags.append(current_tag)
                current_tag = tag

    if current_tag is not None:
        merged_tags.append(current_tag)

    return merged_tags




def remove_duplicate_tags(ner_tags):
    return [list(t) for t in set(tuple(tag) for tag in ner_tags)]

def adjust_spans_for_context(sentence, start_index, end_index, token):
    words = sentence.split()
    word_positions = [sentence.find(word) for word in words]

    # Adjust span for larger term if token is a substring
    for i, word in enumerate(words):
        if token in word and word != token:
            start_index = min(start_index, word_positions[i])
            end_index = max(end_index, word_positions[i] + len(word))
            token = word  # Update the token to the larger term
            break

    # Extend the span if 'assay' follows the token and the preceding word is in upper or mixed case
    for i, word in enumerate(words):
        if word.lower() == 'assay' or word.lower() == 'insertion' and i > 0 and not words[i - 1].islower():
            prev_word_start = word_positions[i - 1]
            start_index = min(start_index, prev_word_start)
            token = words[i - 1] + ' ' + token  # Update the token
            break

    return start_index, end_index, token

def process_pmcid(df, pmcid, p_texts):
    sentences_data = {}
    for _, row in df[df['pmcid'] == pmcid].iterrows():
        sentence = find_sentence_with_substring(p_texts, row['exact'])
        if sentence:
            if sentence not in sentences_data:
                sentences_data[sentence] = []

            start_index = sentence.find(row['token'])
            if start_index != -1:
                end_index = start_index + len(row['token'])
                start_index, end_index, updated_token = adjust_spans_for_context(sentence, start_index, end_index, row['token'])
                sentences_data[sentence].append([start_index, end_index, updated_token, row['ner']])

    for sentence, tags in sentences_data.items():
        sentences_data[sentence] = remove_duplicate_tags(tags)
        sentences_data[sentence] = merge_overlapping_tags(sentences_data[sentence])

    return [(pmcid, sentence, ner_tags) for sentence, ner_tags in sentences_data.items()]

final_data = []

for pmcid in tqdm(unique_pmcids):
    p_texts = get_full_text_xml(pmcid)
    if p_texts:
        processed_data = process_pmcid(annotations_df_other, pmcid, p_texts)
        final_data.extend(processed_data)

# Convert to DataFrame
final_df = pd.DataFrame(final_data, columns=['pmcid', 'sentence', 'ner'])

# Save as TSV
final_df.to_csv('../data/xxx4.tsv', sep='\t', index=False)


100%|█████████████████████████████████████████| 297/297 [03:31<00:00,  1.40it/s]


In [11]:
def merge_overlapping_tags(ner_tags):
    # Sort by start index
    sorted_tags = sorted(ner_tags, key=lambda x: x[0])
    merged_tags = []
    current_tag = None

    for tag in sorted_tags:
        if current_tag is None:
            current_tag = tag
        else:
            # Check for overlap or adjacency
            if tag[0] <= current_tag[1]:
                # Extend the current tag if the new tag ends later
                current_tag[1] = max(current_tag[1], tag[1])
                current_tag[2] = current_tag[2] + ' ' + tag[2]  # Merge tokens
                current_tag[3] = current_tag[3] + ' ' + tag[3]  # Merge entity types
            else:
                merged_tags.append(current_tag)
                current_tag = tag

    if current_tag is not None:
        merged_tags.append(current_tag)

    return merged_tags

def remove_duplicate_tags(ner_tags):
    return [list(t) for t in set(tuple(tag) for tag in ner_tags)]

def process_pmcid(df, pmcid, p_texts):
    sentences_data = {}
    for _, row in df[df['pmcid'] == pmcid].iterrows():
        sentence = find_sentence_with_substring(p_texts, row['exact'])
        if sentence:
            if sentence not in sentences_data:
                sentences_data[sentence] = []

            start_index = sentence.find(row['token'])
            if start_index != -1:
                end_index = start_index + len(row['token'])
                sentences_data[sentence].append([start_index, end_index, row['token'], row['ner']])

    for sentence, tags in sentences_data.items():
        sentences_data[sentence] = remove_duplicate_tags(tags)
        sentences_data[sentence] = merge_overlapping_tags(sentences_data[sentence])

    return [(pmcid, sentence, ner_tags) for sentence, ner_tags in sentences_data.items()]

final_data = []

for pmcid in tqdm(unique_pmcids):
    p_texts = get_full_text_xml(pmcid)
    if p_texts:
        processed_data = process_pmcid(annotations_df_other, pmcid, p_texts)
        final_data.extend(processed_data)

# Convert to DataFrame
final_df = pd.DataFrame(final_data, columns=['pmcid', 'sentence', 'ner'])

# Save as TSV
final_df.to_csv('../data/xxx2.tsv', sep='\t', index=False)


100%|█████████████████████████████████████████| 297/297 [04:00<00:00,  1.23it/s]


In [66]:
def process_pmcid(df, pmcid):
    p_texts = get_full_text_xml(pmcid)
    if p_texts is None:
        return []

    sentences_data = {}
    # Iterate over rows with the same PMCID
    for _, row in df[df['pmcid'] == pmcid].iterrows():
        sentence = find_sentence_with_substring(p_texts, row['exact'])
        if sentence:
            if sentence not in sentences_data:
                sentences_data[sentence] = []
            
            # Find the start and end indices of the token in the sentence
            start_index = sentence.find(row['token'])
            if start_index != -1:  # Token found in the sentence
                end_index = start_index + len(row['token'])
                sentences_data[sentence].append([start_index, end_index, row['token'], row['ner']])

    # Convert the dictionary to a list of tuples
    processed_data = [(pmcid, sentence, ner_tags) for sentence, ner_tags in sentences_data.items()]
    return processed_data

final_data = []

for pmcid in tqdm(unique_pmcids):
    processed_data = process_pmcid(df, pmcid)
    final_data.extend(processed_data)

# Convert to DataFrame
final_df = pd.DataFrame(final_data, columns=['pmcid', 'sentence', 'ner'])

# Save as TSV
final_df.to_csv('../data/xxx1.tsv', sep='\t', index=False)


100%|█████████████████████████████████████████| 297/297 [03:38<00:00,  1.36it/s]


In [64]:
"In plants, such barriers can either act before (pre-pollination barriers) or after pollination (post-pollination barriers)."[32:83]

'her act before (pre-pollination barriers) or after '

NameError: name 'tqdm' is not defined

In [33]:
import pandas as pd
import os
from tqdm import tqdm

def find_span(sentence, substring):
    start = sentence.find(substring)
    if start != -1:
        return [start, start + len(substring)]
    return None

annotations_df = pd.read_csv('../data/annotations_api.csv', delimiter='\t', names=['pmcid', 'exact', 'token', 'ner'])
annotations_df = annotations_df[annotations_df['ner'].isin(['Gene Ontology', 'Experimental Methods', 'Accession Numbers', 'Resources'])]

folder_path = '../data/CD_GP_DS_OG_test'
for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)

        # Initialize an empty list to store relevant rows
        relevant_rows = []

        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {file}"):
            sentence = row[0]
            entities_str = row[1]
            entities = eval(entities_str) if entities_str != 'None' and entities_str.startswith('[') else []
            updated = False

            for _, annotation in annotations_df.iterrows():
                if annotation['exact'] in sentence:
                    span = find_span(sentence, annotation['token'])
                    if span and all(isinstance(e, list) and (span[0] > e[1] or span[1] < e[0]) for e in entities):
                        entities.append([span[0], span[1], annotation['token'], annotation['ner']])
                        updated = True

            # Add the row to the relevant_rows list if it contains updated entities
            if updated:
                relevant_rows.append([sentence, str(entities)])
                print(sentence, str(entities))

        # Create a new DataFrame from the list of relevant rows
        new_df = pd.DataFrame(relevant_rows, columns=df.columns)

        # Save to a new file
        new_file_path = os.path.join(folder_path, f"new_{file}")
        new_df.to_csv(new_file_path, index=False)

print("Processing completed.")


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 8/80013 [00:06<17:20:43,  1.28it

This work aimed to assess whether GR3027 improves motor incoordination, spatial learning, and circadian rhythms of activity in rats with HE.  [[127, 131, 'rats', 'OG'], [137, 139, 'HE', 'DS'], [34, 40, 'GR3027', 'CD'], [80, 88, 'learning', 'Gene Ontology'], [94, 111, 'circadian rhythms', 'Gene Ontology']]


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 12/80013 [00:09<17:47:22,  1.25i

In both hyperammonemic and PCS rats, GR3027 restores motor coordination, spatial memory in the Morris water maze, and spatial learning in the radial maze.  [[31, 35, 'rats', 'OG'], [37, 43, 'GR3027', 'CD'], [81, 87, 'memory', 'Gene Ontology'], [126, 134, 'learning', 'Gene Ontology']]


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 13/80013 [00:10<17:29:07,  1.27i

GR3027 also partially restores circadian rhythms of ambulatory and vertical activity in PCS rats.  [[92, 96, 'rats', 'OG'], [0, 6, 'GR3027', 'CD'], [31, 48, 'circadian rhythms', 'Gene Ontology']]


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 24/80013 [00:18<16:54:14,  1.31i

Increased GABAergic tone induces motor incoordination, and extracellular GABA in cerebellum correlates with motor incoordination in rats (9). α1-Containing GABAA receptors are likely involved in the motor incoordination since benzodiazepines induce ataxia by enhancing activation of α1-containing GABAA receptors (27).  [[156, 171, 'GABAA receptors', 'GP'], [297, 312, 'GABAA receptors', 'GP'], [249, 255, 'ataxia', 'DS'], [132, 136, 'rats', 'OG'], [226, 241, 'benzodiazepines', 'CD'], [59, 72, 'extracellular', 'Gene Ontology']]


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 25/80013 [00:19<16:53:04,  1.32i

Furthermore, overactivation of GABAA receptors by the agonists diazepam and muscimol or the neurosteroids allopregnanolone and 3α,21-dihydroxy-5α-pregnan-20-one (THDOC) impairs spatial learning and memory in the Morris water maze (19, 33, 34).  [[31, 46, 'GABAA receptors', 'GP'], [63, 71, 'diazepam', 'CD'], [76, 84, 'muscimol', 'CD'], [106, 122, 'allopregnanolone', 'CD'], [185, 193, 'learning', 'Gene Ontology'], [198, 204, 'memory', 'Gene Ontology']]


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 26/80013 [00:20<16:46:53,  1.32i

GABAergic tone is increased in the cerebellum of rats with chronic hyperammonemia and HE because of increased extracellular GABA and increased levels of neurosteroids acting as positive modulators of GABAA receptors (allopregnanolone, THDOC) (12).  [[200, 215, 'GABAA receptors', 'GP'], [86, 88, 'HE', 'DS'], [49, 53, 'rats', 'OG'], [67, 81, 'hyperammonemia', 'DS'], [217, 233, 'allopregnanolone', 'CD'], [110, 123, 'extracellular', 'Gene Ontology']]


Processing clean_CD_GP_DS_OG_train.csv:   0%| | 31/80013 [00:24<17:25:12,  1.28i


KeyboardInterrupt: 

In [36]:
import pandas as pd
import os
from tqdm import tqdm

def find_span(sentence, substring):
    start = sentence.find(substring)
    if start != -1:
        return [start, start + len(substring)]
    return None

annotations_df = pd.read_csv('../data/annotations_api.csv', delimiter='\t', names=['pmcid', 'exact', 'token', 'ner'])
annotations_df = annotations_df[annotations_df['ner'].isin(['Gene Ontology', 'Experimental Methods', 'Accession Numbers', 'Resources'])]

folder_path = '../data/CD_GP_DS_OG_test'
for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)

        # Open a new file for writing
        new_file_path = os.path.join(folder_path, f"new_{file}")
        with open(new_file_path, 'w') as new_file:
            # Write headers
            new_file.write(','.join(df.columns) + '\n')

            for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {file}"):
                sentence = row[0]
                entities_str = row[1]
                entities = eval(entities_str) if entities_str != 'None' and entities_str.startswith('[') else []
                updated = False

                for _, annotation in annotations_df.iterrows():
                    if annotation['exact'] in sentence:
                        span = find_span(sentence, annotation['token'])
                        if span and all(isinstance(e, list) and (span[0] > e[1] or span[1] < e[0]) for e in entities):
                            entities.append([span[0], span[1], annotation['token'], annotation['ner']])
                            updated = True

                # Write the updated row to the new file
                if updated:
                    new_file.write(f"{sentence},{str(entities)}\n")

print("Processing completed.")


Processing clean_CD_GP_DS_OG_train.csv:   1%| | 515/80013 [06:30<16:43:24,  1.32


KeyboardInterrupt: 

In [40]:
import requests
from bs4 import BeautifulSoup

# URL to request
url = "https://www.ebi.ac.uk/europepmc/webservices/rest/PMC4792959/fullTextXML"

# Make the request
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'xml')
    
    # Find all <p> tags
    p_tags = soup.find_all('p')

    # Extract text from each <p> tag
    p_texts = [tag.get_text() for tag in p_tags]
else:
    p_texts = f"Failed to retrieve data: Status code {response.status_code}"

p_texts[:5]  # Display first 5 paragraphs as an example


['Present address: Boyce-Thompson Institute for Plant Research, 533 Tower Road, Ithaca, New York 14853, USA',
 'Present address: Carnegie Institution for Science, Department of Plant Biology, 260 Panama Street, Stanford, California 94305, USA',
 'Species-specific gamete recognition is a key premise to ensure reproductive success and the maintenance of species boundaries. During plant pollen tube (PT) reception, gametophyte interactions likely allow the species-specific recognition of signals from the PT (male gametophyte) by the embryo sac (female gametophyte), resulting in PT rupture, sperm release, and double fertilization. This process is impaired in interspecific crosses between Arabidopsis thaliana and related species, leading to PT overgrowth and a failure to deliver the sperm cells. Here we show that ARTUMES (ARU) specifically regulates the recognition of interspecific PTs in A. thaliana. ARU, identified in a genome-wide association study (GWAS), exclusively influences interspec

In [47]:
import re
# Function to find the sentence containing the substring
def find_sentence_with_substring(string_list, substring):
    for text in string_list:
        # Split the text into sentences
        sentences = re.split(r'(?<=[.!?])\s+', text)
        for sentence in sentences:
            if substring in sentence:
                return sentence
    return None

# Find the sentence


In [49]:
substring_to_check =  'vule cDNA, the UBC9 assay was performed as an' 

found_sentence = find_sentence_with_substring(p_texts, substring_to_check)
found_sentence

'For digital droplet PCR on ovule cDNA, the UBC9 assay was performed as an EvaGreen assay, whereas ARU transcripts were detected using a gene-specific probe (5′-FAM- TACTGCACAAAGGTTG -MGB-3′).'

In [43]:
found = any('ubstrate-specific N-glycosylation of proteins in yeas' in s for s in p_texts)
found

True