In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/judecrener-p/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/judecrener-p/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def extract_pubmed_abstracts(file_path):
    # Create lists to store PubMed IDs and abstracts
    pubmed_ids = []
    abstracts = []
    ann = []

    # Open the file and read through it line by line
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            pubmed_id = 0
            # Check if the line contains an abstract
            if '|a|' in line:
                parts = line.split('|a|')
                if len(parts) > 1:
                    # Extract PubMed ID from the part before '|a|'
                    pubmed_id = parts[0].split('|')[0]
                    # The abstract is the part after '|a|'
                    abstract = parts[1].strip()

                    pubmed_ids.append(pubmed_id)
                    abstracts.append(abstract)
          
            if '|ann|' in line: 
              parts = line.split('|ann|')
              id = parts[0].strip()
              annotations = parts[1].strip()
              ann.append({'id': id, 'abbr': annotations.split('|')[0], 'FS': annotations.split('|')[1]})

    df = pd.DataFrame({
        'PubMedID': pubmed_ids,
        'Abstract': abstracts
    })

    return df, pd.DataFrame(ann)

In [3]:
file_path = '../data/ncbi/NCBItestset_corpus_WSD.txt'
df_abstracts, df_annotation = extract_pubmed_abstracts(file_path)

In [4]:
def words_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in ['.', ',', '(', ')', '[', ']',':','-']]
    return filtered_tokens

def add_unique(dictionary, list_of_dictionaries):
    if dictionary not in list_of_dictionaries:
        list_of_dictionaries.append(dictionary)
        
non_ambiguous_abbr = ['ADNDI','B-NHL','BPAD','C5D','C6D','EDMD','FNDI','FRDA','HNPCC','NF1','PKU','T-PLL','VLCAD','XLDCM']

In [5]:
df_abstracts

Unnamed: 0,PubMedID,Abstract
0,9949209,Abnormal hepatic copper accumulation is recogn...
1,9950360,BACKGROUND/AIMS The development of colorectal...
2,9634518,Phenylketonuria (PKU) and mild hyperphenylalan...
3,9563950,Myotonic dystrophy (DM) is caused by a CTG exp...
4,9674903,Maternal uniparental disomy (UPD) for chromoso...
...,...,...
56,9590284,Imprinting in the 15q11-q13 region involves an...
57,9843038,Hereditary coproporphyria (HCP) is an autosoma...
58,9529364,"Constitutional mutations of the WT1 gene, enco..."
59,9391889,We report a rare case of paternally transmitte...


In [12]:
final_sentences = []
for id, abstract in df_abstracts.iterrows():
  pmid = abstract['PubMedID']
  text = abstract['Abstract']
  sentences = sent_tokenize(text)
  annotations = df_annotation[df_annotation['id']==pmid]
  for id, ann in annotations.iterrows():
    abbr = ann['abbr']
    FS = ann['FS']
    for sentence in sentences:
      if abbr.strip() not in non_ambiguous_abbr:
        if f' {abbr} ' or f'({abbr})' in sentence:
          if FS.lower() in sentence.lower():
            masked_sentence = re.sub(FS, '', sentence, flags=re.IGNORECASE)
            masked_sentence = masked_sentence.replace(f'({abbr})', abbr)
            data = {'pmid': pmid, 'sentence': sentence, 'masked_sentence':masked_sentence, 'abbr': abbr, 'FS': FS.lower()}
            add_unique(data,final_sentences)
          elif f' {abbr} ' in sentence:
            masked_sentence = sentence
            data = {'pmid': pmid, 'sentence': sentence, 'masked_sentence':masked_sentence, 'abbr': abbr, 'FS': FS.lower()}
            add_unique(data,final_sentences)

sentences_df = pd.DataFrame(final_sentences)
sentences_df.to_csv('../data/ncbi/WSD_NCBI_sentences.csv', index=False)

In [13]:
sentences_df

Unnamed: 0,pmid,sentence,masked_sentence,abbr,FS
0,9949209,The major cause of hepatic copper accumulation...,The major cause of hepatic copper accumulation...,WD,wilson disease
1,9949209,We examined whether the WD gene ATP7B was also...,We examined whether the WD gene ATP7B was also...,WD,wilson disease
2,9949209,"However, BAC clones containing ATP7B and C0410...","However, BAC clones containing ATP7B and C0410...",WD,wilson disease
3,9949209,By investigating the common autosomal recessiv...,By investigating the common autosomal recessiv...,CT,copper toxicosis
4,9949209,We examined whether the WD gene ATP7B was also...,We examined whether the WD gene ATP7B was also...,CT,copper toxicosis
...,...,...,...,...,...
195,9529364,No WT1 mutations were detected in the six othe...,No WT1 mutations were detected in the six othe...,IDMS,isolated diffuse mesangial sclerosis
196,9391889,We report a rare case of paternally transmitte...,We report a rare case of paternally transmitte...,DM,myotonic dystrophy
197,9391889,Only six other cases of paternal transmission ...,Only six other cases of paternal transmission ...,DM,myotonic dystrophy
198,9391889,Decreased fertility of males with adult onset ...,Decreased fertility of males with adult onset ...,DM,myotonic dystrophy
