<a href="https://colab.research.google.com/github/Laboratorio-de-Analise-de-Dados/data_extraction_model/blob/main/xml_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create embeddings from XML scientific paper text




In [None]:
# @title
## Environment set up

import os

if "COLAB_GPU" in os.environ:
  !pip install -U torch #2.1.1+
  !pip install tqdm # Progress bar
  !pip install sentence-transformers # Embedding models
  !pip install flash-attn --no-build-isolation # For faster attention mechanism
  !pip install accelerate
  !pip install bitsandbytes # Quantizing models
  !pip install tqdm
  !pip install --upgrade torch torchvision transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
Collecting flash-attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24
  Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c

Extract and process specific information from an XML document

In [None]:
import xml.etree.ElementTree as ET
import requests
from spacy.lang.en import English
from tqdm.auto import tqdm


## XML Extraction
def extract_article_info(xml_url, timeout=30):
    """Download an XML article and split each of its passages into sentences.

    The XML is expected to contain a <document> element whose <passage>
    children carry an <infon key="section_type"> element and a <text> element.

    Args:
        xml_url: URL the XML document is downloaded from.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        A dict with a single key, 'passages': a list with one dict per passage
        holding 'section_type', 'text', 'sentences' and the size statistics
        'sentence_count', 'sentence_count_spacy', 'char_count', 'word_count'
        and 'token_count'.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        ValueError: If the XML contains no <document> element.
    """
    # Request the XML article; the timeout keeps the cell from hanging forever
    response = requests.get(xml_url, timeout=timeout)
    # Stop on 4xx/5xx instead of trying to parse an error page as XML
    response.raise_for_status()

    # Obtain the root element of the XML
    root = ET.fromstring(response.content)

    # Locate the element that holds the passages
    document = root.find('.//document')
    if document is None:
        raise ValueError(f"No <document> element found in the XML from {xml_url}")

    # Initialize the NLP pipeline used for sentence splitting
    nlp = English()
    nlp.add_pipe('sentencizer')

    article_info = {'passages': []}

    # Iterate through the passages that were found
    for passage in tqdm(document.findall('.//passage')):

        # Section type of the passage; a missing or empty element becomes "Unknown"
        section_type = passage.findtext('.//infon[@key="section_type"]') or "Unknown"

        # Passage text; findtext yields "" for a missing or empty <text> element,
        # so the sentence splitter below always receives a string
        text = passage.findtext('text', default="")

        # Sentence split
        sentences = [str(sentence) for sentence in nlp(text).sents]

        # Add information about the text
        article_info['passages'].append({
            'section_type': section_type,
            'text': text,
            'sentences': sentences,
            'sentence_count': len(text.split('. ')),  # naive count, kept for comparison with spaCy
            'sentence_count_spacy': len(sentences),
            'char_count': len(text),
            'word_count': len(text.split()),
            'token_count': len(text) / 4  # rough estimate: 1 token = 4 chars
        })

    # Return information dict
    return article_info

## Address of the XML article to process
xml_url = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/26539989/unicode"

## Download the article and split its passages into sentences
article_info = extract_article_info(xml_url=xml_url)

# Keep a direct reference to the list of passage dicts for the next cells
passages_list = article_info["passages"]

# Emit an empty line after the progress bar
print()

  0%|          | 0/160 [00:00<?, ?it/s]




In [None]:
# Describe our data
import pandas as pd
# One row per passage; summarize the numeric columns, rounded to two decimals
df = pd.DataFrame(data=passages_list)
df.describe().round(decimals=2)

Unnamed: 0,sentence_count,sentence_count_spacy,char_count,word_count,token_count
count,160.0,160.0,160.0,160.0,160.0
mean,2.38,2.22,244.08,36.4,61.02
std,3.8,3.13,484.07,73.84,121.02
min,1.0,1.0,2.0,1.0,0.5
25%,1.0,1.0,20.75,2.0,5.19
50%,1.0,1.0,69.5,9.0,17.38
75%,1.0,1.0,135.75,17.0,33.94
max,24.0,19.0,2888.0,425.0,722.0


Break long passages down into smaller, more manageable chunks of sentences.

In [None]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 4

def split_list(input_list: list, slice_size: int) -> list[list[str]]:
  '''
  Split a list into consecutive sublists of at most slice_size items.

  The last sublist holds whatever remains, so a list of 9 sentences with
  slice_size=5 becomes two sublists holding 5 and 4 sentences.
  An empty input list gives an empty result.

  Raises ValueError if slice_size is smaller than 1: a negative size would
  otherwise silently return an empty list and drop every item.
  '''
  if slice_size < 1:
    raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
  return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(passages_list):
    # Group this passage's sentences into chunks of num_sentence_chunk_size sentences
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    # Store the number of chunks generated for the current item
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/160 [00:00<?, ?it/s]

Join the sentences of each chunk into a single string and store every chunk as its own item in a new list.

In [None]:
import re

# Start from an empty list on every run of this cell. Without this line the cell
# raises NameError on a fresh kernel, and re-running it appends every chunk again.
passages_chunks = []

# Turn each sentence chunk into its own record
for item in tqdm(passages_list):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["section_type"] = item["section_type"]

        # Join the sentences into one paragraph-like string. A space is used as the
        # separator so that sentences ending in "?", "!" or followed by a lowercase
        # letter or digit are not glued to the next one; doubled spaces are collapsed.
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()

        # Ensures there is a space after each period followed by a capital letter
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Calculate and store statistics about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4  # rough estimate: 1 token = 4 chars
        passages_chunks.append(chunk_dict)

len(passages_chunks)

  0%|          | 0/160 [00:00<?, ?it/s]

788

Create a DataFrame from the processed text chunks, filtering them based on a minimum length threshold.


In [None]:
df = pd.DataFrame(passages_chunks)

# Minimum token length threshold
min_token_length = 20

# Rows whose estimated token count is less than or equal to min_token_length
short_chunks = df[df["chunk_token_count"] <= min_token_length]

# Print a random handful of them to see what the filter would discard.
# The sample size is capped because sample(5) raises ValueError when fewer than 5 rows match.
for row in short_chunks.sample(n=min(5, len(short_chunks))).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')


Chunk token count: 8.0 | Text: *p or q value is less than 0.05.
Chunk token count: 19.0 | Text: Gut microbiota from twins discordant for obesity modulate metabolism in mice
Chunk token count: 1.0 | Text: ConA
Chunk token count: 7.0 | Text: Yakult Intestinal Flora-SCAN
Chunk token count: 0.75 | Text: LBP


In [None]:
# Keep only the chunks above the minimum token count, as a list of per-chunk dicts
chunks_over_min_token_len = df.loc[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
# Show the first three records
chunks_over_min_token_len[:3]

[{'section_type': 'TITLE',
  'sentence_chunk': 'Intestinal Dysbiosis and Lowered Serum Lipopolysaccharide-Binding Protein in Parkinson’s Disease',
  'chunk_char_count': 96,
  'chunk_word_count': 10,
  'chunk_token_count': 24.0},
 {'section_type': 'ABSTRACT',
  'sentence_chunk': 'The intestine is one of the first affected organs in Parkinson’s disease (PD). PD subjects show abnormal staining for Escherichia coli and α-synuclein in the colon.',
  'chunk_char_count': 164,
  'chunk_word_count': 26,
  'chunk_token_count': 41.0},
 {'section_type': 'ABSTRACT',
  'sentence_chunk': 'We recruited 52 PD patients and 36 healthy cohabitants. We measured serum markers and quantified the numbers of 19 fecal bacterial groups/genera/species by quantitative RT-PCR of 16S or 23S rRNA. Although the six most predominant bacterial groups/genera/species covered on average 71.3% of total intestinal bacteria, our analysis was not comprehensive compared to metagenome analysis or 16S rRNA amplicon sequencing.',


The model is loaded and set to use GPU acceleration. Individual text chunks are encoded into embeddings and added to each item in the list. Then the list of text chunks is encoded in batches for efficiency, and the results are stored as PyTorch tensors.

In [None]:
from sentence_transformers import SentenceTransformer

# Imported here so this cell stays runnable on its own; torch is already a dependency of sentence-transformers
import torch

# Use the GPU when one is available, otherwise fall back to the CPU instead of failing
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the SentenceTransformer model with the specified model and device
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

# Extract 'sentence_chunk' into a list
text_chunks = [item["sentence_chunk"] for item in chunks_over_min_token_len]

# Encode the list of text chunks in batch mode, with conversion to tensor format.
# All chunks are encoded once here instead of once per item and then again in batch.
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)

# Attach each chunk's embedding to its item as a NumPy array on the CPU;
# rows of the batch result are in the same order as text_chunks
for item, embedding in zip(chunks_over_min_token_len, text_chunk_embeddings.cpu().numpy()):
    item["embedding"] = embedding

text_chunk_embeddings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  0%|          | 0/160 [00:00<?, ?it/s]

tensor([[ 0.0236,  0.0899,  0.0079,  ..., -0.0444, -0.0072, -0.0733],
        [-0.0079,  0.0220, -0.0023,  ...,  0.0362, -0.0394, -0.0223],
        [ 0.0186,  0.0857,  0.0125,  ..., -0.0258, -0.0122, -0.0492],
        ...,
        [ 0.0284,  0.0255,  0.0335,  ..., -0.0457,  0.0161, -0.0260],
        [-0.0419,  0.0931, -0.0061,  ..., -0.0400, -0.0362, -0.0682],
        [-0.0328,  0.1031, -0.0127,  ..., -0.0322, -0.0601, -0.0459]],
       device='cuda:0')

In [None]:
# Put the chunk records (text, statistics and embedding) into a DataFrame
text_chunks_and_embeddings_df = pd.DataFrame(data=chunks_over_min_token_len)

# The article id is the second-to-last path segment of the URL; it names the output file
pubmed_id = xml_url.rsplit("/", 2)[-2]
embeddings_df_save_path = f"{pubmed_id}_df.csv"

# Write the DataFrame to CSV without the index column
text_chunks_and_embeddings_df.to_csv(path_or_buf=embeddings_df_save_path, index=False)

In [None]:
# Sanity check: read back the file through the same path variable it was saved with,
# rather than a hard-coded absolute Colab path that breaks for other articles or environments
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
# Show up to 5 random rows (capped so small files do not make sample() raise)
text_chunks_and_embeddings_df_load.sample(n=min(5, len(text_chunks_and_embeddings_df_load)))

Unnamed: 0,section_type,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
87,REF,Hydrogen in drinking water reduces dopaminergi...,148,16,37.0
101,REF,The role of lipopolysaccharide-binding protein...,121,16,30.25
65,DISCUSS,As the increased Lactobacillus was also observ...,165,24,41.25
45,RESULTS,"As 12 species in Clostridium produce hydrogen,...",383,59,95.75
0,TITLE,Intestinal Dysbiosis and Lowered Serum Lipopol...,96,10,24.0
