In [None]:
# !python -m ipykernel install --user --name=falconframes_env --display-name "Python (falconframes_env)"

In [1]:
import os

print("Files in /home/stirunag/work/github/CAPITAL/daily_pipeline/lib:")
print(os.listdir("/home/stirunag/work/github/CAPITAL/daily_pipeline/lib"))


Files in /home/stirunag/work/github/CAPITAL/daily_pipeline/lib:
['__init__.py', 'python_scripts']


In [2]:
import json
import os
import sys
import gzip
from collections import defaultdict, Counter
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForTokenClassification
import argparse
from tqdm import tqdm

import sys
from pathlib import Path

# Add the parent directory to the sys.path
sys.path.append(str(Path('/home/stirunag/work/github/CAPITAL/daily_pipeline/lib/python_scripts/').resolve()))

# Import the functions
from entity_linker import map_to_url, map_terms, map_terms_batch, map_terms_reverse, get_exact_match, get_embedding_match, clean_term, clean_term_EM


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


/home/stirunag/work/github/CAPITAL/normalisation/en_floret_model
/home/stirunag/work/github/CAPITAL/normalisation/dictionary/
Loading spaCy model for entity linking...




SpaCy model loaded successfully.
Loaded data for CD
Loaded data for OG
Loaded data for DS
Loaded data for GP
Loaded data for GO
Loaded data for EM


In [3]:
# Directly assign the paths
input_path = "/home/stirunag/work/github/CAPITAL/daily_pipeline/notebooks/data/patch_2024_10_28_0.json.gz"  # Replace with your actual input file path
output_path = "/home/stirunag/work/github/CAPITAL/daily_pipeline/results"  # Replace with your actual output directory path
model_path_quantised = "/home/stirunag/work/github/CAPITAL/model"  # Replace with your actual model directory path

# Check that input is a file
if not os.path.isfile(input_path):
    raise ValueError(f"Expected a file for input, but got: {input_path}")

# Check if output directory exists; if not, create it
if not os.path.isdir(output_path):
    print(f"Output directory '{output_path}' does not exist. Creating it.")
    os.makedirs(output_path, exist_ok=True)

# Ensure 'no_matches' directory exists within the output directory
no_match_dir = os.path.join(output_path, "no_matches")
os.makedirs(no_match_dir, exist_ok=True)
no_match_file_path = os.path.join(no_match_dir, "patch_no_match.json")

# Initialize NER model using the provided model path
print("Loading NER model and tokenizer from " + model_path_quantised)
model_quantized = ORTModelForTokenClassification.from_pretrained(
    model_path_quantised, file_name="model_quantized.onnx")
tokenizer_quantized = AutoTokenizer.from_pretrained(
    model_path_quantised,
    model_max_length=512,
    batch_size=4,
    truncation=True
)
ner_quantized = pipeline(
    "token-classification",
    model=model_quantized,
    tokenizer=tokenizer_quantized,
    aggregation_strategy="first"
)
print("NER model and tokenizer loaded successfully.")

Loading NER model and tokenizer from /home/stirunag/work/github/CAPITAL/model
NER model and tokenizer loaded successfully.


In [4]:
SECTIONS_MAP = {
    "TITLE": "Title",
    "ABSTRACT": "Abstract",
    "INTRO": "Introduction",
    "METHODS": "Methods",
    "RESULTS": "Results",
    "DISCUSS": "Discussion",
    "CONCL": "Conclusion",
    "CASE": "Case study",
    "ACK_FUND": "Acknowledgments",
    "AUTH_CONT": "Author Contributions",
    "COMP_INT": "Competing Interests",
    "ABBR": "Abbreviations",
    "SUPPL": "Supplementary material",
    "REF": "References",
    "TABLE": "Table",
    "FIGURE": "Figure",
    "DATA SHARING STATEMENT": "Data Availability",
    "APPENDIX": "Appendix",
    "OTHER": "Other"
}


In [5]:
def count_lines_in_gzip(file_path):
    """Counts the lines in a gzipped file."""
    with gzip.open(file_path, "rt") as f:
        return sum(1 for _ in f)

In [6]:
PROVIDER = "europepmc"

# Mapping from abbreviation to full form
ENTITY_TYPE_MAP = {
    "EM": "methods",
    "DS": "disease",
    "GP": "gene_protein",
    "GO": "go_term",
    "CD": "chemical",
    "OG": "organism"
    # Add other mappings as necessary
}


# Helper Functions
def map_entity_type(abbrev):
    """Map abbreviation to full form."""
    return ENTITY_TYPE_MAP.get(abbrev, abbrev.lower())


def get_word_position(sent_id, sentence_text, char_start):
    """
    Calculate the word position based on character start index.
    Returns a string in the format 'sent_id.word_position'.
    """
    words = sentence_text.split()
    current_char = 0
    for idx, word in enumerate(words):
        word_start = sentence_text.find(word, current_char)
        if word_start == char_start:
            return f"{sent_id}.{idx + 1}"
        current_char = word_start + len(word)
    return f"{sent_id}.0"  # Return 0 if position not found


def get_prefix_postfix(sentence_text, char_start, char_end, num_words=3, max_chars=30):
    """
    Extract prefix and postfix based on word positions with constraints:
    - Returns up to `num_words` before and after the target term.
    - Ensures each prefix and postfix does not exceed `max_chars`.
    """
    words = sentence_text.split()
    word_positions = [sentence_text.find(word) for word in words]

    # Identify the word index for the starting character of the entity
    word_index = None
    for idx, start in enumerate(word_positions):
        if start == char_start:
            word_index = idx
            break

    prefix, postfix = "", ""
    if word_index is not None:
        # Extract prefix words up to `num_words` or `max_chars`
        prefix_words = words[max(0, word_index - num_words):word_index]
        prefix = ' '.join(prefix_words)
        if len(prefix) > max_chars:
            prefix = prefix[-max_chars:]  # Truncate to the last `max_chars` characters

        # Extract postfix words up to `num_words` or `max_chars`
        postfix_words = words[word_index + 1:word_index + 1 + num_words]
        postfix = ' '.join(postfix_words)
        if len(postfix) > max_chars:
            postfix = postfix[:max_chars]  # Truncate to the first `max_chars` characters

    return prefix, postfix



In [7]:
def batch_annotate_sentences(sentences, section):
    batched_text = [s["text"] for s in sentences]
    ner_results = ner_quantized(batched_text)

    annotations = []

    for i, sentence_entities in enumerate(ner_results):
        sentence_id = sentences[i]["sent_id"]
        sentence_text = sentences[i]["text"]
        for entity in sentence_entities:
            term = sentence_text[entity['start']:entity['end']]
            position = get_word_position(sentence_id, sentence_text, entity['start'])
            prefix, postfix = get_prefix_postfix(sentence_text, entity['start'], entity['end'])
            full_entity_type = entity["entity_group"]

            annotations.append({
                "type": full_entity_type,
                "position": position,
                "prefix": prefix,
                "exact": term,
                "section": section,
                "postfix": postfix
            })
    return annotations


In [35]:
# Mapping from abbreviation to full form
ENTITY_TYPE_MAP = {
    "EM": "methods",
    "DS": "disease",
    "GP": "gene_protein",
    "GO": "go_term",
    "CD": "chemical",
    "OG": "organism"
    # Add other mappings as necessary
}


# Helper Functions
def map_entity_type(abbrev):
    """Map abbreviation to full form."""
    return ENTITY_TYPE_MAP.get(abbrev, abbrev.lower())

In [37]:
def generate_tags(all_annotations):
    """
    Generate tags for each annotation in all_annotations using map_terms_reverse and map_to_url.
    Each annotation will have 'name' and 'uri' fields in the 'tags' list.
    """
    output_annotations = []
    
    # Group entities by type for map_terms_reverse
    entities_by_type = defaultdict(set)
    for annotation in all_annotations:
        entities_by_type[annotation['type']].add(annotation['exact'])
    
    # Process each entity type with map_terms_reverse to get mapped terms and URLs
    mapped_results = {}
    for entity_type, entities in entities_by_type.items():
        mapped_results[entity_type] = map_terms_reverse(entities, entity_type)
    
    # Generate tags for each annotation
    for annotation in all_annotations:
        entity_type = annotation['type']
        term = annotation['exact']
        
        # Retrieve grounded code and term from mapped results
        if term in mapped_results[entity_type]:
            grounded_code, grounded_term = mapped_results[entity_type][term]
            uri = map_to_url(entity_type, grounded_code)  # Generate URI based on entity group and code
            
            # Add the annotation with tags
            output_annotations.append({
                "type": map_entity_type(entity_type),
                "position": annotation["position"],
                "prefix": annotation["prefix"],
                "exact": term,
                "section": annotation["section"],
                "postfix": annotation["postfix"],
                "tags": [
                    {
                        "name": grounded_term,
                        "uri": uri
                    }
                ]
            })
        else:
            # In case there’s no mapping found, skip or add with no URI
            output_annotations.append({
                "type": map_entity_type(entity_type),
                "position": annotation["position"],
                "prefix": annotation["prefix"],
                "exact": term,
                "section": annotation["section"],
                "postfix": annotation["postfix"],
                "tags": [
                    {
                        "name": "#",
                        "uri": "#"
                    }
                ]
            })

    return output_annotations


In [38]:
input_filename = os.path.basename(input_path).replace(".json.gz", "")
output_file_OA = os.path.join(output_path, f"{input_filename}_OA.json")
output_file_NOA = os.path.join(output_path, f"{input_filename}_NOA.json")

# Similarly, create no match filenames
no_match_file_OA = os.path.join(output_path, "no_matches", f"{input_filename}_OA_no_match.json")
no_match_file_NOA = os.path.join(output_path, "no_matches", f"{input_filename}_NOA_no_match.json")

# Count the total number of lines for the progress bar
total_lines = count_lines_in_gzip(input_path)

print(total_lines)

909


In [10]:
with gzip.open(input_path, "rt") as infile:
    for line in tqdm(infile, desc="Processing lines", unit="line", total=total_lines):
        article_data = json.loads(line)
        open_status = article_data.get("open_status", "")
        if open_status == "O":
            break
    

Processing lines:   1%|          | 10/909 [00:00<00:00, 3371.36line/s]


In [39]:
article_data

{'article_ids': {'pmcid': '11380737', 'doi': '10.34172/jrhs.2024.159'},
 'open_status': 'O',
 'article_type': 'research-article',
 'keywords': ['Dietary inflammatory index',
  'Diet',
  'Head and neck neoplasms',
  'Case-control studies'],
 'sections': {'TITLE': [{'text': 'Dietary Inflammatory Index and Head and Neck Cancer: A Multicenter Case-Control Study in Iran',
    'sent_id': 1}],
  'ABSTRACT': [{'text': 'Background: The inflammatory potential of diet may affect carcinogenesis.',
    'sent_id': 2},
   {'text': 'This study aimed to determine the association between dietary inflammatory index (DII) and the risk of head and neck cancer (HNC), as well as the interaction between DII and cigarette smoking in HNC development within the Iranian population.',
    'sent_id': 3},
   {'text': 'Study Design: This is a case-control study.', 'sent_id': 4},
   {'text': 'Methods: In this multicenter case-control study, participants’ dietary intake was assessed using a validated 130-item food freq

In [40]:
pmcid = article_data.get("article_ids", {}).get("pmcid")
ft_id = article_data.get("article_ids", {}).get("archive") or article_data.get("article_ids", {}).get("manuscript")

print(pmcid, ft_id)

11380737 None


In [41]:
all_annotations = []
for section_key, sentences in article_data.get("sections", {}).items():
        if section_key == "REF":
            continue  # Skip processing this section
        section = SECTIONS_MAP.get(section_key, "Other")
        print(section)
        batch_annotations = batch_annotate_sentences(sentences, section)
        if not batch_annotations:
            continue
      
        all_annotations.extend(batch_annotations)
        

        



Title
Abstract
Introduction
Methods
Results
Discussion
Conclusion
Acknowledgments
Competing Interests
Table
Other
Other


In [42]:
all_linked_annotations = generate_tags(all_annotations)
all_linked_annotations

[{'type': 'disease',
  'position': '1.5',
  'prefix': 'Inflammatory Index and',
  'exact': 'Head and Neck Cancer',
  'section': 'Title',
  'postfix': 'and Neck Cancer:',
  'tags': [{'name': 'malignant tumour of head andor neck',
    'uri': 'http://linkedlifedata.com/resource/umls-concept/C0278996'}]},
 {'type': 'disease',
  'position': '3.17',
  'prefix': 'the risk of',
  'exact': 'head and neck cancer',
  'section': 'Abstract',
  'postfix': 'and neck cancer',
  'tags': [{'name': 'malignant tumour of head andor neck',
    'uri': 'http://linkedlifedata.com/resource/umls-concept/C0278996'}]},
 {'type': 'disease',
  'position': '3.0',
  'prefix': 'cigarette smoking in',
  'exact': 'HNC',
  'section': 'Abstract',
  'postfix': 'development within the',
  'tags': [{'name': '#',
    'uri': 'http://linkedlifedata.com/resource/umls-concept/#'}]},
 {'type': 'disease',
  'position': '3.33',
  'prefix': '',
  'exact': 'HNC',
  'section': 'Abstract',
  'postfix': '',
  'tags': [{'name': '#',
    'u

In [43]:
print(pmcid)

11380737


In [45]:
from collections import OrderedDict

def format_output_annotations(all_linked_annotations_, pmcid, ft_id):
    """
    Formats output annotations into two JSON structures:
    - 'match_json' for matched annotations
    - 'non_match_json' for unmatched annotations
    """
    match_annotations = []
    non_match_annotations = []

    # Separate annotations based on tags
    for annotation in all_linked_annotations_:
        # Check if the annotation is unmatched (name and uri are '#')
        if annotation["tags"][0]["name"] == "#" and annotation["tags"][0]["uri"].endswith("#"):
            non_match_annotations.append(annotation)
        else:
            match_annotations.append(annotation)

    # Construct final JSON outputs
    match_json = OrderedDict()
    non_match_json = OrderedDict()

    # Add pmcid or ft_id to both match and non-match JSONs
    if pmcid:
        match_json["pmcid"] = pmcid
        non_match_json["pmcid"] = pmcid
    elif ft_id:
        match_json["ft_id"] = ft_id
        non_match_json["ft_id"] = ft_id

    # Add provider and anns fields to each JSON
    match_json["provider"] = "europepmc"
    match_json["anns"] = match_annotations

    non_match_json["provider"] = "europepmc"
    non_match_json["anns"] = non_match_annotations

    return match_json, non_match_json


In [46]:
match_json, non_match_json = format_output_annotations(all_linked_annotations, pmcid, ft_id)



In [47]:
non_match_json

OrderedDict([('pmcid', '11380737'),
             ('provider', 'europepmc'),
             ('anns',
              [{'type': 'disease',
                'position': '3.0',
                'prefix': 'cigarette smoking in',
                'exact': 'HNC',
                'section': 'Abstract',
                'postfix': 'development within the',
                'tags': [{'name': '#',
                  'uri': 'http://linkedlifedata.com/resource/umls-concept/#'}]},
               {'type': 'disease',
                'position': '3.33',
                'prefix': '',
                'exact': 'HNC',
                'section': 'Abstract',
                'postfix': '',
                'tags': [{'name': '#',
                  'uri': 'http://linkedlifedata.com/resource/umls-concept/#'}]},
               {'type': 'disease',
                'position': '7.11',
                'prefix': 'ratios (ORs) for',
                'exact': 'HNC',
                'section': 'Abstract',
                'postfix': 

In [48]:
from collections import defaultdict, OrderedDict

def modify_restricted_json(json_data, open_status):
    """
    Modifies the JSON structure for restricted access:
    - Removes 'prefix' and 'postfix' fields.
    - Adds 'frequency' field to count occurrences of each unique 'exact' term.
    """
    if open_status in ["OA", "O"]:
        return json_data  # No modification if open access

    # Count frequency of each 'exact' term
    frequency_counter = defaultdict(int)
    for annotation in json_data["anns"]:
        frequency_counter[annotation["exact"]] += 1

    # Modify each annotation
    restricted_annotations = []
    for annotation in json_data["anns"]:
        # Remove 'prefix' and 'postfix' fields, add 'frequency'
        restricted_annotation = {
            "exact": annotation["exact"],
            "tags": annotation["tags"],
            "type": annotation["type"],
            "section": annotation["section"],
            "provider": json_data["provider"],
            "frequency": frequency_counter[annotation["exact"]]
        }
        restricted_annotations.append(restricted_annotation)

    # Build the modified JSON structure
    modified_json = OrderedDict()
    modified_json["provider"] = json_data["provider"]
    modified_json["anns"] = restricted_annotations

    # Preserve 'pmcid' or 'ft_id' in the modified JSON
    if "pmcid" in json_data:
        modified_json["pmcid"] = json_data["pmcid"]
    elif "ft_id" in json_data:
        modified_json["ft_id"] = json_data["ft_id"]

    return modified_json


In [55]:
xx = modify_restricted_json(non_match_json, open_status='OA')

In [56]:
xx

OrderedDict([('pmcid', '11380737'),
             ('provider', 'europepmc'),
             ('anns',
              [{'type': 'disease',
                'position': '3.0',
                'prefix': 'cigarette smoking in',
                'exact': 'HNC',
                'section': 'Abstract',
                'postfix': 'development within the',
                'tags': [{'name': '#',
                  'uri': 'http://linkedlifedata.com/resource/umls-concept/#'}]},
               {'type': 'disease',
                'position': '3.33',
                'prefix': '',
                'exact': 'HNC',
                'section': 'Abstract',
                'postfix': '',
                'tags': [{'name': '#',
                  'uri': 'http://linkedlifedata.com/resource/umls-concept/#'}]},
               {'type': 'disease',
                'position': '7.11',
                'prefix': 'ratios (ORs) for',
                'exact': 'HNC',
                'section': 'Abstract',
                'postfix': 