In [39]:
import json
import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict

# File paths
fulltext_dict_path = 'data/TM_pipelines_outputs/patch-28-10-2024-0.api.json'
fulltext_ml_path = '../results/fulltext/europepmc/patch-28-10-2024-0.jsonl.gz.json'


# Function to parse full-text files with filtering and type conversion
def parse_fulltext_file_with_filter(file_path, type_map=None):
    """
    Parse the full-text JSONL file and filter entities by specific types.
    Convert types and text to lowercase for deduplication.
    """
    type_map = type_map or {}  # Mapping for converting types
    valid_types = {"exp_methods", "disease", "gene_protein", "go_term", "chemical", "organism"}
    
    data = defaultdict(lambda: defaultdict(set))
    with open(file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            pmid = obj.get("pmcid", obj.get("ft_id", "Unknown")).lower()  # Normalize pmid/ft_id to lowercase
            for ann in obj.get('anns', []):
                ann_type = ann.get('type', "Unknown").lower()
                # Convert type if present in the type map
                ann_type = type_map.get(ann_type, ann_type)
                if ann_type in valid_types:
                    exact_text = ann['exact'].lower()  # Convert text to lowercase
                    data[pmid][ann_type].add(exact_text)
    return data

# Define the type mapping
type_map = {
    "gene expression": "go_term",
    "methods": "exp_methods"
}

# Parse full-text files with filtering and type conversion
fulltext_ml_data = parse_fulltext_file_with_filter(fulltext_ml_path, type_map=type_map)
fulltext_dict_data = parse_fulltext_file_with_filter(fulltext_dict_path, type_map=type_map)

# Full-text analysis using fuzzy matching for partial matches
results = []
for pmid in set(fulltext_ml_data.keys()).union(fulltext_dict_data.keys()):
    ml_ann_types = fulltext_ml_data.get(pmid, {})
    dict_ann_types = fulltext_dict_data.get(pmid, {})
    
    # Analyze for each annotation type in either file
    all_ann_types = set(ml_ann_types.keys()).union(dict_ann_types.keys())
    for ann_type in all_ann_types:
        ml_exact_texts = ml_ann_types.get(ann_type, set())
        dict_exact_texts = dict_ann_types.get(ann_type, set())
        
        # Calculate differences and overlaps
        ml_not_in_dict = ml_exact_texts - dict_exact_texts
        dict_not_in_ml = dict_exact_texts - ml_exact_texts
        partial_matches = {
            ml_text for ml_text in ml_exact_texts
            if any(fuzz.partial_ratio(ml_text, dict_text) >= 80 for dict_text in dict_exact_texts)
        }
        common_entities = ml_exact_texts & dict_exact_texts
        
        # Append the analysis result
        results.append({
            "pmid_or_ft_id": pmid,
            "ann_type": ann_type,
            "partial_match_count": len(partial_matches),
            "total_ml_count": len(ml_exact_texts),
            "total_dict_count": len(dict_exact_texts),
            "ml_not_in_dict_count": len(ml_not_in_dict),
            "dict_not_in_ml_count": len(dict_not_in_ml),
            # "common_count": len(common_entities),
            "ml_not_in_dict": "; ".join(ml_not_in_dict),  # Convert set to string for CSV
            "dict_not_in_ml": "; ".join(dict_not_in_ml),
            "partial_matches": "; ".join(partial_matches)
        })


# Create DataFrame and save to CSV
df = pd.DataFrame(results)
output_file = "../results/comparison_results_with_fuzzy_matching.csv"
df.to_csv(output_file, index=False)


In [40]:
len(fulltext_ml_data), len(fulltext_dict_data)

(983, 984)

In [41]:
df

Unnamed: 0,pmid_or_ft_id,ann_type,partial_match_count,total_ml_count,total_dict_count,ml_not_in_dict_count,dict_not_in_ml_count,ml_not_in_dict,dict_not_in_ml,partial_matches
0,11321433,chemical,8,10,7,4,1,lipoproteins; estrogen; triglyceride; sugar,insulin,cholesterol; triglyceride; triglycerides; lipo...
1,11321433,gene_protein,0,2,0,2,0,insulin; mets,,
2,11321433,go_term,2,2,3,1,2,synthesis,behaviour; estrogen synthesis,synthesis; pathogenesis
3,11321433,disease,17,24,15,12,3,hpv infections; insulin resistance; cardiovasc...,advanced cancer; infections; infection,hpv infections; hypertension; diabetes; cancer...
4,11321433,exp_methods,1,1,1,0,0,,,biopsy
...,...,...,...,...,...,...,...,...,...,...
4711,11347145,gene_protein,75,90,62,38,10,hat; transcriptional activators; polymerase; r...,nucleolin; pol ii; histone acetyltransferase p...,dna helicase; topoisomerases; rnase a; exo1; s...
4712,11347145,go_term,20,24,21,6,3,binding; micronuclei; cytoplasmic; reverse tra...,sites of double-strand breaks; cellular homeos...,cytoplasm; replication fork; envelope; membran...
4713,11347145,disease,13,14,16,2,4,with oculomotor apraxia type; blood,multiple cancers; tumorigenesis; oculomotor ap...,myelodysplastic syndrome; ovarian cancer; acut...
4714,11347145,exp_methods,16,17,16,7,6,immunofluorescence; proximity; pla; assays; pr...,histone acetylation; recombination; proximity ...,immunofluorescence; assay; proximity; assays; ...


In [45]:
# File paths for abstract dictionaries and machine learning outputs
abs_dict_path = 'data/TM_pipelines_outputs/patch-28-10-2024-0_abs.api.json'
abs_ml_path = '../results/abstracts/europepmc/patch-28-10-2024-0.json'

# Function to parse abstract files
def parse_abstract_file_with_filter(file_path, type_map=None):
    """
    Parse the abstract JSONL file and filter entities by specific types.
    Convert types and text to lowercase for deduplication.
    """
    type_map = type_map or {}  # Mapping for converting types
    valid_types = {"exp_methods", "disease", "gene_protein", "go_term", "chemical", "organism"}
    
    data = defaultdict(lambda: defaultdict(set))
    with open(file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            ext_id = obj.get("ext_id", "Unknown")
            for ann in obj.get('anns', []):
                ann_type = ann.get('type', "Unknown").lower()
                # Convert type if present in the type map
                ann_type = type_map.get(ann_type, ann_type)
                if ann_type in valid_types:
                    exact_text = ann['exact'].lower()  # Convert text to lowercase
                    data[ext_id][ann_type].add(exact_text)
    return data

# Define the type mapping
type_map = {
    "gene expression": "go_term",
    "methods": "exp_methods"
}

# Parse abstract files with filtering and type conversion
abs_dict_data = parse_abstract_file_with_filter(abs_dict_path, type_map=type_map)
abs_ml_data = parse_abstract_file_with_filter(abs_ml_path, type_map=type_map)

# Updated analysis
results = []
for ext_id in set(abs_dict_data.keys()).union(abs_ml_data.keys()):
    dict_ann_types = abs_dict_data.get(ext_id, {})
    ml_ann_types = abs_ml_data.get(ext_id, {})
    
    all_ann_types = set(dict_ann_types.keys()).union(ml_ann_types.keys())
    for ann_type in all_ann_types:
        dict_exact_texts = dict_ann_types.get(ann_type, set())
        ml_exact_texts = ml_ann_types.get(ann_type, set())
        
        # Calculate differences and overlaps with fuzzy matching
        dict_not_in_ml = dict_exact_texts - ml_exact_texts
        ml_not_in_dict = ml_exact_texts - dict_exact_texts
        partial_matches = {
            ml_text for ml_text in ml_exact_texts
            if any(fuzz.partial_ratio(ml_text, dict_text) >= 80 for dict_text in dict_exact_texts)
        }
        common_entities = ml_exact_texts & dict_exact_texts
        
        # Append analysis results
        results.append({
            "ext_id": ext_id,
            "ann_type": ann_type,
            "partial_match_count": len(partial_matches),
            "total_ml_count": len(ml_exact_texts),
            "total_dict_count": len(dict_exact_texts),
            "found_in_ml_not_in_dict_count": len(ml_not_in_dict),
            "found_in_dict_not_in_ml_count": len(dict_not_in_ml),
            # "common_count": len(common_entities),
            "ml_not_in_dict": "; ".join(ml_not_in_dict),
            "dict_not_in_ml": "; ".join(dict_not_in_ml),
            "partial_matches": "; ".join(partial_matches)
        })


# Create DataFrame and save to CSV
df = pd.DataFrame(results)
output_file = "../results/abstract_comparison_results_with_fuzzy_matching.csv"
df.to_csv(output_file, index=False)



In [32]:
df

Unnamed: 0,ext_id,ann_type,partial_match_count,total_ml_count,total_dict_count,found_in_ml_not_in_dict_count,found_in_dict_not_in_ml_count,ml_not_in_dict,dict_not_in_ml,partial_matches
0,PPR930567,disease,1,3,1,2,0,adhd; -focused thoughts,,depression
1,PPR930567,organism,1,1,1,0,0,,,humans
2,38385056,disease,3,4,4,1,1,aml,tumorigenesis,acute myeloid leukemia; cancer; acute leukemia
3,38385056,exp_methods,0,0,2,0,2,,translocation; chromosomal aberrations,
4,38385056,gene_protein,0,0,1,0,1,,p13,
...,...,...,...,...,...,...,...,...,...,...
45042,39183390,organism,11,17,9,9,1,filamentous bacteria; s; bacillus; coccus; scu...,s. baicalensis georgi,s; clostridium; helicobacter; scutellaria baic...
45043,39457007,disease,4,4,5,0,1,,myxoma,glioblastoma; glioma; viral infection; tumor
45044,39457007,gene_protein,0,0,1,0,1,,myx,
45045,39457007,organism,2,2,3,1,2,virus,myxoma virus; murine,virus; human


In [42]:
# Function to check for missing JSON lines with no annotations
def check_missing_annotations(ml_file_path, dict_file_path, type_map=None):
    """
    Compare ML and dictionary files to check for missing JSON lines in ML and verify if they have no annotations.
    """
    type_map = type_map or {}  # Mapping for converting types
    ml_ids = set()
    dict_ids_with_annotations = set()

    # Read ML file and collect IDs
    with open(ml_file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            pmid = obj.get("pmcid", obj.get("ft_id", "Unknown")).lower()
            ml_ids.add(pmid)

    # Read dictionary file and collect IDs with annotations
    with open(dict_file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            pmid = obj.get("pmcid", obj.get("ft_id", "Unknown")).lower()
            if any(ann['type'].lower() in type_map or ann['type'].lower() in valid_types for ann in obj.get('anns', [])):
                dict_ids_with_annotations.add(pmid)

    # Find missing IDs in ML
    missing_ids = dict_ids_with_annotations - ml_ids

    return {
        "ml_ids_count": len(ml_ids),
        "dict_ids_with_annotations_count": len(dict_ids_with_annotations),
        "missing_ids_count": len(missing_ids),
        "missing_ids": missing_ids
    }

# Define valid types and type mapping
valid_types = {"exp_methods", "disease", "gene_protein", "go_term", "chemical", "organism"}
type_map = {"gene expression": "go_term", "methods": "exp_methods"}

# Check for abstracts
abstract_check = check_missing_annotations(abs_ml_path, abs_dict_path, type_map)

# Check for full-text
fulltext_check = check_missing_annotations(fulltext_ml_path, fulltext_dict_path, type_map)

# Prepare results for user
missing_analysis = {
    "Abstracts": abstract_check,
    "Full-Text": fulltext_check
}

In [47]:
def check_missing_annotations(ml_file_path, dict_file_path, id_key, type_map=None):
    """
    Compare ML and dictionary files to check for missing JSON lines in ML and verify if they have annotations.
    """
    type_map = type_map or {}  # Mapping for converting types
    valid_types = {"exp_methods", "disease", "gene_protein", "go_term", "chemical", "organism"}
    ml_ids = set()
    dict_ids_with_annotations = set()

    # Read ML file and collect IDs
    with open(ml_file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            doc_id = obj.get(id_key, "Unknown").lower()
            ml_ids.add(doc_id)

    # Read dictionary file and collect IDs with annotations
    with open(dict_file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            doc_id = obj.get(id_key, "Unknown").lower()
            if any(ann['type'].lower() in type_map or ann['type'].lower() in valid_types for ann in obj.get('anns', [])):
                dict_ids_with_annotations.add(doc_id)

    # Find missing IDs in ML
    missing_ids = dict_ids_with_annotations - ml_ids

    return {
        "ml_ids_count": len(ml_ids),
        "dict_ids_with_annotations_count": len(dict_ids_with_annotations),
        "missing_ids_count": len(missing_ids),
        "missing_ids": missing_ids
    }

# Define type mapping for conversion
type_map = {"gene expression": "go_term", "methods": "exp_methods"}

# # File paths for abstract and full-text data
abs_dict_path = 'data/TM_pipelines_outputs/patch-28-10-2024-0_abs.api.json'
abs_ml_path = '../results/abstracts/europepmc/patch-28-10-2024-0.json'
fulltext_dict_path = 'data/TM_pipelines_outputs/patch-28-10-2024-0.api.json'
fulltext_ml_path = '../results/fulltext/europepmc/patch-28-10-2024-0.jsonl.gz.json'
# Check for missing annotations in abstracts (using ext_id)
abstract_check = check_missing_annotations(abs_ml_path, abs_dict_path, "ext_id", type_map)

# Check for missing annotations in full-text (using pmcid or ft_id)
fulltext_check = check_missing_annotations(fulltext_ml_path, fulltext_dict_path, "pmcid", type_map)

# Prepare summary for user
summary = {
    "Dataset": ["Abstracts", "Full-Text"],
    "ML IDs Count": [abstract_check["ml_ids_count"], fulltext_check["ml_ids_count"]],
    "Dict IDs with Annotations Count": [
        abstract_check["dict_ids_with_annotations_count"],
        fulltext_check["dict_ids_with_annotations_count"],
    ],
    "Missing IDs Count": [abstract_check["missing_ids_count"], fulltext_check["missing_ids_count"]],
}

# Create DataFrame for summary
summary_df = pd.DataFrame(summary)

# # Save detailed missing annotations to CSV
# abstract_missing_file = "abstract_missing_annotations.csv"
# fulltext_missing_file = "fulltext_missing_annotations.csv"
# 
# abstract_missing_df = pd.DataFrame(list(abstract_check["missing_ids"]), columns=["Missing Abstract ext_id"])
# abstract_missing_df.to_csv(abstract_missing_file, index=False)
# 
# fulltext_missing_df = pd.DataFrame(list(fulltext_check["missing_ids"]), columns=["Missing Fulltext pmcid_or_ft_id"])
# fulltext_missing_df.to_csv(fulltext_missing_file, index=False)

In [49]:
print(summary_df)

     Dataset  ML IDs Count  Dict IDs with Annotations Count  Missing IDs Count
0  Abstracts         14777                            15384                912
1  Full-Text           974                              975                  4


In [51]:
missing_ids = set(fulltext_check["missing_ids"])
print(f"Missing IDs: {missing_ids}")


Missing IDs: {'11325299', '11249782', '11328371', '10470160'}


In [52]:
fulltext_check

{'ml_ids_count': 974,
 'dict_ids_with_annotations_count': 975,
 'missing_ids_count': 4,
 'missing_ids': {'10470160', '11249782', '11325299', '11328371'}}

In [53]:
# Function for detailed analysis of missing annotations
def deep_analysis_of_annotations(ml_file_path, dict_file_path, id_key, type_map=None):
    """
    Perform a deep analysis of annotations:
    - Missing IDs in ML
    - Missing IDs with annotations
    - Found in Dict but not in ML
    - Found in ML but not in Dict
    """
    type_map = type_map or {}  # Mapping for converting types
    valid_types = {"exp_methods", "disease", "gene_protein", "go_term", "chemical", "organism"}

    ml_ids = {}
    dict_ids_with_annotations = {}

    # Read ML file and collect IDs and annotations
    with open(ml_file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            doc_id = obj.get(id_key, "Unknown").lower()
            annotations = {ann['exact'].lower() for ann in obj.get('anns', [])}
            ml_ids[doc_id] = annotations

    # Read dictionary file and collect IDs with annotations
    with open(dict_file_path, 'r', encoding='latin-1', errors='replace') as file:
        for line in file:
            obj = json.loads(line)
            doc_id = obj.get(id_key, "Unknown").lower()
            annotations = {
                ann['exact'].lower() for ann in obj.get('anns', [])
                if ann['type'].lower() in type_map or ann['type'].lower() in valid_types
            }
            if annotations:
                dict_ids_with_annotations[doc_id] = annotations

    # Missing IDs
    missing_ids = set(dict_ids_with_annotations.keys()) - set(ml_ids.keys())

    # IDs found in both with missing annotations in ML
    missing_annotations_in_ml = {
        doc_id: dict_ids_with_annotations[doc_id]
        for doc_id in dict_ids_with_annotations.keys()
        if doc_id in ml_ids and not ml_ids[doc_id]
    }

    # Found in Dict but not in ML
    found_in_dict_not_in_ml = {
        doc_id: dict_ids_with_annotations[doc_id]
        for doc_id in missing_ids
    }

    # Found in ML but not in Dict
    found_in_ml_not_in_dict = {
        doc_id: ml_ids[doc_id]
        for doc_id in ml_ids.keys()
        if doc_id not in dict_ids_with_annotations
    }

    return {
        "ml_ids_count": len(ml_ids),
        "dict_ids_with_annotations_count": len(dict_ids_with_annotations),
        "missing_ids_count": len(missing_ids),
        "missing_ids_with_annotations": found_in_dict_not_in_ml,
        "missing_annotations_in_ml": missing_annotations_in_ml,
        "found_in_dict_not_in_ml": found_in_dict_not_in_ml,
        "found_in_ml_not_in_dict": found_in_ml_not_in_dict,
    }


In [54]:
# Define type mapping for conversion
type_map = {"gene expression": "go_term", "methods": "exp_methods"}

# Perform deep analysis for abstracts
abstract_analysis = deep_analysis_of_annotations(abs_ml_path, abs_dict_path, "ext_id", type_map)

# Perform deep analysis for full-text
fulltext_analysis = deep_analysis_of_annotations(fulltext_ml_path, fulltext_dict_path, "pmcid", type_map)


In [55]:
fulltext_analysis

{'ml_ids_count': 974,
 'dict_ids_with_annotations_count': 975,
 'missing_ids_count': 4,
 'missing_ids_with_annotations': {'11325299': {'uptake'},
  '11249782': {' sec', 'behavior', 'c2n2', 'delta', 'gamma', 'singular'},
  '11328371': {'learning'},
  '10470160': {'behavior', 'learning'}},
 'missing_annotations_in_ml': {},
 'found_in_dict_not_in_ml': {'11325299': {'uptake'},
  '11249782': {' sec', 'behavior', 'c2n2', 'delta', 'gamma', 'singular'},
  '11328371': {'learning'},
  '10470160': {'behavior', 'learning'}},
 'found_in_ml_not_in_dict': {'10531108': {'gag'},
  '11252919': {'covid-19'},
  '11361398': {'animal', 'rhoa', 'rock'}}}

In [56]:
abstract_analysis

{'ml_ids_count': 14777,
 'dict_ids_with_annotations_count': 15384,
 'missing_ids_count': 912,
 'missing_ids_with_annotations': {'39455799': {'mcm'},
  '39466424': {'acne'},
  '39459124': {'learning'},
  '38822920': {'intracranial haemorrhage', 'trauma'},
  'ppr930376': {'tpn'},
  'ppr717598': {'learning'},
  '39459706': {'behavior'},
  '39463198': {'trauma'},
  '39462477': {'electron'},
  '38420485': {'learning'},
  '39464207': {'learning'},
  'ppr931380': {'learning'},
  '39331295': {'mega'},
  '38657901': {'critically ill'},
  '39465076': {'aotearoa'},
  '39465228': {'lam'},
  '39462275': {'behaviours', 'learning'},
  '39458770': {'adr', 'composites', 'pbat', 'pla'},
  'ppr749756': {'singular'},
  '38414887': {'pgy1', 'pgy2'},
  '38405299': {'behavior'},
  'ppr712067': {'inversion'},
  '38425991': {'behavior'},
  '39465985': {'electron', 'transport'},
  'ppr867909': {'behavior', 'insertion'},
  '39461914': {'learning'},
  '39178809': {'learning'},
  '39459418': {'vascularization'},
 