## COVID_NER data review

Testing files from ePMC team, reformatting to better suit use case of CeLLaTe model training
Reasoning:
    - Alot of the data available tends toward binning multiple entity types together
    - Halted - on training with this model it showed a lot of data-leakage, with 208 sentences being identical matches in the train and test sets.
    - Source data, COVID-NER, is worth keeping in mind for future given it is recorded at a document level. Also annotated for more entity types which may be of future interest.

Sourced from https://github.com/tsantosh7/COVID-19-Named-Entity-Recognition/tree/master/Datasets/BIO/Anatomy_dataset

In [1]:
import csv
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
import spacy



In [4]:
source_path = "/Users/withers/GitProjects/COVID-19-Named-Entity-Recognition/Datasets/BIO/Anatomy_dataset/train.csv"

import pandas as pd

def read_bio_file(filepath: str) -> pd.DataFrame:
    """
    Read-in BIO file, parsing to assign token / labels to columns
    Returns pd.DataFrame
    """
    sentences = []
    sentence = []

    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    token, label = parts
                    sentence.append((token, label))
                else:
                    print(f"Issue with line: {line}")

        # Catch last sentence
        if sentence:
            sentences.append(sentence)

    # Parse into DataFrame
    tokens = []
    labels = []

    for sent in sentences:
        for token, label in sent:
            tokens.append(token)
            labels.append(label)

    df = pd.DataFrame({
        'token': tokens,
        'label': labels
    })

    return df

input_df = read_bio_file(source_path)
input_df.head(n=20)


Unnamed: 0,token,label
0,RNA,O
1,transcripts,O
2,were,O
3,packaged,O
4,into,O
5,both,O
6,BCV,O
7,and,O
8,MHV,O
9,virions,B-ANAT


#### Read-in dictionaries

In [7]:
dict_file = "/Users/withers/GitProjects/OTAR3088/Data_mining/labelstudio_e2e/output/labelstudio/master_dictionary.tsv"
dictionary = pd.read_csv(dict_file, sep="\t")
map = {"CELL": "Cell", "TISSUE": "Tissue"}
dictionary["label"] = dictionary["label"].map(map)
dictionary

Unnamed: 0,term,label
0,P3HR-1,Cell
1,UCLA P-3,Cell
2,UMSCC22B,Cell
3,UMUC3,Cell
4,V79-4,Cell
...,...,...
3824,vulva,Tissue
3825,white adipose tissue,Tissue
3826,white muscle,Tissue
3827,whole body,Tissue


In [None]:
input_df["token_lower"] = input_df["token"].str.lower()
dictionary["term_lower"] = dictionary["term"].str.lower()

def dictionary_tagging(token):
    if token in dictionary["term_lower"]:
        print(token)
        matched = dictionary[dictionary["term_lower"] == token]
        return matched["label"]

In [25]:
# Grab text from df, split into sentences
all_text = " ".join(input_df["token"].to_list())
all_sent = all_text.split(sep=" . ")
print(len(all_sent))

49383


In [32]:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

# Load SpaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "tagger", "lemmatizer"])

def tag_sentences_with_dictionary(dict_df: Dict, sentences: List, term_column: str, tag_column: str) -> pd.DataFrame:
    """
    dict_df: DataFrame with term & label columns, specified in term_column & tag_column declarations
    sentences: list of sentences to tag
    Returns: pd.DataFrame
    """
    # Dictionary matching obj
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

    # Add patterns to matcher
    for _, row in dict_df.iterrows():
        term = row[term_column]
        tag = row[tag_column]
        # Tokenize and make named entities of terms
        # Formalizing dictionary for matching quickly
        pattern = nlp.make_doc(term)
        matcher.add(tag, [pattern])

    all_tokens = []
    all_labels = []

    for doc in nlp.pipe(sentences, batch_size=64):
        # Prep sentence - tokenize, etc
        tags = ["O"] * len(doc)

        matches = matcher(doc)

        # Map match id to tag (ID -> String)
        for match_id, start, end in matches:
            tag = nlp.vocab.strings[match_id]
            for i in range(start, end):
                tags[i] = tag

        all_tokens.extend([token.text for token in doc])
        all_labels.extend(tags)

    result_df = pd.DataFrame({
        "token": all_tokens,
        "dict_label": all_labels
    })

    return result_df

res_df = tag_sentences_with_dictionary(dict_df=dictionary, sentences=all_sent, term_column="term", tag_column="label")
print(f"Count of entities tagged by dictionaries: {len(res_df[res_df['dict_label'] != 'O'])}")

Count of entities tagged by dictionaries: 38741


In [53]:
res_df

Unnamed: 0,token,dict_label
0,RNA,O
1,transcripts,O
2,were,O
3,packaged,O
4,into,O
...,...,...
1291029,and,O
1291030,connective,Tissue
1291031,tissue,Tissue
1291032,diseases,O


In [49]:
isnull_df = res_df[res_df["dict_label"].isna()]
len(isnull_df)

0

In [34]:
print(f"Count of entities tagged by manual curation: {len(input_df[input_df['label'] != 'O'])}")

Count of entities tagged by manual curation: 86467


In [57]:
merged_df = pd.concat([input_df, res_df["dict_label"]], axis=1)
merged_df.head()


Unnamed: 0,token,label,token_lower,dict_label,dict_label.1
0,RNA,O,rna,O,O
1,transcripts,O,transcripts,O,O
2,were,O,were,O,O
3,packaged,O,packaged,O,O
4,into,O,into,O,O


In [42]:
overlap_count = comparison_df[(comparison_df["label"] != "O") & (comparison_df["dict_label"] != "O")]
len(overlap_count)

4235

In [43]:
overlap_count

Unnamed: 0,token,label,token_lower,dict_label
79,lower,B-ANAT,lower,Tissue
80,respiratory,I-ANAT,respiratory,Tissue
81,tract,I-ANAT,tract,Tissue
112,lung,B-ANAT,lung,Tissue
735,calf,B-ANAT,calf,Tissue
...,...,...,...,...
1321220,tract,I-ANAT,tract,
1321239,immune,B-ANAT,immune,
1321240,system,I-ANAT,system,
1321282,connective,B-ANAT,connective,


In [9]:
show = input_df[input_df["dict_tag"].notnull()]
show

Unnamed: 0,token,label,token_lower,dict_tag


In [2]:
nlp = spacy.load("en_core_web_sm")

def sent_from_df_col(column: pd.Series) -> List:
    return " ".join(column.to_list())

def lemmatize_term(term: str) -> str:
    """
    Return lemmatized form of term
    """
    lower = nlp(term.lower())
    lemmat = ' '.join([token.lemma_ for token in lower])
    return lemmat

def dictionary_tagging(dict_file: pd.DataFrame, texts: pd.DataFrame, token_col: Optional[str] = "token") -> pd.DataFrame:
    """
    Tag terms in strings representing column of BIO file containing tokens
    Return dictioanry labels as new column within texts DataFrame
    """
    # Map lemmatized terms to input dictionary terms
    term_lemma_map = {}
    for row in dict_file:
        term = row['term'].lower()
        label = row['label']
        lemma = lemmatize_term(term)
        term_lemma_map[lemma] = (term, label)

    # Read and process input texts
    texts = [token.strip() for token in texts["token"]]

    annotations = []

    for text in texts:
        doc = nlp(text)
        tokens = [token for token in doc]
        ## Print next line to display lemmatizations
        # lemmatized_text = ' '.join([token.lemma_.lower() for token in tokens])
        results = []

        # Try to match given term's lemma in lemmatized version of the text
        for lemma, (orig_term, label) in term_lemma_map.items():
            lemma_tokens = lemma.split()
            n = len(lemma_tokens)
            for i in range(len(tokens) - n + 1):
                window = tokens[i:i + n]
                window_lemmas = [t.lemma_.lower() for t in window]

                if window_lemmas == lemma_tokens and window:
                    start_char = window[0].idx
                    end_char = window[-1].idx + len(window[-1])
                    results.append({
                        "from_name": "label",
                        "to_name": "text",
                        "type": "labels",
                        "value": {
                            "start": start_char,
                            "end": end_char,
                            "text": text[start_char:end_char],
                            "labels": [label]
                        }
                    })
        if pmcid:
            text = text + '\n' + f'Source paper: {pmcid}'
        res = {
            "data": {"text": text},
            "annotations": [{"result": results}] if results else []
        }
        annotations.append(res)

    # Save annotations output
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(annotations, f, indent=2, ensure_ascii=False)
    # hits = len(annotations[0]['annotations'][0]['result'])
    # print(f"Saved {hits} dictionary annotation(s) to {output_json}")
