In [13]:
import numpy as np
import os
import pandas as pd
from pathlib import Path
from typing import List, Tuple

# One-stop shop for cellfinder first aid

1. Cleaning for obvious extras to "Anatomy" terms, given we want "Anatomy" to be an equivalent to "Tissue
2. Review terms with multiple labels assigned - set priority entity types to resolve
3. Remove entity types extra to ["Cell type", "Cell line", "Anatomy"~Tissue]
4. Write to BRAT format for use in model training/validation
5. Older / intitial work - initial review to investigate how equivalent "Anatomy" is to "Tissue" as an entity type

### Read-in cellfinder dataframes

In [24]:
def crawl_directory(dir_path: str, file_type: str)-> List:
    p = Path(dir_path)
    in_files = list(p.glob(f"**/*.{file_type}"))
    if in_files == []:
        print("Path path producing error - revise. \nHint - check your file_type declaration does not contain '.'")
    return in_files

In [26]:
# File walk to grab all .ann from cellfinder1_brat
in_files = crawl_directory(dir_path="/Users/withers/Downloads/cellfinder1_brat/",
                           file_type="ann")

Clean in-files so columns are organised and more readable

In [27]:
def readin_df(path_to_infile: str) -> Tuple[str, pd.DataFrame]:
    """ 
    Passed .ann file, clean up string formats and return with
    PMCID/ file name for reference
    """
    df = pd.read_csv(path_to_infile, sep="\t", header=None)
    pmcid = os.path.basename(path_to_infile)[:-4]

    # Split column 1 into label, start, end
    df_col1 = pd.DataFrame(np.column_stack([x.split() for x in df[1]])).transpose()
    # Drop old column
    df = df.drop(columns=[1])
    # Append cleaned cols to original df
    df = pd.concat([df, df_col1], axis=1)
    df.columns = ["id", "term", "label", "start",  "end"]

    return pmcid, df

annotations = {}
for f in in_files:
    pmcid, res_df = readin_df(path_to_infile=f)
    annotations[pmcid] = res_df

### 1. For each resulting Dataframe, remove non-compliant _"Anatomy"_ terms for use as _"Tissue"_ annotations

In [76]:
# Endings which do not comply with Anatomy terms referring to tissue only
# These endings tend to mean processes or cell contexts
redlist = ("genesis", "cytic", "ial", "yal")

def end_check(term: str):
    return term.endswith(redlist)

for pmcid in annotations:
    df = annotations[pmcid]
    df["redlist"] = df["term"].apply(end_check)
    before = len(df)

    to_drop = df[(df["redlist"] == True) & (df["label"] == "Anatomy")]
    df_clean = df.drop(df[(df["redlist"] == True) & (df["label"] == "Anatomy")].index)
    after = len(df_clean)

    if int(before - after) != 0:
        print(f"*** {str(before - after)} non-compliant Anatomy term(s) dropped from {pmcid} Dataframe.")
        print(", ".join(to_drop["term"].to_list()) + "\n")
    else:
        print(f"No changes to {pmcid} Dataframe\n")

    annotations[pmcid] = df


*** 3 non-compliant Anatomy term(s) dropped from 18162134 Dataframe.
organogenesis, neurogenesis, embryogenesis

No changes to 16316465 Dataframe

*** 2 non-compliant Anatomy term(s) dropped from 17381551 Dataframe.
myogenesis, endothelial

No changes to 16672070 Dataframe

*** 1 non-compliant Anatomy term(s) dropped from 17389645 Dataframe.
embryogenesis

No changes to 17288595 Dataframe

*** 2 non-compliant Anatomy term(s) dropped from 16623949 Dataframe.
myelomonocytic, endothelial

*** 7 non-compliant Anatomy term(s) dropped from 15971941 Dataframe.
endothelial, Adipocytic, paraxial, adipocytic, epithelial, Osteogenesis, Chondrocytic

No changes to 18286199 Dataframe

*** 3 non-compliant Anatomy term(s) dropped from 17967047 Dataframe.
neuroepithelial, neuroepithelial, neuroepithelial



Write cleaned annotations to file

In [87]:
import glob
import shutil
import os

def write_ann(df: pd.DataFrame, output_dir: str, pmcid: str):
    # Ensure destination directory exists
    os.makedirs(output_dir, exist_ok=True)

    output_path = output_dir + f"{pmcid}.ann"
    with open(output_path, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            line = f"{row['id']}\t{row['label']} {row['start']} {row['end']}\t{row['term']}\n"
            f.write(line)

def copy_txt_files(source_dir: str, destination_dir: str):
    # Ensure destination directory exists
    os.makedirs(destination_dir, exist_ok=True)

    # Use glob to find all .txt files in the source directory
    txt_files = glob.glob(os.path.join(source_dir, '*.txt'))

    # Copy each file
    for file_path in txt_files:
        filename = os.path.basename(file_path)
        dest_path = os.path.join(destination_dir, filename)
        shutil.copy2(file_path, dest_path)

In [None]:
output_dir = "/Users/withers/Downloads/cellfinder1_brat_clean/"
for pmcid in annotations:
    df = annotations[pmcid]
    df = df.drop("redlist", axis=1)
    df = df[["id", "label", "start",  "end", "term"]]
    annotations[pmcid] = df
    write_ann(df = annotations[pmcid],
              output_dir = output_dir,
              pmcid = pmcid)

Finally, copy .txt files to new BRAT folder

In [None]:
copy_txt_files(source_dir = "/Users/withers/Downloads/cellfinder1_brat/",
               destination_dir = "/Users/withers/Downloads/cellfinder1_brat_clean/")

In [66]:
# File walk to grab all .ann from cellfinder1_brat_clean
in_files = crawl_directory(dir_path="/Users/withers/Downloads/cellfinder1_brat_clean/",
                           file_type="ann")

annotations = {}
for f in in_files:
    pmcid, res_df = readin_df(path_to_infile=f)
    annotations[pmcid] = res_df

### 2. Check if there are overlapping annotations in cellfinder

#### With the preferred_labels_mode Series, clean each full df to assign only one annotation for each term
Precedent set as follows:
    - First, select the preferred label by the mode of the label's occurrences
    - Prioritise by entity type as a fall-back: CellLine > Anatomy > CellType

In [None]:
# Sets priority Cell line > Anatomy > Cell type
priority = ["CellLine", "Anatomy", "CellType"]

def get_mode_label(series: pd.Series)-> str:
    """Return the most frequent label in a given series"""
    modes = series.mode()
    if len(modes) > 1:

        # If there is a tie in the mode label, tie-break by priority.
        for p_label in priority:
            if p_label in modes.values: #Catch highest priority label
                return p_label

        # Else, failing a tie break, return the first mode term
        return modes.iloc[0]

    # If there is an obvious mode label, return
    return modes[0]

# Group by term, selecting one label term should more than 1 occur in the annotations
for pmcid in annotations:
    print(f"PMCID: {pmcid}")
    df = annotations[pmcid]
    preferred_labels_mode = df.groupby('term')['label'].apply(get_mode_label)

    # Add back to df
    df['preferred_label'] = df['term'].map(preferred_labels_mode)

    # Update 'label' should it not be the preferred label
    df.loc[(df['label'] != df['preferred_label']), 'label'] = df['preferred_label']
    df = df.drop(columns=["preferred_label"])

    annotations[pmcid] = df
    print(df)

PMCID: 18162134
       id          term        label  start    end
0      T1          MST3  GeneProtein  23576  23580
1      T2        PRKCB1  GeneProtein  22722  22728
2      T3         CDC42  GeneProtein  23601  23606
3      T4        CTNNB1  GeneProtein  23326  23332
4      T5          ERK1  GeneProtein  23592  23596
..    ...           ...          ...    ...    ...
673  T674         hESCs     CellType  30060  30065
674  T675         ERBB3  GeneProtein  34156  34161
675  T676          FGF2  GeneProtein  12738  12742
676  T677  phospho-GSK3  GeneProtein  12748  12760
677  T678         Stat1  GeneProtein  21660  21665

[678 rows x 5 columns]
PMCID: 16316465
       id                                     term        label start   end
0      T1                                    human      Species  3043  3048
1      T2                                   Oct3/4  GeneProtein  3013  3019
2      T3                   pluripotent population      Anatomy  1734  1756
3      T4                   

Sanity check that the resolutions above have worked

In [79]:
# Group the terms by their labels, filtering for those with more than 1 label assigned
sanity_check = True
for pmcid in annotations:
    df = annotations[pmcid]
    df_grouped = df.groupby("term")["label"].unique()
    multiple_types = df_grouped[df_grouped.apply(len) > 1]
    if len(multiple_types) != 0:
        sanity_check = False
    
if sanity_check:
    print("Yay! All clashes resolved")

Yay! All clashes resolved


### 3. Drop annotations of entity types extra to those required

In [82]:
# Remove entity types extra to ["Cell type", "Cell line", "Anatomy"~Tissue]

for pmcid in annotations:
    df = annotations[pmcid]
    df_select = df[df["label"].isin(priority)]
    print(f"Cleaned df of length {len(df)} to {len(df_select)} rows")
    annotations[pmcid] = df_select


Cleaned df of length 678 to 148 rows
Cleaned df of length 414 to 161 rows
Cleaned df of length 849 to 619 rows
Cleaned df of length 496 to 368 rows
Cleaned df of length 220 to 80 rows
Cleaned df of length 449 to 217 rows
Cleaned df of length 890 to 570 rows
Cleaned df of length 445 to 327 rows
Cleaned df of length 552 to 339 rows
Cleaned df of length 905 to 634 rows


### 4. Write out filtered files to BRAT format for use in model training/validation


In [88]:
output_dir = "/Users/withers/Downloads/cellfinder1_brat_clean_07/"
for pmcid in annotations:
    df = annotations[pmcid]
    df = df[["id", "label", "start",  "end", "term"]]
    write_ann(df = annotations[pmcid],
              output_dir = output_dir,
              pmcid = pmcid)
print(f"Annotations written to file at: '{output_dir}'")

copy_txt_files(source_dir = "/Users/withers/Downloads/cellfinder1_brat/",
               destination_dir = "/Users/withers/Downloads/cellfinder1_brat_clean_07/")

Annotations written to file at: '/Users/withers/Downloads/cellfinder1_brat_clean_07/'


### 5. OLD/INITIAL WORKS | View the Tissue coverage of cellfinder terms by dictionaries 

##### First concatentate the dictionaries used in labelstudio

In [63]:
# for pmcid in annotations:
#     with open(f"/Users/withers/Downloads/cellfinder1_brat/{pmcid}.txt", "r") as f:
#         input_text = f.readlines()
#     f.close()
# print(type(input_text))
# [print(x,"\n") for x in input_text]

In [39]:
tissue_df = "/Users/withers/GitProjects/OTAR3088/DataFolder/Data-Extraction-Query/tissue_df.tsv"
btissue_df = "/Users/withers/GitProjects/OTAR3088/DataFolder/Data-Extraction-Query/brendatissue_df.tsv"

tissue = pd.read_csv(tissue_df, sep="\t", header=None)[2].to_list()
btissue = pd.read_csv(btissue_df, sep="\t", header=None)[2].to_list()
tissue.extend(btissue)
tissue = [x.lower() for x in tissue]
tissue = list(set(tissue))
tissue

['choroid plexus',
 'eye',
 'red nucleus',
 'nail',
 'corpus callosum',
 'globus pallidus',
 'capillary',
 'gastrula stage',
 'dentine',
 'neoplasm',
 'blood/esophagus',
 'pancreas/plasma',
 'ovarian follicle',
 'anterior visceral endoderm',
 'blood brain barrier',
 'blood vessel endothelium',
 'blood vessel',
 'endothelial tube',
 'bile/plasma',
 'cardiac muscle fiber',
 'caudal fin',
 'epiphysis',
 'whole body',
 'large intestine/muscle',
 'embryo sac',
 'arm',
 'esophagus muscularis mucosa',
 'cyst',
 'tooth enamel',
 'crest',
 'adipose tissue',
 'renal glomerulus',
 'gill',
 'corpus luteum',
 'peripheral nervous system',
 'trigeminal ganglion',
 'visceral fat',
 'del',
 'colonic mucosa',
 'ovary/plasma',
 'cornea',
 'dorsal raphe nucleus',
 'intermediate filament',
 'lymph sac',
 'allantoic fluid',
 'microfilament',
 'iris',
 'blood plasma',
 'alveolar wall',
 'atrium/ileum',
 'stria terminalis',
 'anterior commissure',
 'stomach',
 'feces',
 'urethra',
 'hypogastric nerve',
 'bloo

##### Then view overlap of terms in dictionary & those in cell finder

In [40]:
overlap = [term for term in tissue_check_set if term in tissue]
print("\n".join(overlap))

tissue
immune system
bone marrow
blood
peripheral blood
liver


In [41]:
cellfinder_only = [term for term in tissue_check_set if term not in tissue]
print("\n".join(cellfinder_only))

# TODO - Coverage compared to cover of dictionary in sample
percent = len(cellfinder_only) / len(tissue) * 100
print('%.2f' % percent)

iver
hematopoietic system
cystic bodies
myelomonocytic colonies
human fetal liver
clumps
viral
colonies
fetal liver
vector transduced colonies
germ layers
mobilized peripheral blood
cells
clusters
anti-hiv
embryonic cystic bodies
monolayers
fetal
myeloid colonies
feeder layers
cord blood
endothelial
hiv-1
layers
myeloid lineages
hematopoietic
lentiviral
lymphoid
cystic body
myelomonocytic
vector
blastocysts
inner cell mass
myeloid
antiviral
cell clumps
embryonic
stromal
3.80


In [30]:
dict_only = [term for term in tissue if term not in tissue_check_set]
# print("\n".join(dict_only))

In [42]:
tissue_check

Unnamed: 0,label,start,end,term
0,Anatomy,2664,2679,inner cell mass
11,Anatomy,30894,30910,Myeloid colonies
15,Anatomy,2689,2700,blastocysts
19,Anatomy,2284,2289,HIV-1
21,Anatomy,6371,6385,myelomonocytic
...,...,...,...,...
869,Anatomy,18280,18296,peripheral blood
877,Anatomy,18291,18296,blood
884,Anatomy,19552,19557,fetal
885,Anatomy,19558,19563,liver


In [52]:
tissue_check_f = tissue_check[tissue_check["term"].isin(["iver"])]
s = tissue_check_f['start'].to_list()[0]
e = tissue_check_f['end'].to_list()[0]