In [1]:
from datasets import load_dataset

# Data collected from - https://huggingface.co/datasets/databio/cell-line-nli
# Load the "pair-class" configuration and keep only its `test` split. Later
# cells index `ds` by row number and read each row's 'hypothesis' field.
ds = load_dataset("databio/cell-line-nli", "pair-class", split='test')

In [3]:
from huggingface_hub import login

import os

# Never paste a real access token into the notebook: it would be saved with
# the file. Read it from the HF_TOKEN environment variable instead; when the
# variable is unset `token` is None and `login` prompts for it interactively.
login(token=os.environ.get("HF_TOKEN"))

In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Fetch the pretrained cell-line NER tagger.
tagger = SequenceTagger.load("OTAR3088/flair-CellLine-model")


## Smoke test on a single row to inspect what the tagger returns

# Wrap the hypothesis text of row 1 and run the tagger over it.
sentence = Sentence(ds[1]['hypothesis'])
tagger.predict(sentence)

# Show the sentence, then every predicted 'ner' span on its own line.
print(sentence)
print('The following NER tags are found:')
for span in sentence.get_spans('ner'):
    print(span)

2025-02-28 16:28:45,229 SequenceTagger predicts: Dictionary with 5 tags: O, S-CellLine, B-CellLine, E-CellLine, I-CellLine
Sentence[7]: "The Epstein-Barr virus EBV transformed DA04442."
The following NER tags are found:


In [6]:
import pandas as pd
from tqdm import tqdm

# Number of rows to tag for now; raise it (or use len(ds)) for a full run.
SAMPLE_SIZE = 1000

# Clamp to the dataset length so a short split cannot cause an IndexError.
n_rows = min(SAMPLE_SIZE, len(ds))

res_list = []
for i in tqdm(range(n_rows)):
    text = ds[i]['hypothesis']
    sentence = Sentence(text)
    tagger.predict(sentence)
    # One row per sentence: [sentence text, list of predicted 'ner' spans].
    res_list.append([text, list(sentence.get_spans('ner'))])

res_df = pd.DataFrame(res_list, columns=['sentences', 'entities'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:00<00:00, 16.56it/s]


In [7]:
import requests
from typing import List

## Search through OLS to investigate tagging via CLO
# TODO - This is very computationally expensive; switch to another approach, ideally one that also includes Cellosaurus annotations

def ols_search(search: str, timeout: float = 30.0) -> List:
    """Query the EBI OLS4 search API and return the first usable CLO hit.

    Args:
        search: Free-text term to look up.
        timeout: Seconds to wait for the HTTP response before `requests`
            raises a timeout error.

    Returns:
        ``[label, short_form]`` for the first returned hit whose short form
        starts with 'CLO' and whose label is not the generic 'cell line'.
        ``[]`` when the response status is not 200, the payload lacks the
        expected ``response -> docs`` structure, or no hit passes the filter.
    """
    # Send the query via `params` so terms containing '&', '#', spaces, etc.
    # are URL-encoded instead of corrupting the query string.
    params = {
        'q': search,
        'ontology': 'clo,cl',
        'type': 'class,property',
        'exact': 'false',
        'rows': 1,
        'start': 0,
        'format': 'json',
        'lang': 'en',
    }
    resp = requests.request(
        "GET",
        'https://www.ebi.ac.uk/ols4/api/search',
        params=params,
        timeout=timeout,
    )
    if resp.status_code != 200:
        return []

    try:
        docs = resp.json()['response']['docs'] or []
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the response/docs shape.
        return []

    for hit in docs:
        label = hit.get('label')
        short_form = hit.get('short_form') or ''
        # Skip the generic 'cell line' class and anything outside CLO.
        if label == 'cell line' or not short_form.startswith('CLO'):
            continue
        return [label, short_form]
    return []

def ols_search_sentence(input_list: List) -> List:
    """Run `ols_search` on every word and return the last non-empty result.

    Args:
        input_list: Words (search terms) to look up one by one.

    Returns:
        The result of the last word for which `ols_search` returned a
        non-empty list, or ``[]`` when no word matched or the input is empty.
    """
    hit = []
    for word in input_list:
        res = ols_search(search=word)
        if len(res) > 0:
            hit = res
    # Return the accumulated hit, not `res`: `res` only holds the final word's
    # result and is undefined when `input_list` is empty.
    return hit

In [18]:
from IPython.display import display, HTML

def make_scrollable(df, height=500):
    """Wrap the HTML rendering of `df` in a height-limited, scrollable div.

    Args:
        df: Object exposing ``to_html()`` (used here with a DataFrame).
        height: Maximum height of the container, in pixels.

    Returns:
        An ``HTML`` display object holding the wrapped table markup.
    """
    table_markup = df.to_html()
    wrapped = f"<div style='max-height:{height}px; overflow:auto'>{table_markup}</div>"
    return HTML(wrapped)


# res_df.to_csv('full_cell_line_res.csv')
# Render all of res_df inside a scrollable box (default max height of 500px)
# as this cell's output.
make_scrollable(res_df)

Unnamed: 0,sentences,entities,search_text
0,The donor of DA04442 has Type 2 Diabetes Mellitus.,[],"[The, donor, of, DA04442, has, Type, 2, Diabetes, Mellitus.]"
1,The Epstein-Barr virus EBV transformed DA04442.,[],"[The, Epstein-Barr, virus, EBV, transformed, DA04442.]"
2,The donor of SKG-3b has Squamous cell carcinoma of the cervix uteri.,[],"[The, donor, of, SKG-3b, has, Squamous, cell, carcinoma, of, the, cervix, uteri.]"
3,DA04442 is sourced from blood.,[],"[DA04442, is, sourced, from, blood.]"
4,"A patient of Diabetes, Type 2 contributed to DA04442.",[],"[A, patient, of, Diabetes,, Type, 2, contributed, to, DA04442.]"
5,TE 175.T was derived from lymph node.,[],"[TE, 175.T, was, derived, from, lymph, node.]"
6,The Epstein-Barr virus EBV transformed DA04442.,[],"[The, Epstein-Barr, virus, EBV, transformed, DA04442.]"
7,The Transformed cell line includes DA04442.,[],"[The, Transformed, cell, line, includes, DA04442.]"
8,The Rhesus macaque polyomavirus transformed GM16093.,[],"[The, Rhesus, macaque, polyomavirus, transformed, GM16093.]"
9,DA04442 cell line belongs to the category of Transformed cell line.,[],"[DA04442, cell, line, belongs, to, the, category, of, Transformed, cell, line.]"


In [14]:
import time

# TODO - currently being tinkered with / too much memory being used on OLS for it to be happy

def ols_search(search: str, timeout: float = 30.0) -> List:
    """Query the EBI OLS4 search API and return the top hit for a term.

    NOTE: this redefines the `ols_search` from an earlier cell, with a looser
    filter that only rejects the generic 'cell line' class when it comes from
    outside CLO.

    Args:
        search: Free-text term to look up.
        timeout: Seconds to wait for the HTTP response before `requests`
            raises a timeout error.

    Returns:
        ``[label, short_form]`` of the top hit, or ``[]`` when the response
        status is not 200, the payload has no usable top hit, or the top hit
        is the non-CLO generic 'cell line' class.
    """
    # Send the query via `params` so terms containing '&', '#', spaces, etc.
    # are URL-encoded instead of corrupting the query string.
    params = {
        'q': search,
        'ontology': 'clo,cl',
        'type': 'class,property',
        'exact': 'false',
        'rows': 1,
        'start': 0,
        'format': 'json',
        'lang': 'en',
    }
    resp = requests.request(
        "GET",
        'https://www.ebi.ac.uk/ols4/api/search',
        params=params,
        timeout=timeout,
    )
    if resp.status_code != 200:
        return []

    try:
        # rows=1 is requested above, so only the first doc needs checking.
        top_res = resp.json()['response']['docs'][0]
        label = top_res['label']
        short_form = top_res['short_form']
        is_generic = label == 'cell line' and not short_form.startswith('CLO')
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Not JSON, no docs returned, or the hit lacks label/short_form.
        return []

    if is_generic:
        return []
    return [label, short_form]

def ols_search_sentence(input_list: List, delay: float = 1.0) -> List:
    """Run `ols_search` on every word, pausing between calls.

    Args:
        input_list: Words (search terms) to look up one by one.
        delay: Seconds to sleep after each lookup; values <= 0 disable the
            pause.

    Returns:
        The result of the last word for which `ols_search` returned a
        non-empty list, or ``[]`` when no word matched or the input is empty.
    """
    hit = []
    for word in input_list:
        res = ols_search(search=word)
        if delay > 0:
            time.sleep(delay)
        if len(res) > 0:
            hit = res
    # Return the accumulated hit, not `res`: `res` only holds the final word's
    # result and is undefined when `input_list` is empty.
    return hit

# res_df.loc[:, 'search_text'] = res_df['sentences'].apply(lambda x: x.split(' '))
# res_df_sample = res_df[:10]
# res_df_sample.loc[:, 'ols_search'] = res_df_sample['search_text'].apply(lambda x: ols_search_sentence(x))
# res_df_sample = res_df_sample.drop('search_text', axis=1)

# print('Sample of 10 rows')
# make_scrollable(res_df_sample)

In [20]:
# Keep only the rows where the tagger found at least one entity. Take an
# explicit copy so the column added below is written to an independent frame,
# not to a view of res_df (avoids SettingWithCopyWarning).
with_hits = res_df[res_df['entities'].map(len) > 0].copy()
print(f'{len(with_hits)} / {len(res_df)} rows tagged by model')

# Split each sentence on single spaces to get the terms sent to OLS.
with_hits['search_text'] = with_hits['sentences'].str.split(' ')

# Only look up the first 20 tagged rows; each word costs one OLS request.
sample_with_hits = with_hits.head(20).copy()
sample_with_hits['ols_search'] = sample_with_hits['search_text'].apply(ols_search_sentence)
sample_with_hits = sample_with_hits.drop('search_text', axis=1)
make_scrollable(sample_with_hits)

162 / 1000 rows tagged by model


ConnectionError: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /ols4/api/search?q=line.&ontology=clo,cl&type=class%2Cproperty&exact=false&rows=1&start=0&format=json&lang=en (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x30c3aba00>: Failed to establish a new connection: [Errno 12] Cannot allocate memory'))

In [None]:
# res_df.to_csv('sample_cell_line_res.csv')