# Open source model and data reviews, for variants / AEs

Related thoughts for use cases: 
- https://docs.google.com/document/d/1HDpOcQQOmT5A398j_qr-Hm3my4fjuZd8To1YTXhlXNg/edit?tab=t.0#heading=h.kt9y0vodzhfp
- https://docs.google.com/document/d/1KGyr_zPBAdTophA-XGzkW0ObhVNMoKBhioJdpVpdnA0/edit?tab=t.0#heading=h.kfugo3aqnalh
    - With this second example, we can see value in having models which can capture genevar, drug targets and adverse events

### Beginning with protein structure NER model, sourced from https://huggingface.co/PDBEurope/BiomedNLP-PubMedBERT-ProteinStructure-NER-v3.1

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("PDBEurope/BiomedNLP-PubMedBERT-ProteinStructure-NER-v3.1")
model = AutoModelForTokenClassification.from_pretrained("PDBEurope/BiomedNLP-PubMedBERT-ProteinStructure-NER-v3.1")

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [42]:
import pandas as pd
import requests
from pprint import pprint
from typing import List, Tuple

In [None]:
def query_epmc(query: str, page_size: int = 10) -> List:
    query = f"{query} HAS_FT:Y AND OPEN_ACCESS:Y"
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {
        "query": query,
        "format": "json",
        "pageSize": page_size
    }

    resp = requests.get(url, params=params)
    resp_json = resp.json()

    if 'resultList' in resp_json:
        # pprint(resp_json)
        pmcids = [i['pmcid'] for i in resp_json['resultList']['result']]
        return pmcids
    else:
        return []

In [20]:
papers = query_epmc(query="protein variant")
papers

['PMC11843469',
 'PMC10823009',
 'PMC11301264',
 'PMC10788875',
 'PMC10232650',
 'PMC9235876',
 'PMC9293687',
 'PMC9725131',
 'PMC10016848',
 'PMC8495913']

In [43]:
# Stuck with BioC given Melanie's model used the same / ease of use
def query_bioc(pmid: str) -> List:
    resp = requests.get(f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/unicode")
    resp_json = resp.json()
    # pprint(resp_json)
    text = []
    for search_hit in resp_json:
        docs = search_hit['documents']
        for doc in docs:
            passages = doc['passages']
            sections = []
            for block in passages:
                section = block['infons']['section_type']
                sections.append(section)
                text.append(block['text'])
                if section in ['INTRO', 'DISCUSS', 'RESULTS']:
                    text.append(block['text'])
            # print(list(set(sections)))
    return text

In [145]:
import torch

def run_model(
        input_text: List[str],
        batch_size: int=4,
        max_length: int=512) -> Tuple:

    all_results = []
    
    # Process in batches
    for i in range(0, len(input_text), batch_size):
        # Get the current batch
        batch_texts = input_text[i:i + batch_size]
        
        # Tokenize and pass to model
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

        with torch.no_grad():
            outputs = model(**inputs)

        # Decode predictions and configure id to label map
        predictions = torch.argmax(outputs.logits, dim=2)
        id_to_label = model.config.id2label

        # Review predictions
        for (original_text, input_ids, prediction_ids) in zip(batch_texts, inputs['input_ids'], predictions):
            
            # Decode tokens and predictions
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            labels = [id_to_label[pred_id.item()] for pred_id in prediction_ids]

            entities = []
            current_entity = None
            
            for token, label in zip(tokens, labels):
                if token in tokenizer.all_special_tokens:
                    continue

                if label.startswith("B-"):
                    if current_entity:
                        entities.append(current_entity)
                    
                    entity_type = label[2:]
                    current_entity = {"entity": token.replace("##", ""), "type": entity_type}
                
                elif label.startswith("I-"):
                    if current_entity:
                        if [x for x in [current_entity["entity"], token] if "-" in x]: 
                            token = " " + token.replace("##", "")
                            current_entity["entity"] += token
                        else:
                            token = " " + token.replace("##", "")
                            current_entity["entity"] += token
                    
                else:
                    if current_entity:
                        entities.append(current_entity)
                    current_entity = None
            
            if current_entity:
                entities.append(current_entity)

            all_results.append((original_text, entities))

    return all_results

In [40]:
'''
def run_model2(
        input_text: List[str],
        batch_size: int = 4,
        max_length: int = 512) -> Tuple:
    """
    Runs a named entity recognition (NER) model on a list of text inputs.
    This version correctly combines 'B-' and 'I-' parts of an entity.

    Args:
        input_text (List[str]): A list of strings to process.
        batch_size (int, optional): The number of texts to process in a single batch. Defaults to 4.
        max_length (int, optional): The maximum length for tokenization. Defaults to 512.

    Returns:
        Tuple: A tuple containing a list of tuples. Each inner tuple contains the
               original text and a list of identified entities.
    """
    all_results = []
    
    # Process in batches
    for i in range(0, len(input_text), batch_size):
        # Get the current batch
        batch_texts = input_text[i:i + batch_size]
        
        # Tokenize and pass to model
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs.to('cuda' if torch.cuda.is_available() else 'cpu') # Move tensors to GPU if available

        with torch.no_grad():
            outputs = model(**inputs)

        # Decode predictions and configure id to label map
        predictions = torch.argmax(outputs.logits, dim=2)
        id_to_label = model.config.id2label

        # Review predictions
        for (original_text, input_ids, prediction_ids) in zip(batch_texts, inputs['input_ids'], predictions):
            
            # Decode tokens and predictions
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            labels = [id_to_label[pred_id.item()] for pred_id in prediction_ids]

            entities = []
            current_entity = None
            
            for token, label in zip(tokens, labels):
                # Skip special tokens
                if token in tokenizer.all_special_tokens:
                    continue
                
                # Check for beginning of an entity
                if label.startswith("B-"):
                    # If there was a previous entity, append it
                    if current_entity:
                        entities.append(current_entity)
                    
                    entity_type = label[2:]
                    current_entity = {"entity": token.replace("##", ""), "type": entity_type}
                
                # Check for continuation of an entity
                elif label.startswith("I-"):
                    # If we are currently inside an entity, append the token
                    if current_entity and current_entity["type"] == label[2:]:
                        current_entity["entity"] += token.replace("##", "").replace(" ", "")
                    # If 'I-' label doesn't match the current entity type,
                    # treat it as a new entity (or a standalone 'I-' label, common error)
                    else:
                        if current_entity:
                            entities.append(current_entity)
                        
                        entity_type = label[2:]
                        current_entity = {"entity": token.replace("##", ""), "type": entity_type}

                # Check for 'O' / other labels
                else:
                    # If there was an entity, append it
                    if current_entity:
                        entities.append(current_entity)
                    current_entity = None
            
            # Append the last entity if the loop ends while one is present
            if current_entity:
                entities.append(current_entity)

            # Append the result for the current text
            all_results.append((original_text, entities))

    return all_results
    '''

'\ndef run_model2(\n        input_text: List[str],\n        batch_size: int = 4,\n        max_length: int = 512) -> Tuple:\n    """\n    Runs a named entity recognition (NER) model on a list of text inputs.\n    This version correctly combines \'B-\' and \'I-\' parts of an entity.\n\n    Args:\n        input_text (List[str]): A list of strings to process.\n        batch_size (int, optional): The number of texts to process in a single batch. Defaults to 4.\n        max_length (int, optional): The maximum length for tokenization. Defaults to 512.\n\n    Returns:\n        Tuple: A tuple containing a list of tuples. Each inner tuple contains the\n               original text and a list of identified entities.\n    """\n    all_results = []\n    \n    # Process in batches\n    for i in range(0, len(input_text), batch_size):\n        # Get the current batch\n        batch_texts = input_text[i:i + batch_size]\n        \n        # Tokenize and pass to model\n        inputs = tokenizer(batch_te

In [113]:
# test = run_model2(input_text)
# filter_res(test, "mutant")

In [158]:
input_text = query_bioc(pmid=papers[0])
# len(texts)
e = run_model(input_text=input_text)
# for (text, entity) in e:
#     if entity['type'] == "mutant":
#         print("Original text:")
#         pprint([x for x in original_text.split(sep='. ')])
#         print(f"Found term: {entity['entity']}\n - - - - - - - - -\n")

  return forward_call(*args, **kwargs)


In [159]:
def filter_res(results: List[Tuple], filter: str):
    for res in results:
        text = res[0]
        tagged = res[1]
        filtered = [x["entity"] for x in tagged if x["type"] == filter]
        if len(filtered) > 0:
            print("- - - - - - Original text - - - - -")
            pprint([x for x in text.split(sep='. ')])
            print(f"- - Terms tagged of type '{filter}' - -")
            pprint(filtered)

filter_res(e, "mutant")

- - - - - - Original text - - - - -
['Antibodies are critical tools in medicine and research, and their affinity '
 'for their target antigens is a key determinant of their efficacy',
 'Traditional antibody affinity maturation and interaction analyses are often '
 'hampered by time‐consuming steps such as cloning, expression, purification, '
 'and interaction assays',
 'To address this, we have developed FASTIA (Fast Affinity Screening '
 'Technology for Interaction Analysis), a novel platform that integrates rapid '
 'gene fragment preparation, cell‐free protein synthesis, and bio‐layer '
 'interferometry with non‐regenerative analysis',
 'Using this approach, we can analyze the intermolecular interactions of over '
 '20 variants over 2\u2009days, requiring only the parent protein expression '
 'plasmid and basic equipment',
 'We have demonstrated the ability of FASTIA to discriminate between '
 'single‐domain antibody variants with different binding affinities using the '
 'anti‐HEL 

In [102]:
for p in papers:
    e = run_model(input_text=p)

  return forward_call(*args, **kwargs)


In [48]:
# Some gemini example sentences, these are more obvious examples of variants
protein_variant_sentences = [
    "Studies on the protein-protein interaction network revealed that the R175H variant of p53 dramatically alters its binding partners, shifting its function from a tumor suppressor to an oncogenic driver.",
    "The structural consequences of the E6V substitution in hemoglobin are profound, leading to the formation of rigid polymers that deform red blood cells into their characteristic sickle shape.",
    "The D90A mutation in superoxide dismutase 1 (SOD1) highlights the complexity of disease mechanisms, as its pathogenicity in familial ALS is not due to a loss of catalytic activity but rather to a gain of toxic function.",
    "The ΔF508 deletion in the CFTR protein prevents its proper folding, triggering the cell's quality control system to target the misfolded protein for destruction in the endoplasmic reticulum.",
    "Kinase inhibitors are now a standard therapy for melanoma patients with the V600E mutation in BRAF, which constitutively activates the downstream signaling cascade.",
    "The single amino acid change A67T in transthyretin increases its propensity to misfold and aggregate, forming amyloid fibrils that deposit in the heart and other tissues.",
    "The pathogenicity of the A223P variant in fibrillin-1 lies in its disruption of calcium-binding sites, which are critical for the formation of stable microfibrils in the extracellular matrix.",
    "Patients with familial chylomicronemia resulting from the S267L variant of lipoprotein lipase exhibit severely impaired lipid clearance due to the protein's reduced catalytic efficiency.",
    "The R117H amino acid substitution in Factor V makes it resistant to cleavage by activated protein C, a critical step in down-regulating blood coagulation.",
    "The P101S amino acid change in the prion protein (PrP) is a crucial determinant of disease pathology, as it favors the conversion of the normal PrP ($PrP^C$) into the disease-associated scrapie form ($PrP^{Sc}$).",
    "The E280A mutation in presenilin 1 alters the cleavage of the amyloid precursor protein (APP), leading to an overproduction of the highly aggregation-prone Aβ42 peptide.",
    "Achondroplasia is caused by a G380R variant in fibroblast growth factor receptor 3 (FGFR3), which leads to the constitutive activation of the receptor even in the absence of its ligand.",
    "The K456N variant of cardiac troponin I impairs the calcium-dependent regulation of myocardial contraction, which is the underlying cause of hypertrophic cardiomyopathy.",
    "The C278F substitution in fibroblast growth factor receptor 2 (FGFR2) is a well-known example of a gain-of-function mutation, leading to premature fusion of the cranial sutures.",
    "The C282Y variant in the HFE protein disrupts its interaction with the transferrin receptor, leading to a breakdown in the regulation of iron absorption.",
    "The p.L320P variant of lamin A/C causes dilated cardiomyopathy by compromising the structural integrity of the nuclear lamina and disrupting normal gene expression patterns.",
    "The S295C mutation in myotilin creates a new cysteine residue, which can form inappropriate disulfide bonds, leading to the formation of aggregates that interfere with muscle function.",
    "The R402Q variant in the ATP-binding cassette transporter A4 (ABCA4) disrupts its ability to transport retinoids, leading to the accumulation of toxic compounds in the retinal pigment epithelium.",
    "A study of familial ALS patients identified the H46R mutation in SOD1, which was shown to lead to a misfolding and aggregation of the protein that is toxic to motor neurons.",
    "The E258K mutation in lysosomal acid lipase is a clear example of a loss-of-function variant, as it severely impairs the protein's ability to hydrolyze cholesterol esters.",
    "Individuals homozygous for the M34T variant in connexin 26 (GJB2) often present with non-syndromic hearing loss due to the disruption of gap junction function in the inner ear.",
    "Congenital contractural arachnodactyly is associated with the K197R mutation in fibrillin-2, a protein crucial for elastic fiber formation in connective tissues.",
    "The L268P mutation in type II collagen (COL2A1) results in a protein with impaired triple helix formation, leading to the skeletal abnormalities characteristic of Stickler syndrome.",
    "The P1213L variant in dynein heavy chain 1 (DYNC1H1) compromises the protein's motor function, resulting in defects in retrograde axonal transport that cause spinal muscular atrophy.",
    "The R782W mutation in α₁-antitrypsin (SERPINA1) results in a misfolded protein that is trapped in the liver, leading to a lack of circulating α₁-antitrypsin in the lungs.",
    "The R136Q amino acid substitution in ubiquitin carboxyl-terminal hydrolase L1 (UCHL1) affects its deubiquitinating activity, which may contribute to the accumulation of protein aggregates observed in Parkinson's disease.",
    "The P270S variant in glucocerebrosidase (GBA1) has been shown to reduce the enzyme's activity, which is a known risk factor for Parkinson's disease.",
    "The A30P mutation in α-synuclein is a rare but well-documented cause of familial Parkinson's disease, as it promotes the formation of neurotoxic fibrils and Lewy bodies.",
    "The R46L variant in proprotein convertase subtilisin/kexin type 9 (PCSK9) is a functional polymorphism that reduces the protein's ability to degrade LDL receptors, leading to lower levels of circulating LDL cholesterol.",
    "The A152T variant in the protein tau (MAPT) is a known risk factor for frontotemporal dementia, as it enhances the aggregation and misfolding of the protein."
]

In [114]:
# Gemini mimicking descriptions of paper results, for protein variants then non-coding genetic variants to see what happens
protein_variant_sentences_experimental_results = [
    "Circular dichroism spectroscopy of the R175H p53 variant revealed a significant decrease in α-helical content compared to the wild-type protein, indicating a loss of structural integrity.",
    "In our gel electrophoresis assays, the E6V hemoglobin variant showed a distinct band shift in non-denaturing conditions, consistent with polymer formation under deoxygenated states.",
    "Immunoblot analysis of cells expressing the D90A SOD1 variant demonstrated a marked increase in detergent-insoluble protein, suggesting a propensity for aggregation.",
    "Western blot analysis of the ΔF508 CFTR variant showed a complete absence of the mature, fully glycosylated band, confirming that the protein is not trafficked to the cell surface.",
    "In vitro kinase assays confirmed a 500-fold increase in the phosphorylation of ERK and MEK substrates when using the V600E BRAF variant, demonstrating constitutive activation.",
    "Fluorescence microscopy with Thioflavin S staining revealed abundant amyloid deposits in cardiac tissue sections from mice expressing the A67T transthyretin variant.",
    "Our surface plasmon resonance (SPR) data demonstrated that the A223P fibrillin-1 variant has a significantly reduced binding affinity for calcium ions (Kd = 450 nM vs 50 nM for WT).",
    "Lipolysis assays using the S267L lipoprotein lipase variant showed only 15% of the catalytic activity observed with the wild-type enzyme.",
    "Coagulation assays of plasma containing the R117H Factor V variant showed a 12-fold increase in clotting time after the addition of activated protein C, confirming its resistance to inactivation.",
    "Real-time quaking-induced conversion (RT-QuIC) assays demonstrated that the P101S prion protein variant seeded the rapid formation of fibrils at a concentration 100-fold lower than the wild-type protein.",
    "Cell-free cleavage assays confirmed that the E280A presenilin 1 variant preferentially generated the highly amyloidogenic Aβ42 peptide over the Aβ40 peptide.",
    "Dual-luciferase reporter assays confirmed a 20-fold increase in STAT3 signaling in cells expressing the G380R FGFR3 variant, even in the absence of exogenous FGF ligand.",
    "Calcium-sensitizing assays on skinned cardiac muscle fibers revealed that the K456N troponin I variant increased the calcium sensitivity of force generation by 2.5-fold.",
    "Immunofluorescence staining revealed that the C278F FGFR2 variant localized to the cell membrane at a higher density and for a longer duration than the wild-type protein.",
    "In our iron uptake experiments, cells expressing the C282Y HFE variant failed to show the characteristic down-regulation of transferrin receptor expression in response to high iron levels.",
    "Immunoblotting of nuclear fractions showed a fragmented lamin A/C network in cells from patients with the p.L320P variant, indicating a loss of nuclear integrity.",
    "Pull-down assays with a cysteine-reactive probe demonstrated that the S295C myotilin mutation forms intermolecular disulfide bonds, leading to the formation of high-molecular-weight aggregates.",
    "Liposome-based transport assays using the R402Q ABCA4 variant showed a 90% reduction in the translocation of all-trans-retinal across the membrane.",
    "Immunohistochemistry of spinal cord sections from transgenic mice expressing the H46R SOD1 variant revealed extensive intracellular inclusions in motor neurons, which were immunoreactive for ubiquitin.",
    "In vitro enzyme kinetics demonstrated that the E258K lysosomal acid lipase variant had a Vmax that was less than 5% of the wild-type enzyme, confirming a severe loss-of-function.",
    "Dye-transfer assays showed a complete failure of Lucifer yellow transfer between cells expressing the M34T connexin 26 variant, indicating a loss of gap junction function.",
    "Atomic force microscopy of extracellular matrix fibers from cells expressing the K197R fibrillin-2 variant revealed an aberrant, disorganized microfibril network.",
    "A triple helix formation assay using purified L268P collagen type II demonstrated a significant reduction in the protein's ability to form stable triple helices.",
    "Live-cell imaging of retrograde axonal transport in neurons with the P1213L dynein heavy chain 1 variant showed a 75% decrease in the speed of cargo movement toward the cell body.",
    "Immunofluorescence of liver tissue from a patient with the R782W α₁-antitrypsin variant revealed abundant inclusions of the protein within hepatocytes, confirming its failure to be secreted.",
    "The R136Q UCHL1 variant showed a 5-fold decrease in its isopeptidase activity against a ubiquitinated substrate in a mass spectrometry-based enzymatic assay.",
    "In a fluorometric enzyme assay, the P270S glucocerebrosidase variant exhibited only 40% of the catalytic activity observed with the wild-type protein.",
    "Transmission electron microscopy of neuronal cultures expressing the A30P α-synuclein variant showed the presence of abundant fibrillar aggregates, similar to those found in Lewy bodies.",
    "In our binding assays, the R46L PCSK9 variant demonstrated a 3-fold decrease in its binding affinity for the LDL receptor, leading to a reduced rate of receptor degradation.",
    "Confocal microscopy of cells expressing the A152T tau variant revealed a significant increase in the formation of puncta and filamentous structures, suggesting enhanced aggregation."
]

non_coding_variant_sentences = [
    "Using a dual-luciferase reporter assay, we demonstrated that the SNP rs6983267, located in a long-range enhancer for the MYC oncogene, significantly increased reporter gene expression by 2.5-fold in colon cancer cell lines.",
    "Chromatin immunoprecipitation followed by sequencing (ChIP-seq) for the transcription factor TCF7L2 revealed a loss of binding at the promoter of the TCF7L2 gene in the presence of the rs7903146 risk allele.",
    "Our massively parallel reporter assay (MPRA) data showed that the A>T variant in the 3' UTR of the NOS1AP gene reduced the expression of a linked fluorescent reporter by 40% in cardiomyocyte cells.",
    "Sanger sequencing of the TERT promoter in melanoma samples identified two recurrent non-coding variants, c.-124C>T and c.-146C>T, which were shown by RT-qPCR to increase TERT mRNA expression by 15- and 20-fold, respectively.",
    "A quantitative chromatin conformation capture (3C) assay demonstrated that the rs9349379 SNP in a genomic insulator element disrupts the physical loop connecting the ESR1 promoter to its upstream enhancer.",
    "Minigene splicing assays confirmed that the deep intronic variant IVS21-12C>T in the CFTR gene leads to the creation of a cryptic splice site, resulting in a mis-spliced transcript lacking exon 22.",
    "In a yeast-based screen, we found that a 15-bp deletion within the HBB promoter region completely abolished the binding of the GATA-1 transcription factor, consistent with the observed phenotype in beta-thalassemia patients.",
    "We performed allelic expression analysis on post-mortem brain tissue and found that the rs12913832 SNP in the OCA2 gene enhancer was strongly associated with a 50% decrease in its expression.",
    "Our electro mobility shift assay (EMSA) results showed that the C>G SNP in the MDM2 promoter creates a novel high-affinity binding site for the SP1 transcription factor, leading to increased MDM2 transcription.",
    "RNA-seq analysis of patient-derived lymphoblastoid cell lines with the rs2201397 SNP in the TRIM59 3' UTR showed a significant upregulation of TRIM59 transcript levels, which we hypothesize is due to altered microRNA binding.",
    "Using CRISPR-Cas9 to introduce the rs7552554 variant in the promoter region of the MYB oncogene, we observed a 3-fold increase in H3K27ac histone marks, indicating enhanced enhancer activity.",
    "The 5-kb deletion upstream of the SOX2 gene was found to completely abrogate the expression of the gene in embryonic stem cells, suggesting the removal of a critical long-range enhancer.",
    "In a high-throughput screen, we identified an A>G variant in the promoter of a lncRNA that significantly decreased its transcription by 75% in a disease-relevant cell model.",
    "Our Northern blot analysis revealed that the rs10825036 SNP in the 3' UTR of the MIR-499 gene resulted in a mature miRNA transcript that was 50% less abundant compared to the wild-type allele.",
    "Allele-specific chromatin accessibility assays (ATAC-seq) revealed a loss of an open chromatin region at the locus containing the rs12740374 variant, suggesting a disruption of a transcription factor binding site.",
    "The 3' UTR of the KRAS gene containing the rs61764370 variant was cloned into a reporter plasmid, and subsequent luciferase assays showed a significant reduction in luciferase activity, consistent with altered miRNA-mediated repression.",
    "Chr-RNA FISH experiments confirmed that the C>T mutation in the SHH limb enhancer region failed to form a chromatin loop with the SHH promoter, explaining the observed limb malformations.",
    "Reporter gene assays showed that the rs11603334 SNP in the ARAP1 promoter led to a 2-fold upregulation of gene expression in pancreatic beta cells, which was corroborated by decreased binding of PAX4 and PAX6 transcription factors in EMSA.",
    "We used CRISPR-based tiling scans to systematically test variants in the LDLR promoter, finding that a specific A>C substitution reduced promoter activity by 60%, a key finding for familial hypercholesterolemia.",
    "RNA-seq of patient tumor samples with the rs73183594 variant in the IRF4 enhancer showed a significant increase in IRF4 mRNA expression levels, suggesting a gain-of-function mechanism.",
    "Our allele-specific sequencing analysis of RNA from heterozygous individuals demonstrated that the rs3796548 SNP, a non-coding variant, leads to a 3-fold increase in the expression of the TULP3 gene.",
    "The deletion of an enhancer 150 kb upstream of the GATA6 gene was found to cause a loss of GATA6 protein expression in pancreatic progenitor cells, as confirmed by immunostaining.",
    "Using a high-resolution 4C-seq, we mapped the chromosomal interactions of the genomic region containing a non-coding variant, revealing the creation of a novel chromatin loop between a distant enhancer and an oncogene.",
    "A minigene assay showed that a single nucleotide variant (G>A) in the 5' untranslated region of an unknown gene caused an increase in ribosome readthrough and protein translation.",
    "Quantitative real-time PCR (qRT-PCR) from patient fibroblasts with a deep intronic variant in the IDUA gene revealed a reduced level of the fully spliced transcript and an increase in a mis-spliced isoform.",
    "The C>G variant in the promoter of the CASP8 gene, located in a CpG island, was shown by methylation-specific PCR to be associated with a complete loss of DNA methylation and subsequent gene silencing.",
    "In our chromatin accessibility assay, the A>G SNP in the FTO enhancer was found to disrupt the binding of the ARID5B transcription factor, leading to a downstream effect on gene regulation.",
    "Our reporter assay demonstrated that the rs1403565 variant in the PITX2 enhancer increased the transcriptional activity of a reporter construct by 30% in cardiac tissue cells.",
    "The insertion of a SINE retrotransposon into the promoter of the FGFR2 gene was shown by reporter assays to completely abolish the gene's promoter activity in fibroblasts.",
    "CRISPR activation experiments using a sgRNA targeting the promoter region of a lncRNA confirmed that a non-coding variant within this region was responsible for a significant reduction in gene expression."
]

In [147]:
gemini_egs = run_model(input_text=protein_variant_sentences)
filter_res(gemini_egs, "mutant")

- - - - - - Original text - - - - -
['Studies on the protein-protein interaction network revealed that the R175H '
 'variant of p53 dramatically alters its binding partners, shifting its '
 'function from a tumor suppressor to an oncogenic driver.']
- - Terms tagged of type 'mutant' - -
['r1', '75', 'h']
- - - - - - Original text - - - - -
['The structural consequences of the E6V substitution in hemoglobin are '
 'profound, leading to the formation of rigid polymers that deform red blood '
 'cells into their characteristic sickle shape.']
- - Terms tagged of type 'mutant' - -
['e6', 'v']
- - - - - - Original text - - - - -
['The D90A mutation in superoxide dismutase 1 (SOD1) highlights the complexity '
 'of disease mechanisms, as its pathogenicity in familial ALS is not due to a '
 'loss of catalytic activity but rather to a gain of toxic function.']
- - Terms tagged of type 'mutant' - -
['d', '90', 'a']
- - - - - - Original text - - - - -
['The ΔF508 deletion in the CFTR protein preve

In [148]:
gemini_exps = run_model(input_text=protein_variant_sentences_experimental_results)
filter_res(gemini_exps, "mutant")

- - - - - - Original text - - - - -
['Circular dichroism spectroscopy of the R175H p53 variant revealed a '
 'significant decrease in α-helical content compared to the wild-type protein, '
 'indicating a loss of structural integrity.']
- - Terms tagged of type 'mutant' - -
['r1', '75', 'h']
- - - - - - Original text - - - - -
['In our gel electrophoresis assays, the E6V hemoglobin variant showed a '
 'distinct band shift in non-denaturing conditions, consistent with polymer '
 'formation under deoxygenated states.']
- - Terms tagged of type 'mutant' - -
['e6', 'v']
- - - - - - Original text - - - - -
['Immunoblot analysis of cells expressing the D90A SOD1 variant demonstrated a '
 'marked increase in detergent-insoluble protein, suggesting a propensity for '
 'aggregation.']
- - Terms tagged of type 'mutant' - -
['d', '90', 'a']
- - - - - - Original text - - - - -
['Western blot analysis of the ΔF508 CFTR variant showed a complete absence of '
 'the mature, fully glycosylated band, con

In [149]:
gemini_gene = run_model(input_text=non_coding_variant_sentences)
filter_res(gemini_gene, "mutant")

- - - - - - Original text - - - - -
['Our massively parallel reporter assay (MPRA) data showed that the A>T '
 "variant in the 3' UTR of the NOS1AP gene reduced the expression of a linked "
 'fluorescent reporter by 40% in cardiomyocyte cells.']
- - Terms tagged of type 'mutant' - -
['a > t']
- - - - - - Original text - - - - -
['Sanger sequencing of the TERT promoter in melanoma samples identified two '
 'recurrent non-coding variants, c.-124C>T and c.-146C>T, which were shown by '
 'RT-qPCR to increase TERT mRNA expression by 15- and 20-fold, respectively.']
- - Terms tagged of type 'mutant' - -
['c . - 124 c > t', 'c . - 146 c > t']
- - - - - - Original text - - - - -
['Minigene splicing assays confirmed that the deep intronic variant '
 'IVS21-12C>T in the CFTR gene leads to the creation of a cryptic splice site, '
 'resulting in a mis-spliced transcript lacking exon 22.']
- - Terms tagged of type 'mutant' - -
['iv', 's2', '1 - 12 c > t']
- - - - - - Original text - - - - -
['Our e

In [156]:
filter_res(gemini_gene, "protein")

- - - - - - Original text - - - - -
['Chromatin immunoprecipitation followed by sequencing (ChIP-seq) for the '
 'transcription factor TCF7L2 revealed a loss of binding at the promoter of '
 'the TCF7L2 gene in the presence of the rs7903146 risk allele.']
- - Terms tagged of type 'protein' - -
['tcf', '7', 'l2', 'tcf', '7', 'l2']
- - - - - - Original text - - - - -
['Our massively parallel reporter assay (MPRA) data showed that the A>T '
 "variant in the 3' UTR of the NOS1AP gene reduced the expression of a linked "
 'fluorescent reporter by 40% in cardiomyocyte cells.']
- - Terms tagged of type 'protein' - -
['nos', '1a', 'p']
- - - - - - Original text - - - - -
['Sanger sequencing of the TERT promoter in melanoma samples identified two '
 'recurrent non-coding variants, c.-124C>T and c.-146C>T, which were shown by '
 'RT-qPCR to increase TERT mRNA expression by 15- and 20-fold, respectively.']
- - Terms tagged of type 'protein' - -
['tert', 'tert']
- - - - - - Original text - - - - -

In [157]:
gemini_gene

[('Using a dual-luciferase reporter assay, we demonstrated that the SNP rs6983267, located in a long-range enhancer for the MYC oncogene, significantly increased reporter gene expression by 2.5-fold in colon cancer cell lines.',
  [{'entity': 'dual - luciferase reporter assay',
    'type': 'experimental_method'},
   {'entity': 'rs', 'type': 'gene'},
   {'entity': '69', 'type': 'gene'},
   {'entity': '83', 'type': 'gene'},
   {'entity': '267', 'type': 'gene'},
   {'entity': 'long - range enhancer', 'type': 'structure_element'},
   {'entity': 'myc', 'type': 'protein_type'}]),
 ('Chromatin immunoprecipitation followed by sequencing (ChIP-seq) for the transcription factor TCF7L2 revealed a loss of binding at the promoter of the TCF7L2 gene in the presence of the rs7903146 risk allele.',
  [{'entity': 'chromatin immunoprecipitation followed by sequencing',
    'type': 'experimental_method'},
   {'entity': 'chip - seq', 'type': 'experimental_method'},
   {'entity': 'transcription factor', 't

### AE classifier - not worth chasing

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# This model is a classifier: AE is / is not present in text
tokenizer = AutoTokenizer.from_pretrained("jocforero/longformer-adverse-event-classifier")
ae_model = AutoModelForSequenceClassification.from_pretrained("jocforero/longformer-adverse-event-classifier")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/961 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/595M [00:00<?, ?B/s]

In [17]:
gemini_ae_egs = [
    "Gastrointestinal bleeding occurred in two patients receiving the active treatment."
    "One participant in the control group experienced anaphylactic shock requiring immediate medical intervention.",
    "The most frequently reported adverse event was mild-to-moderate headache, observed in 15% of the study population.",
    "Three cases of acute renal failure were documented, leading to discontinuation of the study drug.",
    "A dose-dependent increase in serum transaminases was observed in a subset of participants.",
    "The device's implantation was associated with local tissue necrosis in 5% of subjects.",
    "A single instance of pulmonary embolism was reported in a patient with no prior history of thrombotic events.",
    "Patients on the higher dose of the compound exhibited a significant increase in systolic blood pressure.",
    "Study withdrawal was attributed to unmanageable nausea and vomiting in four subjects.",
    "One participant developed a maculopapular rash that was determined to be a hypersensitivity reaction to the study medication.",
    "This is a test, nothing to see here.", # Should be false, completely out of context
    "The study was conducted in a double-blind, placebo-controlled manner.", # Should be false, in context but 0 AE
    "An adverse event of headache was reported." # Can't get much more obvious a statement
    ]

In [16]:
import torch

batch_size=4
max_length=512

all_results = []
    
# Process in batches
for i in range(0, len(gemini_ae_egs), batch_size):
    # Get the current batch
    batch_texts = gemini_ae_egs[i:i + batch_size]
    
    # Tokenize and pass to model
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    print(batch_texts)
    with torch.no_grad():
        outputs = ae_model(**inputs)

    # Decode predictions and configure id to label map
    predictions = torch.argmax(outputs.logits, dim=1)
    print(predictions)
    id_to_label = ae_model.config.id2label
    print(id_to_label)

['Gastrointestinal bleeding occurred in two patients receiving the active treatment.One participant in the control group experienced anaphylactic shock requiring immediate medical intervention.', 'The most frequently reported adverse event was mild-to-moderate headache, observed in 15% of the study population.', 'Three cases of acute renal failure were documented, leading to discontinuation of the study drug.', 'A dose-dependent increase in serum transaminases was observed in a subset of participants.']
tensor([0, 0, 0, 0])
{0: 'LABEL_0', 1: 'LABEL_1'}
["The device's implantation was associated with local tissue necrosis in 5% of subjects.", 'A single instance of pulmonary embolism was reported in a patient with no prior history of thrombotic events.', 'Patients on the higher dose of the compound exhibited a significant increase in systolic blood pressure.', 'Study withdrawal was attributed to unmanageable nausea and vomiting in four subjects.']
tensor([0, 0, 0, 0])
{0: 'LABEL_0', 1: '