In [1]:
import sys
import os
from pathlib import Path
from glob import glob
import pandas as pd
import json
from ast import literal_eval
import numpy as np
from classes.literature import literature
# Shared helper instance: the retrieval and parsing cells below all call methods on `lit`.
lit = literature()

## GET SELF-RESISTANCE LITERATURE

### PUBMED

In [None]:
# Load the table of PubMed abstracts and collect its PMIDs as a plain list of strings.
srdf = pd.read_csv('data/abstracts_self_resistance_and_gene.csv')
# need to convert to string for europepmc query
srdf['pmid'] = srdf['pmid'].map(str)
srpmids = srdf['pmid'].tolist()

##### GET PMCIDS FOR PMIDS

In [None]:
# Cross-reference the PMIDs collected above to PMC IDs.
# NOTE(review): presumably writes data/self_resistance_pubmed_pmcids.json, which the
# following cells read — confirm against classes/literature.py.
lit.xrf_pmc(
    'self_resistance_pubmed',
    'data',
    srpmids,
)

##### GET PMC XMLS FROM EUROPEPMC API

In [None]:
# Fetch full-text XML for the PMC IDs listed in the PubMed-derived JSON file;
# 'data/pmc_xml' is the folder argument handed to the helper.
lit.get_ft_xml(
    'data/pmc_xml',
    'data/self_resistance_pubmed_pmcids.json',
)

##### PARSE PMC XML AND EXTRACT SECTION TEXTS

In [None]:
# Parse the XML files for the PubMed-derived PMC IDs.
# NOTE(review): the two-element list looks like the text and JSON output folders — confirm.
lit.get_ft_text(
    'data/pmc_xml',
    ['data/pmc_txt', 'data/pmc_json'],
    'data/self_resistance_pubmed_pmcids.json',
)

### PMC

In [None]:
# Search the NCBI 'pmc' database with the query 'self-resistance AND gene'
# and keep the returned IDs for the mapping steps below.
pmcids = lit.get_ncbi_db_ids(
    'pmc',
    'self-resistance AND gene',
)
pmcids

##### NUCCORE IDS

In [None]:
# Map the PMC IDs to linked 'nuccore' record IDs.
# NOTE(review): 100 is presumably a batch size — confirm against the helper.
nids = lit.map_ncbi_db_ids(
    'pmc',
    'nuccore',
    pmcids,
    100,
)
nids

##### ACCESSION IDS

In [None]:
# Retrieve summary records for the nuccore IDs found above.
# NOTE(review): 'all' and 100 are passed through unchanged; presumably a field
# selector and a batch size — confirm against the helper.
accids = lit.get_ncbi_ids_summary(
    'nuccore',
    nids,
    'all',
    100,
)
accids

In [None]:
# Tabulate the nuccore summaries and save them next to the other data files.
pd.DataFrame(accids).to_csv(
    'data/self_resistance_nuccore_ids.csv',
)

##### PMCIDS

In [None]:
# Turn the IDs into 'PMC…' identifiers for the cross-reference and download steps below.
# IDs that already start with 'PMC' are left unchanged, so re-running this cell
# (which reassigns `pmcids` from itself) no longer produces 'PMCPMC…' values.
pmcids = [
    str(pmcid) if str(pmcid).startswith('PMC') else 'PMC' + str(pmcid)
    for pmcid in pmcids
]

In [None]:
# Cross-reference the prefixed PMC IDs.
# NOTE(review): presumably writes data/self_resistance_pmcids.json, which the
# following cells read — confirm against classes/literature.py.
lit.xrf_pmc(
    'self_resistance',
    'data',
    pmcids,
)

In [None]:
# Fetch full-text XML for the PMC IDs listed in the PMC-search JSON file;
# 'data/pmc_xml' is the folder argument handed to the helper.
lit.get_ft_xml(
    'data/pmc_xml',
    'data/self_resistance_pmcids.json',
)

In [None]:
# Parse the XML files for the PMC-search IDs.
# NOTE(review): the two-element list looks like the text and JSON output folders — confirm.
lit.get_ft_text(
    'data/pmc_xml',
    ['data/pmc_txt', 'data/pmc_json'],
    'data/self_resistance_pmcids.json',
)

## GET SELF-RESISTANCE GENE ENTITIES USING LLMs

##### notes
- This paper (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10031226/) does not give an accession for the BGC, but searching for "SacH" AND "Pseudomonas fluorescens" finds a cluster: https://www.ncbi.nlm.nih.gov/nuccore/116552886. Note that the species name must not be abbreviated in the query.

#### TASK AND EXAMPLES

In [2]:
# Instruction sent ahead of every context passage. It is assembled from adjacent
# string literals purely for readability; the resulting text is one unbroken string.
task = (
    "Read the following context and identify only self-resistance entities using the following format: "
    "['entity_1|self-resistance-gene-protein/self-resistance-compound/self-resistance-mechanism/accession/self-resistance-regulator/organism',"
    "'entity_2|self-resistance-gene-protein/self-resistance-compound/self-resistance-mechanism/accession/self-resistance-regulator/organism',"
    "'entity_3|self-resistance-gene-protein/self-resistance-compound/self-resistance-mechanism/accession/self-resistance-regulator/organism',"
    "...]. "
    "These can include self-resistance-gene-protein, self-resistance-compound, self-resistance-mechanism, accession, self-resistance-regulator, organism. "
    "Avoid creating sub-lists or combining entities within sub-lists. "
    "Do not provide any additional explanations, commentaries, or context. "
    "Your output should only contain the identified entities, not any additional information or interpretation."
    "If the text does not contain any self-resistance-related entities, your output should be an empty list []."
)

In [3]:
# Few-shot examples: each item pairs a context passage ('content') with the expected
# answer ('output'), written as the string form of a Python list of 'entity|label' items.
# Every 'output' must be a well-formed list literal, since it is shown to the model as
# the format to imitate.
content_output = [
    {
        'content':"However, similar to the gene-deletion experiments of berkC17, a self-resistance gene in Penicillium egyptiacum, after enormous attempts and screenings with different sucrose concentrations, it was still failed to get the sacH-deletion strain. Then E. coli BL21 (DE3) cells were used as the test strain to examine the relationship between sacH-expression and the resistance against SAC-B. The results showed that the survival ratio of sacH-expression strain did not decrease with the increase of SAC-B concentration, compared with the control strain containing vector pET28a (Fig.\xa02F). These collective results unambiguously supported that SacH, the DHFR-like protein, endows host cells resistance to SAC-B for self-protection via reductive inactivation of not only the end-product but also the hemiaminal pharmacophore-containing biosynthetic intermediate.",
        'output':"['berkC17|self-resistance-gene-protein', 'Penicillium egyptiacum|organism', 'SAC-B|self-resistance-compound', 'SacH|self-resistance-gene-protein', 'reductive inactivation|self-resistance-mechanism']"
    },
    {
        'content':"This result further supports that DHFR-like protein SacH from P. fluorescens plays the key role in reductively inactivating the hemiaminal pharmacophore for self-resistance with broad substrate scope, which functionally resembles SfmO1 in SFM biosynthesis from S. lavendulae.",
        # Closing quote restored after 'SfmO1|self-resistance-gene-protein'; without it the
        # example was not a valid list and merged the SfmO1 and SFM entries.
        'output':"['SacH|self-resistance-gene-protein', 'P. fluorescens|organism', 'reductively inactivating|self-resistance-mechanism','SfmO1|self-resistance-gene-protein','SFM|self-resistance-compound','S. lavendulae|organism']"
    },
    {
        'content':"We focused on two strains, BCI1 and BCI2, which are both Pseudonocardia sp. collected from Apterostigma dentigerum ant colonies on Barro Colorado Island, Panama. BCI2's high tolerance for 9-methoxyrebeccamycin implies a resistance mechanism on the pBCI2-2 plasmid. The integral membrane transporter RebT, a putative efflux pump in the major facilitator family, has been shown to confer rebeccamycin resistance when heterologously expressed in an otherwise sensitive Actinomycete, Streptomyces albus.15 A distinct member of this transporter family is encoded in both the BCI2 9-methoxyrebeccamycin and eDNA AB857 clusters (Figure 2B, brown shaded arrows) and is the most likely candidate for self-resistance. An uncharacterized putative transporter gene, rebU, is also present in all three clusters and could also contribute to resistance.",
        'output':"['Pseudonocardia sp|organism','BCI1|organism','BCI2|organism','9-methoxyrebeccamycin|self-resistance-compound','RebT|self-resistance-gene-protein','efflux pump|self-resistance-mechanism','rebeccamycin|self-resistance-compound','rebU|self-resistance-gene-protein','transporter gene|self-resistance-mechanism']"
    },
    {
        'content':"Nucleotide sequence accession number. The nucleotide sequences of rebeccamycin biosynthetic genes have been submitted to DDBJ under accession number AB090952.",
        'output':"['AB090952|accession','rebeccamycin|self-resistance-compound']"
    },
    {
        'content':"Here we report the unexpected discovery that althiomycin is produced by this organism, as a metabolic product of a previously unidentified biosynthetic gene cluster. We observed that S. marcescens Db10 produced a diffusible metabolite able to inhibit the growth of Bacillus subtilis. A cluster of six genes encoding a hybrid NRPS-PKS assembly line, two tailoring enzymes and a putative self-resistance protein were identified as responsible for production of the antimicrobial. In antibiotic producing organisms, resistance genes are commonly located within the antibiotic biosynthetic gene cluster [34]. We therefore hypothesised that Alb1 was required for self-resistance to althiomycin and/or export of althiomycin from the cell into the surrounding environment. We predicted that if alb1 was essential for self-resistance, it might not be possible to make an althiomycin-producing alb1 mutant due to lethality.",
        'output':"['althiomycin|self-resistance-compound','S. marcescens Db10|organism','Alb1|self-resistance-gene-protein','efflux|self-resistance-mechanism']"
    },
]

In [4]:
# Spot check: display the context passage of the first few-shot example.
content_output[0]['content']

'However, similar to the gene-deletion experiments of berkC17, a self-resistance gene in Penicillium egyptiacum, after enormous attempts and screenings with different sucrose concentrations, it was still failed to get the sacH-deletion strain. Then E. coli BL21 (DE3) cells were used as the test strain to examine the relationship between sacH-expression and the resistance against SAC-B. The results showed that the survival ratio of sacH-expression strain did not decrease with the increase of SAC-B concentration, compared with the control strain containing vector pET28a (Fig.\xa02F). These collective results unambiguously supported that SacH, the DHFR-like protein, endows host cells resistance to SAC-B for self-protection via reductive inactivation of not only the end-product but also the hemiaminal pharmacophore-containing biosynthetic intermediate.'

In [5]:
# Spot check: display the expected answer string of the first few-shot example.
content_output[0]['output']

"['berkC17|self-resistance-gene-protein', 'Penicillium egyptiacum|organism', 'SAC-B|self-resistance-compound', 'SacH|self-resistance-gene-protein', 'reductive inactivation|self-resistance-mechanism']"

In [6]:
# Flatten the few-shot pairs into an alternating user/assistant message list:
# the user turn is the task text immediately followed by 'CONTEXT: ' and the passage,
# the assistant turn is the expected answer string.
examples = [
    message
    for pair in content_output
    for message in (
        {'role':'user','content':task+'CONTEXT: '+pair['content']},
        {'role':'assistant','content':pair['output']},
    )
]
examples[0:2]

[{'role': 'user',
  'content': "Read the following context and identify only self-resistance entities using the following format: ['entity_1|self-resistance-gene-protein/self-resistance-compound/self-resistance-mechanism/accession/self-resistance-regulator/organism','entity_2|self-resistance-gene-protein/self-resistance-compound/self-resistance-mechanism/accession/self-resistance-regulator/organism','entity_3|self-resistance-gene-protein/self-resistance-compound/self-resistance-mechanism/accession/self-resistance-regulator/organism',...]. These can include self-resistance-gene-protein, self-resistance-compound, self-resistance-mechanism, accession, self-resistance-regulator, organism. Avoid creating sub-lists or combining entities within sub-lists. Do not provide any additional explanations, commentaries, or context. Your output should only contain the identified entities, not any additional information or interpretation.If the text does not contain any self-resistance-related entities

In [8]:
# Bundle the instruction and the few-shot messages and save them as UTF-8 JSON.
# ensure_ascii=False keeps non-ASCII characters in the examples as-is instead of \u escapes.
prompt = {'task':task,'examples':examples}
prompt_path = Path('data/llm/input/prompt.json')
# open(..., 'w') does not create missing folders, so make sure the target folder exists
# (e.g. on a fresh checkout).
prompt_path.parent.mkdir(parents=True, exist_ok=True)
with open(prompt_path, 'w', encoding='utf-8') as j:
    json.dump(prompt, j, ensure_ascii=False, indent=4)

#### GENERATE LLM INPUT

In [16]:
# Build the LLM input table from the sentence-annotation JSON files:
# one row per (sentence record, annotation, tag), then one grouped row per (id, text).
files = list(Path('../bgcs/emerald_bgcs_annotations/pubmed_output/sent-json').glob('**/*.json'))
llm_data = []
for f in files:
    # Skip any path containing 'checkpoint' (presumably Jupyter autosave copies — TODO confirm).
    if 'checkpoint' in str(f):
        continue
    # Read explicitly as UTF-8: the annotations contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(f, encoding='utf-8') as j:
        # NOTE(review): np.unique(...).tolist() is kept as in the original; its effect
        # depends on the JSON top-level type (dict vs. list) — verify against the files.
        sents = np.unique(json.load(j)).tolist()
    for sent in sents:
        sid = sent['id']
        for ann in sent['anns']:
            for tag in ann['tags']:
                llm_data.append({'id':sid,'text':ann['exact'],'entity':ann['type'],'term':tag['name']})
llm_data_df = pd.DataFrame(llm_data)
# Drop exact repeats of the same (id, text, entity, term) row, keeping the last occurrence.
llm_data_df = llm_data_df.drop_duplicates(subset=['id','text','entity','term'],keep='last').reset_index(drop=True)
# Optional filter, left disabled: restrict to gene / gene-product annotations only.
# llm_data_df = llm_data_df[llm_data_df['entity'].isin(['bgc-gene-name','bgc-gene-product-name'])]
# Collapse to one row per (id, text) with parallel lists of entity types and terms.
llm_data_dfg = llm_data_df.groupby(['id','text']).agg({'entity':list,'term':list}).reset_index()
llm_data_dfg

Unnamed: 0,id,text,entity,term
0,PMC10100954,(A) EMSA of LodR1 binding to the intergenic re...,"[bgc-gene-name, bgc-gene-name, bgc-gene-name, ...","[lodR1–lodA, lodA, lodR2–lodR3, lodR3]"
1,PMC10100954,(A) EMSA of LodR2 binding to the intergenic re...,"[bgc-gene-name, bgc-gene-name]","[lodE–lodR2, lodR2]"
2,PMC10100954,(A) Lasalocid A production in the S. lasalocid...,"[bgc-compound, bgc-organism]","[Lasalocid A, S. lasalocidi ATCC 31180T]"
3,PMC10100954,(A) Schematic diagram of gusA transcriptional ...,[bgc-gene-name],[gusA]
4,PMC10100954,(B) HPLC analysis of lasalocid A production in...,"[bgc-compound, bgc-gene-name, bgc-gene-name, b...","[lasalocid A, las4, las4C, lodR3, lodR3C]"
...,...,...,...,...
9542,PMC9854587,Within the gene cluster of tiancimycin (Figure...,"[bgc-compound, bgc-action, bgc-gene-name, bgc-...","[tiancimycin, resistance, tnmS1, tnmS2, tnmS3]"
9543,PMC9854587,Wright et al. identified the caseinolytic prot...,"[bgc-compound, bgc-gene-product-name, bgc-acti...","[clipibicyclene, ClpP, inhibitor, antibiotic, ..."
9544,PMC9854587,YTM is a potent genotoxic agent belonging to t...,"[bgc-compound, bgc-compound, bgc-action, bgc-a...","[YTM, YTM-producer, genotoxic, cytotoxicity, t..."
9545,PMC9854587,YtkR6 is homologous to the drug-resistance tra...,[bgc-gene-product-name],[YtkR6]


In [None]:
# Save the grouped annotation table as the full LLM input file
# (pandas writes the row index as the first CSV column by default).
llm_data_dfg.to_csv(
    Path('data/llm/input/input.csv'),
)

In [None]:
# Optional smoke-test subset: uncomment to write the first 10 grouped rows to test.csv,
# the file that the LLM command in the next cell takes as --input.
# llm_data_dfg[0:10].to_csv(Path('data/llm/input/test.csv'))

In [None]:
! python classes/llm.py --input 'data/llm/input/test.csv' --prompt 'data/llm/input/prompt.json' --model 'meta-llama/Llama-3.1-70B-Instruct' --output 'data/llm/output/output.csv' --mconfig 'quantize' --hf_token 'xxxxxxx'