In [None]:
import requests
from xml.etree import ElementTree
import pandas as pd

def get_taxon_genome_features(taxon_id, timeout=30):
    """Look up a taxon through NCBI E-utilities and collect its ``Feature`` entries.

    Runs ``esearch`` on the ``taxonomy`` database with ``taxon_id`` as the
    search term, takes the first ID of the result, and passes that ID to
    ``efetch`` on the ``genome`` database. Each ``Feature`` element of the
    returned XML that has non-empty ``Feature_key`` and ``Feature_value``
    children contributes one value to the result.

    Args:
        taxon_id: Search term sent to esearch.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        dict mapping every feature key to the list of its values in document
        order. An empty dict is returned when esearch yields no ID.

    Raises:
        requests.HTTPError: If either request comes back with an error status.
        xml.etree.ElementTree.ParseError: If the efetch body is not valid XML.
    """
    # Base URL for E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Passing ``params`` lets requests URL-encode the search term.
    search_response = requests.get(
        f"{base_url}esearch.fcgi",
        params={"db": "taxonomy", "term": taxon_id, "retmode": "json"},
        timeout=timeout,
    )
    search_response.raise_for_status()

    # A search without hits has an empty (or missing) idlist; indexing [0]
    # directly would raise IndexError.
    id_list = search_response.json().get("esearchresult", {}).get("idlist", [])
    if not id_list:
        print(f"No taxonomy record found for taxon identifier: {taxon_id}")
        return {}
    tax_id = id_list[0]

    # NOTE(review): db/rettype/retmode are kept exactly as in the original;
    # confirm that this efetch call really returns Feature_key/Feature_value
    # elements for the genome database.
    fetch_response = requests.get(
        f"{base_url}efetch.fcgi",
        params={"db": "genome", "id": tax_id, "rettype": "fasta", "retmode": "xml"},
        timeout=timeout,
    )
    fetch_response.raise_for_status()
    genome_data = ElementTree.fromstring(fetch_response.content)

    features = {}
    for feature in genome_data.findall('.//Feature'):
        # findtext returns None for a missing child instead of raising.
        feature_key = feature.findtext('Feature_key')
        feature_value = feature.findtext('Feature_value')
        if feature_key and feature_value:
            features.setdefault(feature_key, []).append(feature_value)

    return features

def save_to_tsv(data, output_file):
    """Write a dict of value lists to a tab-separated file, one column per key.

    Args:
        data: Mapping whose keys become the column headers and whose values
            become the entries of the corresponding column.
        output_file: Destination handed to ``DataFrame.to_csv``.
    """
    # orient='index' builds one row per key; .T flips the frame so that each
    # key ends up as a column and the positional index becomes the row label.
    table = pd.DataFrame.from_dict(data, orient='index').T

    # Tab-separated, with both the header row and the index column written.
    table.to_csv(output_file, sep='\t', index=True, header=True)

# Name of the TSV file the features are written to
output_tsv = "genome_features.tsv"

# Example taxon, given in CURIE form
taxon_id = "NCBITaxon:100053"

# Drop the 'NCBITaxon:' prefix before handing the ID to the NCBI lookup
clean_taxon_id = taxon_id.replace("NCBITaxon:", "")

# Fetch the genome features, then persist them as TSV
genome_features = get_taxon_genome_features(clean_taxon_id)
save_to_tsv(genome_features, output_tsv)

print(f"Genome features saved to {output_tsv}")

In [None]:
import requests

def get_uniprot_proteome_id(ncbi_taxon_id, timeout=30):
    """Return the proteome IDs that the UniProt proteomes search lists for a taxon.

    Args:
        ncbi_taxon_id: NCBI taxon ID, with or without the 'NCBITaxon:' prefix.
            Non-string values are converted with ``str``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list of the non-empty lines of the response body (one proteome ID per
        line), or None when the request does not return HTTP 200 or the body
        contains no IDs.
    """
    # Remove the prefix 'NCBITaxon:' if present
    ncbi_taxon_id = str(ncbi_taxon_id).replace('NCBITaxon:', '')

    # UniProt search URL
    url = 'https://rest.uniprot.org/proteomes/search'
    # NOTE(review): the query field name is kept from the original; confirm
    # that 'taxonomy' is a field the proteomes endpoint accepts.
    params = {
        'query': f'taxonomy:{ncbi_taxon_id}',
        'format': 'list'
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve data from UniProt for NCBI Taxon ID: {ncbi_taxon_id}")
        return None

    # Keep only non-blank lines: splitting an empty body on '\n' would give
    # [''], which is truthy and would be reported as a found proteome.
    proteome_ids = [line.strip() for line in response.text.splitlines() if line.strip()]
    if not proteome_ids:
        print(f"No proteome found for NCBI Taxon ID: {ncbi_taxon_id}")
        return None

    return proteome_ids

# Example usage: look up the proteome(s) of a single taxon
ncbi_taxon_id = 'NCBITaxon:100053'
clean_taxon_id = ncbi_taxon_id.replace("NCBITaxon:", "")
proteome_ids = get_uniprot_proteome_id(clean_taxon_id)

# A None result (lookup failed or nothing found) prints nothing.
for proteome_id in proteome_ids or []:
    print(f"Found UniProtKB proteome identifier: {proteome_id}")


In [None]:
# Look up the UniProt proteome(s) of every subject taxon.
# NOTE(review): data_pairs_clean is not defined in this part of the notebook;
# presumably it is created in an earlier cell - confirm.
for taxon in (subject.replace("NCBITaxon:", "") for subject in data_pairs_clean.subject):
    proteome_ids = get_uniprot_proteome_id(taxon)

    # A None result (lookup failed or nothing found) prints nothing.
    for proteome_id in proteome_ids or []:
        print(f"Found UniProtKB proteome identifier: {proteome_id}")

In [None]:
import requests
from xml.etree import ElementTree

def get_taxonomic_lineage(ncbi_taxon_id, timeout=30):
    """Fetch the lineage string that NCBI Taxonomy reports for a taxon.

    Args:
        ncbi_taxon_id: Taxon ID sent as the ``id`` parameter of efetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first ``Lineage`` element of the response, or None
        when the request does not return HTTP 200, the body is not
        well-formed XML, or no non-empty ``Lineage`` element is present.
        Each failure case prints a short message.
    """
    # NCBI EFetch URL for taxonomy database
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    # Parameters for fetching taxonomy information
    params = {
        "db": "taxonomy",
        "id": ncbi_taxon_id,
        "retmode": "xml"
    }

    # The timeout keeps one stalled request from hanging a batch run.
    response = requests.get(efetch_url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve data from NCBI for Taxon ID {ncbi_taxon_id}")
        return None

    # Report an unparsable body the same way as every other failure, so a
    # caller looping over many taxa is not aborted by a single bad response.
    try:
        root = ElementTree.fromstring(response.content)
    except ElementTree.ParseError:
        print(f"Failed to parse NCBI response for Taxon ID {ncbi_taxon_id}")
        return None

    # findtext gives None for a missing element and '' for an empty one.
    lineage = root.findtext(".//Lineage")
    if not lineage:
        print(f"Lineage information not found for Taxon ID {ncbi_taxon_id}")
        return None

    return lineage

# Example usage: print the lineage of a single taxon
ncbi_taxon_id = "9606"  # Homo sapiens Taxon ID
lineage = get_taxonomic_lineage(ncbi_taxon_id)

# Nothing is printed when the lookup returned None.
if lineage:
    print(f"Taxonomic lineage for NCBI Taxon ID {ncbi_taxon_id}: {lineage}")


In [None]:
# Collect one lineage (or None) per subject taxon, in input order.
# NOTE(review): data_pairs_clean is not defined in this part of the notebook;
# presumably it is created in an earlier cell - confirm.
lineages = []
for counter, taxon in enumerate(data_pairs_clean.subject):
    taxon = taxon.replace("NCBITaxon:", "")
    print(f'index: {counter}\ttaxon:{taxon}')
    lineage = get_taxonomic_lineage(taxon)
    lineages.append(lineage)
# Leave counter equal to the number of taxa processed.
counter = len(lineages)