# Genes
The following notebook retrieves all the genes involved in the reactions from the **"Rxns" Sheet** in the Google Sheet file. Then, information regarding the genes is retrieved from different databases and a **"Genes" Sheet** is generated and updated.

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

from google_sheet import GoogleSheet
from utils import get_gene_info

### 1. Generate "rxns" and "genes" datasets
The "rxns" dataset contains all the reactions with their GPR associations. The "genes" dataset contains all the information of the genes/GPR involved in our reconstruction.

In [2]:
# Names of the two sheets (tabs) this notebook reads and updates
sheet_rxns, sheet_genes = 'Rxns', 'Genes'

# Credentials file and ID of the spreadsheet holding the reconstruction
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'
KEY_FILE_PATH = 'credentials.json'

# Open the spreadsheet through the project's GoogleSheet helper
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Load both sheets ("Rxns" first, then "Genes"); the cells below work on `rxns` and `genes`
rxns, genes = (sheet.read_google_sheet(sheet_name) for sheet_name in (sheet_rxns, sheet_genes))

### 2. Create a "gene_list" with all the genes involved in our reconstruction
We extract the gene IDs from the GPR annotations in the **"rxns" df**. This information is only retrieved from the **GPR_final** column.

In [3]:
# Build gene_list from every gene referenced in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

gene_list = []
human_gene_list = []
for gpr_rule in rxns['GPR_final']:
    if gpr_rule == '':
        continue  # reaction without a GPR association
    # A token is a whole-word integer, optionally prefixed with 'h'/'H'
    for token in re.findall(r'\b[hH]?\d+\b', str(gpr_rule)):
        if token[:1] in ('h', 'H'):
            # prefixed IDs go to the human list, stored without the prefix
            human_gene_list.append(int(token[1:]))
        else:
            gene_list.append(int(token))

# Keep each non-prefixed ID once and show how many there are
gene_list = list(set(gene_list))
len(gene_list)

3045

### 3. Replace remaining Human Gene IDs in our dataset with CHO Gene IDs
Using the **cho2human_mapping** file, we replace the Human gene IDs with those of CHO. These IDs were not spotted when we first added the Recon3D reactions.

In [4]:
# Lookup table: human gene ID (HUMAN_ID column) -> CHO gene ID (CHO_ID column)
orthologs = pd.read_csv('../Data/Orthologs/cho2human_mapping.tsv', sep='\t')
human_to_cho_dict = {
    human_id: cho_id
    for human_id, cho_id in zip(orthologs['HUMAN_ID'], orthologs['CHO_ID'])
}

def replace_human_ids_with_cho_ids(s, mapping=None):
    """Replace 'h'/'H'-prefixed human gene IDs in a GPR string with CHO gene IDs.

    Args:
        s: GPR rule. Anything that is not a ``str`` is returned unchanged.
        mapping: dict of human gene ID (int) -> CHO gene ID. Defaults to the
            module-level ``human_to_cho_dict``.

    Returns:
        Tuple ``(new_s, replaced_dict)`` where ``replaced_dict`` maps every
        replaced token (prefix included, e.g. ``'h12'``) to the CHO ID used.
    """
    if mapping is None:
        mapping = human_to_cho_dict
    replaced_dict = {}
    if not isinstance(s, str):
        return s, replaced_dict

    def _swap(match):
        gene = match.group(0)
        human_id = int(gene[1:])  # drop the 'h'/'H' prefix
        if human_id not in mapping:
            return gene  # no known ortholog: leave the token as it is
        cho_id = mapping[human_id]
        replaced_dict[gene] = cho_id
        return str(cho_id)

    # Substitute whole tokens only. A plain str.replace('h12', ...) would also
    # rewrite the beginning of a longer ID such as 'h123'.
    return re.sub(r'\b[hH]\d+\b', _swap, s), replaced_dict

# Initialize a dictionary to store replacements
replacements = {}

# Walk the real index labels: `.at` is label-based, so range(len(rxns)) is only
# correct for a default 0..n-1 index and would append new rows on any other index.
for i in rxns.index:
    rxns.at[i, 'GPR_final'], replacements_dict = replace_human_ids_with_cho_ids(rxns.at[i, 'GPR_final'])
    replacements.update(replacements_dict)

# Now, 'replacements' is a dictionary where keys are the original gene IDs and values are the replaced CHO gene IDs
replacements

{}

In [5]:
######################################
#### ---------------------------- ####
#### ---- Update Rxns Sheet ----- ####
#### ---------------------------- ####
######################################
# Write the "Rxns" sheet back only when at least one gene ID was replaced
if not replacements:
    print("No changes to the original Rxns Sheet")
else:
    sheet.update_google_sheet(sheet_rxns, rxns)
    print("Google Sheet updated.")

No changes to the original Rxns Sheet


### 3. Add the genes from the "gene_list" to the genes df
Using a list of all the genes included in the dataset, we retrieve the Gene Symbol, Gene Name, Gene Ensembl ID, mRNA ID and protein ID of each gene from the NIH database using the function get_gene_info().

In [7]:
import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Maximum number of retries after an HTTP error before a gene is skipped
c = 0  # Number of genes added to the "genes" dataset by this cell

# Entrez IDs already in the dataset, normalised to str so that IDs read from the
# sheet and the int IDs of gene_list compare equal. Built once instead of on every iteration.
known_gene_ids = {str(x) for x in genes['Gene Entrez ID']}
new_rows = []  # rows are collected here and appended to `genes` in one go

for g in tqdm(gene_list):
    if str(g) in known_gene_ids:
        continue
    retries = 0
    while True:
        try:
            (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
             chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g)
        except ValidationError:
            print(f'Gene {g} not found')
            break
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            print(f'HTTP Error, retrying with gene {g}')
            time.sleep(1)
            continue

        if organism == 'Cricetulus griseus':
            new_rows.append({'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name,
                             'Gene Description': gene_description, 'PICR Ensembl ID': picr_ensembl_id,
                             'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                             'Protein ID': protein_ncbi_id, 'GO Terms': go_terms})
            known_gene_ids.add(str(g))
            c += 1
        else:
            # Only CHO genes are kept; anything else is reported and skipped
            print(f'Gene {g,gene_symbol} is a {organism} Gene')
        break

if new_rows:
    # New rows continue the positional numbering, as a row-by-row append would
    new_rows_df = pd.DataFrame(new_rows, index=range(len(genes), len(genes) + len(new_rows)))
    genes = pd.concat([genes, new_rows_df])

print(f'A total of {c} Genes were added to the dataset')

  0%|          | 0/3045 [00:00<?, ?it/s]

30100754813
Gene 30100754813 does not have PICR Ensembl ID
Gene (30100754813, 'ycf15') is a Vitis cinerea Gene
A total of 0 Genes were added to the dataset


In [8]:
#######################################
#### ----------------------------- ####
#### ---- Update Genes Sheet ----- ####
#### ----------------------------- ####
#######################################
# Upload the "Genes" sheet only when the previous cell added at least one gene
if c == 0:
    print("No new genes added.", "No changes to the original Genes Sheet.", sep="\n")
elif c > 0:
    sheet.update_google_sheet(sheet_genes, genes)
    print("Google Sheet updated.")

No new genes added.
No changes to the original Genes Sheet.


### 4. Eliminate unwanted genes
We iterate over the entire dataset to spot **Human genes** in order to eliminate them.

In [None]:
# Rows are removed from a copy so the frame being iterated stays untouched
erased = 0
genes_copy = genes.copy()

for index, row in tqdm(genes.iterrows(), total=genes.shape[0]):
    g = row['Gene Entrez ID']
    retries = 0
    while True:
        try:
            (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
             chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g)
        except ValidationError:
            print(f'Gene {g} not found')
            break
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            print(f'HTTP Error, retrying with gene {g}')
            time.sleep(1)
            continue

        # Lookup succeeded: drop the row when the gene belongs to human
        if organism == 'Homo sapiens':
            genes_copy = genes_copy.drop(index)
            print(f'Gene {g,gene_symbol} erased from dataset')
            erased += 1
        break

# Replace the original DataFrame with the filtered one
genes = genes_copy
print(f'A total of {erased} Genes were erased from the original dataset')

In [None]:
#######################################
#### ----------------------------- ####
#### ---- Update Genes Sheet ----- ####
#### ----------------------------- ####
#######################################
# Upload the "Genes" sheet only when the previous cell erased at least one gene
if erased == 0:
    print("No erased genes.", "No changes to the original Genes Sheet.", sep="\n")
elif erased > 0:
    sheet.update_google_sheet(sheet_genes, genes)
    print("Google Sheet updated.")

In [None]:
# Add every gene of gene_list that is still missing from the "genes" dataset.
# IDs are compared as strings: gene_list holds ints, while the cell that first
# fills the dataset checks `str(g)` against this column.
known_gene_ids = {str(x) for x in genes['Gene Entrez ID']}

for g in tqdm(gene_list):
    if str(g) in known_gene_ids:
        continue
    try:
        # get_gene_info returns the organism first, followed by the eight gene fields
        (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
         chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g)
    except KeyError:
        print(f'Gene {g} not found')
        continue
    if organism != 'Cricetulus griseus':
        # Same rule as the main "add genes" cell: only CHO genes enter the dataset
        continue
    new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                    'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                    'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
    new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
    genes = pd.concat([genes, new_row_df])
    known_gene_ids.add(str(g))

In [None]:
# Fetch information from the NIH database


# Complete null or blank information in the already generated "Genes Sheet" dataset
info_columns = ['Gene Symbol', 'Gene Name', 'Gene Description', 'PICR Ensembl ID',
                'CHOK1GS Ensembl ID', 'Transcript ID', 'Protein ID', 'GO Terms']
# A row is considered incomplete when any of these is blank
required_columns = ['Gene Symbol', 'Gene Name', 'PICR Ensembl ID', 'Transcript ID', 'Protein ID']

# Genes of gene_list not yet in the dataset (compared as str: gene_list holds ints)
gene_sheet_ids = {str(x) for x in genes['Gene Entrez ID']}
missing_genes = iter([g for g in gene_list if str(g) not in gene_sheet_ids])

# Values are written with genes.at[...]: assigning to the row yielded by
# iterrows() only changes a copy and never reaches the DataFrame.
# NOTE(review): assumes the index labels of `genes` are unique.
for i in list(genes.index):
    is_blank_row = genes.at[i, 'Gene Entrez ID'] == ''
    if is_blank_row:
        # Blank row: fill it with the next gene that is still missing
        g = next(missing_genes, None)
        if g is None:
            continue
    elif any(genes.at[i, column] == '' for column in required_columns):
        # Incomplete row: look up this row's own gene
        g = genes.at[i, 'Gene Entrez ID']
    else:
        continue

    # get_gene_info returns the organism first, followed by the eight info fields
    organism, *gene_info = get_gene_info(g)
    if is_blank_row:
        if organism != 'Cricetulus griseus':
            continue  # only CHO genes are added to the dataset
        genes.at[i, 'Gene Entrez ID'] = g
    for column, value in zip(info_columns, gene_info):
        genes.at[i, column] = value
'''
# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for g in gene_list:
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue
            
'''

In [None]:
# Display the current "genes" dataset
genes

In [None]:
# Manual spot check: look up a single Entrez gene ID with get_gene_info
get_gene_info('100750772')

In [None]:
from Bio import Entrez
Entrez.email = 'account1@theta-ocean-377718.iam.gserviceaccount.com'

# Fetch the gene record once (the same ID used to be requested twice) and
# release the connection as soon as the XML has been parsed.
human_handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
try:
    human_record = Entrez.read(human_handle)[0]
finally:
    human_handle.close()
# Former names of the first, identical fetch
handle = human_handle
record = human_record

#gene_name = record['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
#gene_symbol = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

# Prefer the text of the first comment entry, then the gene summary, else blank.
# An empty 'Entrezgene_comments' list falls through instead of raising IndexError.
human_comments = human_record.get('Entrezgene_comments') or []
if human_comments and 'Gene-commentary_comment' in human_comments[0]:
    human_gene_description = human_comments[0]['Gene-commentary_comment'][0]['String']
elif 'Entrezgene_summary' in human_record:
    human_gene_description = human_record['Entrezgene_summary']
else:
    human_gene_description = ''

#### Get the CHO genome and map the IDs

In [41]:
import pandas as pd
from Bio import Entrez
import math
import time
import ssl
from urllib.error import HTTPError
import socket
from Bio.Entrez.Parser import CorruptedXMLError

def get_uniprot_from_dict(dictionary, uniprotIDs, target_key, target_value1,  target_value2):
    """Recursively collect 'Object-id_str' values from matching dict nodes.

    A node matches when its ``target_key`` entry equals ``target_value1`` or
    ``target_value2``. For each match, the value found under
    'Dbtag_tag' -> 'Object-id' -> 'Object-id_str' of that same node is
    appended to ``uniprotIDs`` (``None`` when that path is missing).
    Nested dicts, and dicts held directly in lists, are searched as well.

    Returns:
        The ``uniprotIDs`` list that was passed in, extended in place.
    """
    for key, value in dictionary.items():
        if key == target_key:
            if value == target_value1 or value == target_value2:
                tag = dictionary.get('Dbtag_tag', {})
                object_id = tag.get('Object-id', {})
                uniprotIDs.append(object_id.get('Object-id_str'))

        # Normalise the value into the list of children that may need a visit
        if isinstance(value, dict):
            children = [value]
        elif isinstance(value, list):
            children = value
        else:
            children = []
        for child in children:
            if isinstance(child, dict):
                get_uniprot_from_dict(child, uniprotIDs, target_key, target_value1, target_value2)
    return uniprotIDs

def get_human_mouse_orthologs(dictionary, orthologs, target_key, target_value1):
    """Recursively collect 'Object-id_id' values from matching dict nodes.

    A node matches when its ``target_key`` entry equals ``target_value1``.
    For each match, the value found under
    'Dbtag_tag' -> 'Object-id' -> 'Object-id_id' of that same node is appended
    to ``orthologs`` (``None`` when that path is missing). Nested dicts, and
    dicts held directly in lists, are searched as well.

    Returns:
        The ``orthologs`` list that was passed in, extended in place.
    """
    for key, value in dictionary.items():
        if key == target_key:
            if value == target_value1:
                tag = dictionary.get('Dbtag_tag', {})
                object_id = tag.get('Object-id', {})
                orthologs.append(object_id.get('Object-id_id'))

        # Normalise the value into the list of children that may need a visit
        if isinstance(value, dict):
            children = [value]
        elif isinstance(value, list):
            children = value
        else:
            children = []
        for child in children:
            if isinstance(child, dict):
                get_human_mouse_orthologs(child, orthologs, target_key, target_value1)
    return orthologs
                    
# Entrez.email = "athand01@gmail.com"
# Contact address set on Biopython's Entrez module before the queries in the cells below
Entrez.email = "a.antonakoudis@sartorius.com"
# CHO genome table; the commented-out loop in the next cell reads its 'NCBI GeneID' column
u_g = pd.read_excel('../CHO_Genome.xlsx')
# Ortholog / GO-term table; queried below through its 'CHO GeneID' and 'GO_ids' columns
GO_terms = pd.read_csv('orthologs&GO.txt')

In [192]:
# Accumulators meant for the (currently disabled) loop over every gene in `u_g`
gene_symbol_list = []

gene_synoyms = []
ncbi_protein_NP_weirdName = []
ncbi_geneID_list = []
ncbi_gene_exon_size = []
# ncbi_genomic_location = []
uniprot_IDs = []
orthologs_genes = []
orthologs_organism = []
aa_sequences = []
gene_record = []

counter = 0
# for gene_EntrezID in u_g['NCBI GeneID'][counter:1]:
if True:
    gene_EntrezID = 100774298
    protein_NP_weirdName = []

    # Placeholders for every value reported at the end of the cell, so the
    # "gene not found" and "retries exhausted" paths print '-----' instead of
    # failing on a name that was never assigned.
    gene_symbol = gene_synonyms = gene_name = gene_description = '-----'
    gene_ncbiID = gene_uniprotIDs = go_terms = cogs = '-----'
    exon = exon_count = chromosome = strand = '-----'
    human_ortholog_EntrezID = mouse_ortholog_EntrezID = '-----'
    orthologs = orthologs_organism_list = '-----'
    amino_acid_sequence = '-----'
    aa_bool = 'NOT Found AA_Seq'

    if not math.isnan(gene_EntrezID):
        gene_EntrezID = int(gene_EntrezID)
        max_retries = 3
        retries = 0
        gene_found = True  # True while the gene still has to be processed
        while retries < max_retries and gene_found:

            try:

                search_results = Entrez.esearch(db="gene", term = gene_EntrezID)
                record = Entrez.read(search_results)

                # Check if any gene records were found
                if int(record["Count"]) > 0:

                    # Get the Gene ID of the first result
                    gene_id = record["IdList"][0]

                    # Fetch the gene information, maximum 3 tries
                    # (the counter used to be incremented twice per corrupted read)
                    fetch_tries = 0
                    fetch_found = True  # True while no parsable record has been read
                    while fetch_tries < 3 and fetch_found:

                        gene_info = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
                        fetch_tries += 1
                        try:
                            gene_record = Entrez.read(gene_info)
                            fetch_found = False
                        except CorruptedXMLError as xml_error:
                            time.sleep(3)

                    if fetch_found:
                        # Every fetch returned corrupted XML: count it as a failed
                        # attempt of the outer loop instead of indexing an empty record
                        search_results.close()
                        retries += 1
                        continue

                    # Get the gene name and gene NCBI ID
                    if 'Gene-ref_locus' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
                        gene_symbol = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']
                    else:
                        gene_symbol = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus-tag']

                    # Get the Gene Synonyms
                    if 'Gene-ref_syn' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
                        gene_synonyms = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_syn']
                    else:
                        gene_synonyms = []
                    gene_ncbiID = gene_record[0]['Entrezgene_track-info']['Gene-track']['Gene-track_geneid']

                    # Get the Gene Description

                    # Get UniprotIDs
                    gene_uniprotIDs = (get_uniprot_from_dict(gene_record[0], [], 'Dbtag_db', 'UniProtKB/TrEMBL', 'UniProtKB/Swiss-Prot'))

                    # Get the Gene Name
                    gene_name = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']

                    # Get the chromosome; names that are not plain numbers are kept as text
                    chromosome_name = gene_record[0]['Entrezgene_source']['BioSource']['BioSource_subtype'][0]['SubSource_name']
                    chromosome = int(chromosome_name) if str(chromosome_name).isdigit() else chromosome_name

                    # Add sub-cellular localization
                    # Add protein-protein interaction

                    # Get the location in the genome
                    # locus = gene_record[0].get('Entrezgene_locus', {})
                    # start = locus[0]['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_from']
                    # end = locus[0]['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_to']
                    # gene_location = start

                    # Get number of Exons
                    exon_count = int(gene_record[0]['Entrezgene_properties'][0]['Gene-commentary_text'])

                    # Get the Strand
                    strand = gene_record[0]['Entrezgene_locus'][0]['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_strand']['Na-strand'].attributes['value']

                    # Get the gene weird NP name
                    if 'Gene-commentary_products' in gene_record[0]['Entrezgene_locus'][0]:
                        first_product = gene_record[0]['Entrezgene_locus'][0]['Gene-commentary_products'][0]
                        try:
                            protein_NP_weirdName = first_product['Gene-commentary_products'][0]['Gene-commentary_accession']
                        except (KeyError, IndexError, TypeError):
                            # No nested product: fall back to the product's own accession, then its label
                            if 'Gene-commentary_accession' in first_product:
                                protein_NP_weirdName = first_product['Gene-commentary_accession']
                            elif 'Gene-commentary_label' in first_product:
                                protein_NP_weirdName = first_product['Gene-commentary_label']
                            else:
                                protein_NP_weirdName = '-----'
                    else:
                        protein_NP_weirdName = '-----'

                    # Get Human and Mouse Orthologs
                    orthologs_organism_list = []
                    orthologs = []
                    if 'Entrezgene_comments' in gene_record[0]:
                        for Entrezgene_comments_LOOP in gene_record[0]['Entrezgene_comments']:
                            if 'Gene-commentary_comment' in Entrezgene_comments_LOOP:
                                for Gene_commentary_comment_LOOP in Entrezgene_comments_LOOP['Gene-commentary_comment']:
                                    if 'Gene-commentary_source' in Gene_commentary_comment_LOOP:
                                        ortholog_source = Gene_commentary_comment_LOOP['Gene-commentary_source'][0]
                                        ortholog_object_id = ortholog_source['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']
                                        if 'Object-id_id' in ortholog_object_id:
                                            orthologs.append(ortholog_object_id['Object-id_id'])
                                        else:
                                            orthologs.append(ortholog_object_id['Object-id_str'])
                                        orthologs_organism_list.append(ortholog_source['Other-source_anchor'])
                    # A gene without a human or mouse entry keeps the '-----' placeholder
                    # instead of raising ValueError from list.index()
                    if 'human' in orthologs_organism_list:
                        human_ortholog_EntrezID = orthologs[orthologs_organism_list.index('human')]
                    if 'mouse' in orthologs_organism_list:
                        mouse_ortholog_EntrezID = orthologs[orthologs_organism_list.index('mouse')]

                    # Get Gene Description (summary of the human ortholog, when there is one)
                    if human_ortholog_EntrezID != '-----':
                        search_results_human = Entrez.esearch(db="gene", term=human_ortholog_EntrezID)
                        record_human = Entrez.read(search_results_human)
                        if record_human["IdList"]:
                            gene_id_human = record_human["IdList"][0]
                            gene_info_human = Entrez.efetch(db="gene", id=gene_id_human, retmode="xml")
                            gene_record_human = Entrez.read(gene_info_human)
                            gene_description = gene_record_human[0]['Entrezgene_summary']

                    # Get GO terms
                    result = GO_terms[GO_terms['CHO GeneID'] == str(gene_EntrezID)]['GO_ids'].values
                    if result.size > 0:
                        go_terms = result[0]
                    else:
                        go_terms = None

                    # Get the sequence
                    aa_bool = 'NOT Found AA_Seq'
                    if protein_NP_weirdName != '-----':
                        try:
                            handle = Entrez.efetch(db="protein", id=protein_NP_weirdName, rettype="fasta")
                            record = handle.read()
                            amino_acid_sequence = ''.join(record.split('\n')[1:])
                            aa_bool = 'FOUND AA_Seq'
                        except Exception:
                            # Not available as a protein record: retry against the nucleotide database
                            handle = Entrez.efetch(db="nucleotide", id=protein_NP_weirdName, rettype="fasta")
                            record = handle.read()
                            amino_acid_sequence = ''.join(record.split('\n')[1:])
                            aa_bool = 'FOUND AA_Seq'
                    # Get the COG terms
                    cogs = []

                # If there are no matches for the EntrezID at NCBI then return -----
                else:
                    gene_ncbiID = '-----'
                    gene_symbol = '-----'
                    gene_name_alsoknownas = '-----'
                    protein_NP_weirdName = '-----'
                    exon = '-----'
                    # gene_location = '-----'
                    # per-gene value; the `uniprot_IDs` accumulator list must stay a list
                    gene_uniprotIDs = '-----'
                    amino_acid_sequence = '-----'

                gene_found = False
                search_results.close()

            # Handle exceptions: every network-level failure costs one retry
            except (HTTPError, ssl.SSLError, ConnectionResetError, socket.error) as e:

                retries += 1
                time.sleep(3)

        # If the maximum fetch attempts reached, return ------
        if retries >= max_retries:
            gene_symbol = gene_EntrezID  # was `ncbi_gene`, a name that is never defined
            gene_synonyms = '-----'
            gene_ncbiID = '-----'
            protein_NP_weirdName = '-----'
            exon = '-----'
            # gene_location = '-----'
            gene_uniprotIDs = '-----'
            orthologs = '-----'
            orthologs_organism_list = '-----'
            amino_acid_sequence = '-----'

    # If for some reason the gene ID is not a number, then return the gene ID for all cases
    else:
        gene_symbol = gene_EntrezID
        gene_synonyms = gene_EntrezID
        gene_ncbiID = gene_EntrezID
        protein_NP_weirdName = gene_EntrezID
        exon = gene_EntrezID
        # gene_location = ncbi_gene
        amino_acid_sequence = gene_EntrezID

    # Append the lists with the data
    # gene_symbol_list.append(gene_symbol)
    # ncbi_geneName_alsoknownas_list.append(gene_name_alsoknownas)
    # ncbi_geneID_list.append(gene_ncbiID)
    # ncbi_protein_NP_weirdName.append(protein_NP_weirdName)
    # ncbi_gene_exon_size.append(exon)
    # # ncbi_genomic_location.append(gene_location)
    # uniprot_IDs.append(gene_uniprotIDs)
    # orthologs_genes.append(orthologs)
    # orthologs_organism.append(orthologs_organism_list)
    # aa_sequences.append(amino_acid_sequence)

    # print(counter, gene_name, gene_name_alsoknownas, gene_ncbiID, exon, aa_bool, protein_NP_weirdName, gene_uniprotIDs, orthologs, orthologs_organism_list)
    # print(gene_symbol, gene_synonyms, gene_EntrezID, gene_name, gene_description, go_terms, cogs, exon_count, chromosome, strand, human_ortholog_EntrezID, mouse_ortholog_EntrezID, assembly_specific_gene_symbol, gene_references)

    # Loop over the Genes
    # str(...) keeps the 25-character preview working when go_terms is None
    print("GENE: ", gene_symbol, "-----", gene_synonyms, "-----", gene_EntrezID, "-----", gene_name, "-----", str(gene_description)[0:25], "... -----", str(go_terms)[0:25], "... -----", cogs, "-----", exon_count, "-----", chromosome, "-----", strand, "-----", human_ortholog_EntrezID, "-----", mouse_ortholog_EntrezID)

    # Loop over the assembly specifics (nothing to report when no record was parsed)
    assembly_records = gene_record[0]['Entrezgene_locus'] if gene_record else []
    for assembly_specific_gene in assembly_records:
        assembly_id = assembly_specific_gene['Gene-commentary_heading']
        location_sequence = assembly_specific_gene['Gene-commentary_accession']
        start_range = assembly_specific_gene['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_from']
        end_range = assembly_specific_gene['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_to']
        transcript_sequence_ID = assembly_specific_gene['Gene-commentary_products'][0]['Gene-commentary_accession']
        print("ASSEMBLY SPECIFC GENE: S1, S2, ", gene_EntrezID, "-----", location_sequence, "-----", start_range, "-----", end_range, "-----", transcript_sequence_ID, "-----", assembly_id)

        # Loop over the Transcripts and proteins
        for transcripts in assembly_specific_gene['Gene-commentary_products']:

            # Loop over products
            for proteins in transcripts['Gene-commentary_products']:

                print("TRANSCRIPTS: ", transcript_sequence_ID, "-----", proteins['Gene-commentary_accession'], "-----")
                print("PROTEINS: ", proteins['Gene-commentary_accession'])



    counter += 1
    # if counter % 1000 == 0:
    #     data = {
    #         'NCBI_ID': ncbi_geneID_list,
    #         'Gene_Name': ncbi_geneName_list,
    #         'NP_Name': ncbi_protein_NP_weirdName,
    #         'Uniprot_ID': uniprot_IDs,
    #         'Ortholog_Genes': orthologs_genes,
    #         'Ortholog_Organisms': orthologs_organism,
    #         'Exon Number': ncbi_gene_exon_size,
    #         'Sequence': aa_sequences
    #     }
    #     df = pd.DataFrame(data)
    #     file_path = 'GeneIDs_' + str(counter) + '.xlsx'
    #     df.to_excel(file_path, index=False)


GENE:  Slco1b3 ----- [] ----- 100774298 ----- solute carrier organic anion transporter family member 1B3 ----- This gene encodes a liver ... ----- ['GO:0055085', 'GO:002285 ... ----- [] ----- 14 ----- 8 ----- minus ----- 28234 ----- 28253
XP_035312248
ASSEMBLY SPECIFC GENE: S1, S2,  100774298 ----- NW_003613944 ----- 886110 ----- 936246 ----- XM_035456357 ----- Reference CriGri_1.0 Primary Assembly
XP_035304859
ASSEMBLY SPECIFC GENE: S1, S2,  100774298 ----- NC_048601 ----- 7490477 ----- 7541006 ----- XM_035448968 ----- Alternate CriGri-PICRH-1.0


In [198]:
# Report each protein product of every transcript in `assembly_specific_gene`
# (both `assembly_specific_gene` and `transcript_sequence_ID` come from an earlier cell)
for transcripts in assembly_specific_gene['Gene-commentary_products']:
    for proteins in transcripts['Gene-commentary_products']:
        protein_accession = proteins['Gene-commentary_accession']
        print("TRANSCRIPTS: ", transcript_sequence_ID, "-----", protein_accession, "-----")
        print("PROTEINS: ", protein_accession)

TRANSCRIPTS:  XM_035448968 ----- XP_035304859 -----
PROTEINS:  XP_035304859


In [196]:
# Inspect the nested products of the first product entry of `assembly_specific_gene`
assembly_specific_gene['Gene-commentary_products'][0]['Gene-commentary_products']

[{'Gene-commentary_type': StringElement('8', attributes={'value': 'peptide'}), 'Gene-commentary_heading': 'Reference', 'Gene-commentary_accession': 'XP_035304859', 'Gene-commentary_version': '1', 'Gene-commentary_genomic-coords': [{'Seq-loc_mix': {'Seq-loc-mix': [{'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '7540923', 'Seq-interval_to': '7541006', 'Seq-interval_strand': {'Na-strand': StringElement('', attributes={'value': 'minus'})}, 'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '1859291611'}}}}}, {'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '7524876', 'Seq-interval_to': '7525017', 'Seq-interval_strand': {'Na-strand': StringElement('', attributes={'value': 'minus'})}, 'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '1859291611'}}}}}, {'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '7520567', 'Seq-interval_to': '7520699', 'Seq-interval_strand': {'Na-strand': StringElement('', attributes={'value': 'minus'})}, 'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '1859291611'}}

In [188]:
# Dump every top-level field of the fetched gene record together with its value
# (note: the loop variable reuses the name `assembly_specific_gene` for the field keys)
for assembly_specific_gene, field_value in gene_record[0].items():
    print(assembly_specific_gene, field_value)

Entrezgene_track-info {'Gene-track': {'Gene-track_geneid': '100689048', 'Gene-track_status': StringElement('0', attributes={'value': 'live'}), 'Gene-track_create-date': {'Date': {'Date_std': {'Date-std': {'Date-std_year': '2011', 'Date-std_month': '9', 'Date-std_day': '8'}}}}, 'Gene-track_update-date': {'Date': {'Date_std': {'Date-std': {'Date-std_year': '2023', 'Date-std_month': '10', 'Date-std_day': '22', 'Date-std_hour': '16', 'Date-std_minute': '41', 'Date-std_second': '0'}}}}}}
Entrezgene_type 6
Entrezgene_source {'BioSource': {'BioSource_genome': StringElement('1', attributes={'value': 'genomic'}), 'BioSource_origin': StringElement('1', attributes={'value': 'natural'}), 'BioSource_org': {'Org-ref': {'Org-ref_taxname': 'Cricetulus griseus', 'Org-ref_common': 'Chinese hamster', 'Org-ref_db': [{'Dbtag_db': 'taxon', 'Dbtag_tag': {'Object-id': {'Object-id_id': '10029'}}}], 'Org-ref_orgname': {'OrgName': {'OrgName_name': {'OrgName_name_binomial': {'BinomialOrgName': {'BinomialOrgName_g

In [161]:
# Dump every field of the first product of the first locus entry
t = gene_record[0]['Entrezgene_locus'][0]['Gene-commentary_products'][0]
for i, field_value in t.items():
    print(i, field_value)
print("END")

Gene-commentary_type 3
Gene-commentary_heading Reference
Gene-commentary_accession NM_001244036
Gene-commentary_version 1
Gene-commentary_genomic-coords [{'Seq-loc_mix': {'Seq-loc-mix': [{'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '910892', 'Seq-interval_to': '911112', 'Seq-interval_strand': {'Na-strand': StringElement('', attributes={'value': 'plus'})}, 'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '351517502'}}, 'Seq-interval_fuzz-from': {'Int-fuzz': {'Int-fuzz_lim': StringElement('', attributes={'value': 'lt'})}}}}}, {'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '919004', 'Seq-interval_to': '919133', 'Seq-interval_strand': {'Na-strand': StringElement('', attributes={'value': 'plus'})}, 'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '351517502'}}}}}, {'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '928903', 'Seq-interval_to': '929060', 'Seq-interval_strand': {'Na-strand': StringElement('', attributes={'value': 'plus'})}, 'Seq-interval_id': {'Seq-id': {'Seq-id_gi

In [9]:
import pandas as pd

# Merge the partial exports GeneIDs_1000.xlsx ... GeneIDs_34000.xlsx into one table.
# All files are read first and concatenated once, instead of growing a frame
# from an empty DataFrame inside the loop.
gene_chunks = [pd.read_excel('GeneIDs_' + str(i) + '.xlsx') for i in range(1000, 35000, 1000)]
All_Genes = pd.concat(gene_chunks, ignore_index=True)
All_Genes.to_excel('All_Genes.xlsx', index=False)