# Genes
The following notebook retrieves all the genes involved in the reactions from the **"Rxns" Sheet** in Google Sheet file. Then, information regarding the genes is retrieved from different databases and a **"Genes" Sheet** is generated and updated.

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

from google_sheet import GoogleSheet
from utils import get_gene_info

### 1. Generate "rxns" and "genes" datasets
The "rxns" dataset contains all the reactions with their GPR associations. The "genes" dataset contains all the information of the genes/GPR involved in our reconstruction.

In [2]:
# Path handed to GoogleSheet together with the spreadsheet ID
# (presumably a service-account key file — confirm in google_sheet.py)
KEY_FILE_PATH = 'credentials.json'
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Initialize the GoogleSheet object
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from the Google Sheet
sheet_rxns = 'Rxns'
sheet_genes = 'Genes'

# Later cells treat both results as pandas DataFrames (.iterrows(), .at[], pd.concat)
rxns = sheet.read_google_sheet(sheet_rxns)
genes = sheet.read_google_sheet(sheet_genes)

### 2. Create a "gene_list" with all the genes involved in our reconstruction
We extract the gene IDs from the GPR annotations in the **"rxns" df**. This information is only retrieved from the **GPR_final** column.

In [3]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

# Collect every numeric gene ID mentioned in the GPR_final column.
# IDs prefixed with 'h'/'H' are human genes and are kept in a separate list.
gene_list = []
human_gene_list = []
for gpr_text in rxns['GPR_final']:
    if gpr_text == '':
        continue
    for token in re.findall(r'\b[hH]?\d+\b', str(gpr_text)):
        if token[0] in 'hH':
            # strip the prefix and store the human ID as an int
            human_gene_list.append(int(token[1:]))
        else:
            gene_list.append(int(token))

# Keep each CHO gene ID only once
gene_list = list(set(gene_list))
len(gene_list)

3045

### 3. Replace remaining Human Gene IDs in our dataset with CHO Gene IDs
Using the **cho2human_mapping** file, we replace the Human gene IDs with those of CHO. These IDs were not spotted when we first added the Recon3D reactions.

In [4]:
# Build a dictionary with human gene IDs as keys and CHO gene IDs as values
# The mapping file is tab-separated; HUMAN_ID and CHO_ID are paired row by row.
orthologs = pd.read_csv('../Data/Orthologs/cho2human_mapping.tsv', sep='\t')
# NOTE(review): the lookup in replace_human_ids_with_cho_ids() uses int keys, so this
# assumes pandas parses HUMAN_ID as integers — confirm against the file.
# If a HUMAN_ID occurs more than once, dict() keeps the last CHO_ID seen.
human_to_cho_dict = dict(zip(orthologs['HUMAN_ID'], orthologs['CHO_ID']))

def replace_human_ids_with_cho_ids(s, mapping=None):
    """Replace 'h'/'H'-prefixed human gene IDs in a GPR string with CHO gene IDs.

    Parameters
    ----------
    s : object
        GPR expression. Anything that is not a ``str`` is returned unchanged.
    mapping : dict, optional
        Human gene ID (int) -> CHO gene ID. Defaults to the module-level
        ``human_to_cho_dict``.

    Returns
    -------
    tuple
        ``(new_s, replaced_dict)`` where ``replaced_dict`` maps each replaced
        token exactly as it appeared in ``s`` (e.g. ``'h123'``) to the CHO ID
        that was substituted for it.
    """
    if mapping is None:
        mapping = human_to_cho_dict
    replaced_dict = {}
    if not isinstance(s, str):
        return s, replaced_dict

    def _swap(match):
        gene = match.group(0)
        human_id = int(gene[1:])  # drop the 'h'/'H' prefix
        if human_id not in mapping:
            return gene  # no ortholog known: leave the token untouched
        cho_id = mapping[human_id]
        replaced_dict[gene] = cho_id
        return str(cho_id)

    # A single word-bounded substitution pass replaces whole tokens only, so
    # 'h12' can never rewrite the beginning of 'h123' the way a plain
    # substring replace would.
    new_s = re.sub(r'\b[hH]\d+\b', _swap, s)
    return new_s, replaced_dict

# Initialize a dictionary to store replacements
replacements = {}

# Apply the function to every row and collect all substitutions that were made.
# .at[] is label-based, so iterate over the frame's own index labels rather than
# over range(len(rxns)); the two only coincide for a default 0..n-1 index.
for i in rxns.index:
    rxns.at[i, 'GPR_final'], replacements_dict = replace_human_ids_with_cho_ids(rxns.at[i, 'GPR_final'])
    replacements.update(replacements_dict)

# Now, 'replacements' is a dictionary where keys are the original gene IDs and values are the replaced CHO gene IDs
replacements

{}

In [5]:
######################################
#### ---------------------------- ####
#### ---- Update Rxns Sheet ----- ####
#### ---------------------------- ####
######################################
# Push the frame back to the sheet only when at least one ID was replaced
if not replacements:
    print("No changes to the original Rxns Sheet")
else:
    sheet.update_google_sheet(sheet_rxns, rxns)
    print("Google Sheet updated.")

No changes to the original Rxns Sheet


### 3. Add the genes from the "gene_list" to the genes df
Using a list of all the genes included in the dataset, we can retrieve the Gene Symbol, Gene Name, Gene Ensembl ID, mRNA ID, and protein ID of each gene from the NIH database using the function get_gene_info().

In [7]:
import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Maximum number of retries after an HTTP error, per gene
c = 0  # Number of genes added; read by the "Update Genes Sheet" cell

# Entrez IDs already in the Genes sheet, normalised to str once up front so the
# check works whether the sheet stores them as text or as numbers.
known_gene_ids = {str(entrez_id) for entrez_id in genes['Gene Entrez ID']}
# One single-row frame per accepted gene; concatenated once after the loop
# instead of re-copying the whole frame for every new row.
new_gene_frames = []

try:
    for g in tqdm(gene_list):
        if str(g) in known_gene_ids:
            continue
        retries = 0
        while True:
            try:
                (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
                 chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g)
            except ValidationError:
                print(f'Gene {g} not found')
                break
            except HTTPError:
                retries += 1
                if retries > max_retries:
                    print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                    break
                print(f'HTTP Error, retrying with gene {g}')
                time.sleep(1)
                continue

            if organism == 'Cricetulus griseus':
                new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                                'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                                'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
                # Same index label the row would get if it were appended immediately
                new_gene_frames.append(pd.DataFrame(new_row_data, index=[len(genes) + len(new_gene_frames)]))
                known_gene_ids.add(str(g))
                c += 1
            else:
                # Only CHO genes are kept; anything else is reported and skipped
                print(f'Gene {g,gene_symbol} is a {organism} Gene')
            break
finally:
    # Keep whatever was fetched even if the loop is interrupted part-way
    if new_gene_frames:
        genes = pd.concat([genes, *new_gene_frames])

print(f'A total of {c} Genes were added to the dataset')

  0%|          | 0/3045 [00:00<?, ?it/s]

30100754813
Gene 30100754813 does not have PICR Ensembl ID
Gene (30100754813, 'ycf15') is a Vitis cinerea Gene
A total of 0 Genes were added to the dataset


In [8]:
#######################################
#### ----------------------------- ####
#### ---- Update Genes Sheet ----- ####
#### ----------------------------- ####
#######################################
# Upload the Genes frame only when the previous cell added at least one gene
if c == 0:
    print("No new genes added.")
    print("No changes to the original Genes Sheet.")
elif c > 0:
    sheet.update_google_sheet(sheet_genes, genes)
    print("Google Sheet updated.")

No new genes added.
No changes to the original Genes Sheet.


### 4. Eliminate unwanted genes
We iterate over the entire dataset to spot **Human genes** in order to eliminate them

In [None]:
# Rows are dropped from a copy so the frame being iterated is never modified
erased = 0
genes_copy = genes.copy()

for index, row in tqdm(genes.iterrows(), total=genes.shape[0]):
    g = row['Gene Entrez ID']
    retries = 0
    # Repeat the lookup until the gene is handled or the HTTP retry budget is spent
    while True:
        try:
            (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
             chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g)
            if organism == 'Homo sapiens':
                genes_copy = genes_copy.drop(index)
                print(f'Gene {g,gene_symbol} erased from dataset')
                erased += 1
            break
        except ValidationError:
            print(f'Gene {g} not found')
            break
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            print(f'HTTP Error, retrying with gene {g}')
            time.sleep(1)

# Swap in the filtered frame
genes = genes_copy
print(f'A total of {erased} Genes were erased from the original dataset')

In [None]:
#######################################
#### ----------------------------- ####
#### ---- Update Genes Sheet ----- ####
#### ----------------------------- ####
#######################################
# Upload the Genes frame only when the previous cell removed at least one gene
if erased == 0:
    print("No erased genes.")
    print("No changes to the original Genes Sheet.")
elif erased > 0:
    sheet.update_google_sheet(sheet_genes, genes)
    print("Google Sheet updated.")

In [None]:
# Entrez IDs already present in the Genes sheet, compared as strings so that
# integer IDs from gene_list match IDs stored as text.
known_gene_ids = {str(entrez_id) for entrez_id in genes['Gene Entrez ID']}

for g in tqdm(gene_list):
    if str(g) in known_gene_ids:
        continue
    try:
        # get_gene_info() returns nine values, the organism first
        (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
         chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g)
    except KeyError:
        print(f'Gene {g} not found')
        continue
    if organism != 'Cricetulus griseus':
        # Only CHO genes belong in the Genes sheet
        print(f'Gene {g,gene_symbol} is a {organism} Gene')
        continue
    new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                    'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                    'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
    new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
    genes = pd.concat([genes, new_row_df])
    known_gene_ids.add(str(g))

In [None]:
# Fetch information from the NIH database


# Complete null or blank information in the already generated "Genes Sheet" dataset
def _fetch_gene_fields(gene_id):
    """Call get_gene_info(gene_id) and return its fields keyed by Genes-sheet column."""
    # get_gene_info() returns nine values, the organism first (unused here)
    (_organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
     chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(gene_id)
    return {
        'Gene Symbol': gene_symbol,
        'Gene Name': gene_name,
        'Gene Description': gene_description,
        'PICR Ensembl ID': picr_ensembl_id,
        'CHOK1GS Ensembl ID': chok1gs_ensembl_id,
        'Transcript ID': mRNA_ncbi_id,
        'Protein ID': protein_ncbi_id,
        'GO Terms': go_terms,
    }


# A row is refreshed when any of these columns is blank
_required_columns = ['Gene Symbol', 'Gene Name', 'PICR Ensembl ID', 'Transcript ID', 'Protein ID']

# Rows are addressed by position and written back with .iloc: values assigned to
# the Series yielded by iterrows() are copies and never reach the frame.
for pos in range(len(genes)):
    row = genes.iloc[pos]
    if row['Gene Entrez ID'] == '':
        # Blank row: fill it with the first gene of gene_list not yet in the sheet.
        # IDs are compared as strings because gene_list holds ints.
        gene_sheet_ids = {str(x) for x in genes['Gene Entrez ID']}
        missing_gene = next((g for g in gene_list if str(g) not in gene_sheet_ids), None)
        if missing_gene is None:
            continue
        fields = {'Gene Entrez ID': missing_gene}
        fields.update(_fetch_gene_fields(missing_gene))
    elif any(row[column] == '' for column in _required_columns):
        # Known gene with missing details: fetch the details of this row's own gene
        fields = _fetch_gene_fields(row['Gene Entrez ID'])
    else:
        continue
    for column, value in fields.items():
        genes.iloc[pos, genes.columns.get_loc(column)] = value
'''
# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for g in gene_list:
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue
            
'''

In [None]:
from Bio import Entrez
# Scratch cell: fetch one gene record and extract a description from it.
Entrez.email = 'account1@theta-ocean-377718.iam.gserviceaccount.com'
handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
try:
    record = Entrez.read(handle)[0]
finally:
    handle.close()  # release the HTTP connection once the XML has been parsed

#gene_name = record['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
#gene_symbol = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

# NOTE(review): same gene ID as the request above, so this is a second fetch of the same record
human_handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
try:
    human_record = Entrez.read(human_handle)[0]
finally:
    human_handle.close()

# Prefer the text of the first comment entry; fall back to the record summary
if 'Entrezgene_comments' in human_record and 'Gene-commentary_comment' in human_record['Entrezgene_comments'][0]:
    human_gene_description = human_record['Entrezgene_comments'][0]['Gene-commentary_comment'][0]['String']
elif 'Entrezgene_summary' in human_record:
    human_gene_description = human_record['Entrezgene_summary']
else:
    human_gene_description = ''

#### Get the CHO genome and map the IDs

In [50]:
import pandas as pd
from Bio import Entrez
import math
import time
import ssl
from urllib.error import HTTPError
import socket
from Bio.Entrez.Parser import CorruptedXMLError
import requests

def get_uniprot_from_dict(dictionary, uniprotIDs, target_key, target_value1,  target_value2):
    """Walk a nested dict/list structure and collect 'Object-id_str' values.

    Whenever a dict has ``target_key`` equal to ``target_value1`` or
    ``target_value2``, the value found at
    ``dict['Dbtag_tag']['Object-id']['Object-id_str']`` is appended to
    ``uniprotIDs`` (``None`` when that path is missing). Nested dicts, and
    dicts inside nested lists, are searched recursively.

    Returns the same ``uniprotIDs`` list that was passed in.
    """
    for field, content in dictionary.items():
        is_hit = field == target_key and (content == target_value1 or content == target_value2)
        if is_hit:
            tag = dictionary.get('Dbtag_tag', {})
            object_id = tag.get('Object-id', {})
            uniprotIDs.append(object_id.get('Object-id_str'))

        # Gather the child dicts to descend into
        if isinstance(content, dict):
            children = [content]
        elif isinstance(content, list):
            children = [item for item in content if isinstance(item, dict)]
        else:
            children = []
        for child in children:
            get_uniprot_from_dict(child, uniprotIDs, target_key, target_value1, target_value2)
    return uniprotIDs

def get_human_mouse_orthologs(dictionary, orthologs, target_key, target_value1):
    """Walk a nested dict/list structure and collect 'Object-id_id' values.

    For every dict whose ``target_key`` equals ``target_value1``, the value at
    ``dict['Dbtag_tag']['Object-id']['Object-id_id']`` is appended to
    ``orthologs`` (``None`` when that path is missing). Nested dicts, and
    dicts inside nested lists, are searched recursively.

    Returns the same ``orthologs`` list that was passed in.
    """
    for field, content in dictionary.items():
        if field == target_key and content == target_value1:
            object_id = dictionary.get('Dbtag_tag', {}).get('Object-id', {})
            orthologs.append(object_id.get('Object-id_id'))

        if isinstance(content, dict):
            get_human_mouse_orthologs(content, orthologs, target_key, target_value1)
            continue
        if not isinstance(content, list):
            continue
        # Only dict elements of a list can contain further keys
        for element in content:
            if isinstance(element, dict):
                get_human_mouse_orthologs(element, orthologs, target_key, target_value1)
    return orthologs



# Contact address set on the Entrez module before any Entrez request is made
# Entrez.email = "athand01@gmail.com"
Entrez.email = "a.antonakoudis@sartorius.com"
# Genome table; the gene loop below reads its 'NCBI GeneID' column
u_g = pd.read_excel('../CHO_Genome.xlsx')
# GO annotations; the gene loop below looks up 'GO_ids' by 'CHO GeneID'
GO_terms = pd.read_csv('orthologs&GO.txt')

In [51]:
def UniProt_Data_Request(uniprot_id, session=None):
    """Fetch basic protein annotations for one UniProt accession.

    Parameters
    ----------
    uniprot_id : str
        Accession inserted into the request URL.
    session : optional
        Object with a requests-style ``get(url, headers=..., timeout=...)``
        method. When omitted, a ``requests.Session`` is created for this call
        and closed before returning.

    Returns
    -------
    tuple
        ``(name, weight, length, annotation_score, existance,
        subcellular_locations_list, tissue_specificity)``. Every field starts
        as an empty list and keeps that value if it is absent from the entry
        or if all ``max_retries`` attempts fail.
    """
    name = []
    weight = []
    length = []
    annotation_score = []
    existance = []
    subcellular_locations_list = []
    tissue_specificity = []
    max_retries = 3
    retries = 0

    owns_session = session is None
    if owns_session:
        session = requests.Session()
        # NOTE(review): machine-specific CA bundle path; the request fails on any
        # machine where this file does not exist.
        certificate_path = '../../../../../../Downloads/Zscaler Root CA.crt'
        session.verify = certificate_path

    try:
        while retries < max_retries:
            try:
                url = f"https://www.uniprot.org/uniprot/{uniprot_id}.json"
                response = session.get(url, headers={"Accept": "application/json"}, timeout=150)
                if response.status_code != 200:
                    raise Exception(f"API request failed for UniProt ID {uniprot_id}: Status code {response.status_code}")

                entry = response.json()
                description = entry['proteinDescription']
                if 'recommendedName' in description:
                    name = description['recommendedName']['fullName']['value']
                elif 'submissionNames' in description:
                    name = description['submissionNames'][0]['fullName']['value']
                annotation_score = entry['annotationScore']
                weight = entry['sequence']['molWeight']
                length = entry['sequence']['length']
                existance = entry['proteinExistence']

                # Tissue comes from the reference comments of type 'TISSUE';
                # if several are present the last one wins.
                for reference in entry.get('references', []):
                    for ref_comment in reference.get('referenceComments', []):
                        if ref_comment['type'] == 'TISSUE':
                            tissue_specificity = ref_comment['value']

                subcellular_locations_list = []
                for comment in entry.get('comments', []):
                    for location in comment.get('subcellularLocations', []):
                        subcellular_locations_list.append(location['location']['value'])

                return name, weight, length, annotation_score, existance, subcellular_locations_list, tissue_specificity

            except requests.exceptions.RequestException as e:
                # Handle request-related errors
                print(f"Request Error: {e}")
                retries += 1
                if retries < max_retries:
                    print("Retrying after a delay...")
                    time.sleep(1)  # Wait for 1 second before retrying

            except Exception as e:
                # Handle unexpected errors (including non-200 responses raised above)
                print(f"An unexpected error occurred: {e}")
                retries += 1
                if retries < max_retries:
                    print("Retrying after a delay...")
                    time.sleep(1)  # Wait for 1 second before retrying
    finally:
        # Only close a session that was created here; a caller's session stays open
        if owns_session:
            session.close()

    return name, weight, length, annotation_score, existance, subcellular_locations_list, tissue_specificity

In [57]:
import os
from Bio import SeqIO

def get_exon_info(gff_file, genome_file, transcript_id):
    """Write the exon coordinates and sequences of one transcript to a text file.

    The GFF file is scanned for the transcript's ``ID=rna-<transcript_id>.``
    record (to learn which sequence it lies on) and for its
    ``ID=exon-<transcript_id>.`` records. Each exon is then cut out of the
    matching FASTA record and written, one line per exon, to
    ``../Notebooks/Website Data/<transcript_id>-EXONS.txt``.

    Parameters
    ----------
    gff_file : str
        Path to the GFF annotation file.
    genome_file : str
        Path to the genome FASTA file.
    transcript_id : str
        Transcript accession without version suffix (the trailing ``.`` in the
        search tags matches the version separator).

    Returns
    -------
    None
        Progress and problems are reported with ``print``. Nothing is written
        if the transcript, its exons or its sequence cannot be found, or if
        the output file already exists.
    """
    rna_tag = f'ID=rna-{transcript_id}.'
    # The trailing '.' keeps e.g. 'XM_123' from also matching exons of 'XM_1234'
    exon_tag = f'ID=exon-{transcript_id}.'

    # Single pass over the annotation: locate the transcript and collect its exons
    location_id = None
    exon_ranges = []
    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('#'):
                continue
            if location_id is None and rna_tag in line:
                location_id = line.split('\t')[0]
            if exon_tag in line:
                parts = line.split('\t')
                if len(parts) >= 5:
                    start, end = map(int, parts[3:5])
                    exon_ranges.append((start, end))

    if location_id is None:
        print(f"Location ID not found for transcript ID '{transcript_id}' in the GFF file.")
        return

    output_file = f'../Notebooks/Website Data/{transcript_id}-EXONS.txt'
    if os.path.exists(output_file):
        print(f"ERROR: File '{output_file}' already exists. Please choose a different transcript_id.")
        return

    if not exon_ranges:
        print(f"No matches found for the pattern 'exon-{transcript_id}' in the GFF file.")
        return

    # Stream the FASTA file and stop at the first matching record instead of
    # holding every sequence of the genome in memory.
    with open(genome_file, 'r') as fasta:
        genome_record = next(
            (record for record in SeqIO.parse(fasta, "fasta") if record.name.startswith(location_id)),
            None,
        )
    if genome_record is None:
        print(f"Sequence '{location_id}' not found in genome file '{genome_file}'.")
        return

    with open(output_file, 'w') as output:
        for match_count, (start, end) in enumerate(exon_ranges, start=1):
            # GFF coordinates are 1-based and inclusive; Python slices are
            # 0-based and end-exclusive, hence start - 1.
            # NOTE(review): the sequence is taken from the reference as stored;
            # exons annotated on the minus strand are not reverse-complemented.
            exon_sequence = genome_record.seq[start - 1:end]
            output.write(f"Exon-{match_count}: Start: {start}, End: {end}, Sequence: {exon_sequence}\n")

    print(f"Total matches: {len(exon_ranges)}")
    print(f"Exon information saved to '{output_file}'")

# Annotation (GFF) and genome sequence (FASTA) files passed to get_exon_info();
# both paths are relative to the notebook's working directory.
gff_file = '../../ncbi_dataset/ncbi_dataset/data/GCF_000223135.1/genomic.gff'
genome_file = '../../ncbi_dataset/ncbi_dataset/data/GCF_000223135.1/GCF_000223135.1_CriGri_1.0_genomic.fna'


In [48]:
# Define lists
# One empty accumulator list per output column, grouped by the sheet it belongs to.
# Several names appear under more than one sheet; each repeated assignment simply
# re-binds the name to a new empty list, so the name refers to a single list
# afterwards.
# Genome Sheet
Assembly_ID_List = []
GenBank_List = []
RefSeq_List = []
WGS_Accession_List = []
Release_Data_List = []
References_List = []
Nb_Genes_List = []
Assembly_Type_List = []
Assembly_Level_List = []
Description_List = []
Sumbitter_List = []
Cell_Line_LIst = []

# Gene Sheet
Gene_Symbol_List = []
Gene_Synonyms_List = []
Gene_Name_List = []
Gene_Description_List = []
GO_Terms_List = []
COGs_List = []
Exon_Count_List = []
Chromosome_List = []
Strand_List = []
Human_Ortholog_Entrez_List = []
Mouse_Ortholog_Entrez_List = []
Assembly_Specific_Gene_Symbol_List = []
Gene_References_List = []

# Assembly Specific Genes Sheet
# (Assembly_Specific_Gene_Symbol_List, Gene_Symbol_List and Assembly_ID_List are re-bound here)
Assembly_Specific_Gene_Symbol_List = []
Gene_Symbol_List = []
Gene_Entrez_ID_List = []
Location_Sequence_List = []
Position_List = []
Transcript_Sequence_ID_List = []
Assembly_ID_List = []

# Transcripts Sheet
# (Transcript_Sequence_ID_List and Assembly_ID_List are re-bound here)
Transcript_Sequence_ID_List = []
Protein_Sequence_ID_List = []
Transcript_Sequence_List = []
Protein_Uniprot_ID_List = []
Exon_Info_List = []
Assembly_ID_List = []

# Proteins Sheet
# (Protein_Uniprot_ID_List, Gene_Symbol_List and Protein_Sequence_ID_List are re-bound here)
Protein_Uniprot_ID_List = []
Protein_Name_List = []
Function_List = []
Protein_existence_List = []
Annotation_Score_List = []
Amino_Acid_Count_List = []
Protein_Sequence_List = []
Weigth_Da_List = []
Gene_Symbol_List = []
Subcellular_Location_List = []
Tissue_Specificity_List = []
PDB_Structure_List = []
PDB_Visualization_List = []
Protein_Sequence_ID_List = []

In [58]:
counter = 0
for gene_EntrezID in u_g['NCBI GeneID'][counter:counter + 1]:
    if not math.isnan(gene_EntrezID):   
        gene_EntrezID = int(gene_EntrezID)        
        go_terms = []          
        max_retries = 3
        retries = 0
        gene_found = True
        while retries < max_retries and gene_found:
            try:
                search_results = Entrez.esearch(db="gene", term = gene_EntrezID)            
                record = Entrez.read(search_results)

                # Check if any gene records were found
                if int(record["Count"]) > 0:

                    # Get the Gene ID of the first result
                    gene_id = record["IdList"][0]

                    # Fetch the gene information, maximum 3 tries
                    fetch_tries = 0
                    fetch_found = True
                    while fetch_tries < 3 and fetch_found:

                        gene_info = Entrez.efetch(db="gene", id=gene_id, retmode="xml")           
                        fetch_tries += 1
                        try:
                            gene_record = Entrez.read(gene_info)
                            fetch_found = False
                        except CorruptedXMLError as xml_error:
                            time.sleep(3)
                            fetch_tries += 1

                    # Get the gene name and gene NCBI ID
                    if 'Gene-ref_locus' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
                        gene_symbol = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']                        
                    else:
                        gene_symbol = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus-tag']
                
                    # Get the Gene Synonyms
                    if 'Gene-ref_syn' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
                        gene_synonyms = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_syn']
                    else:
                        gene_synonyms = []
                    gene_ncbiID = gene_record[0]['Entrezgene_track-info']['Gene-track']['Gene-track_geneid']

                    # Get the Gene Name
                    gene_name = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']

                    # Get the chromosome
                    chromosome = int(gene_record[0]['Entrezgene_source']['BioSource']['BioSource_subtype'][0]['SubSource_name'])

                    # Get number of Exons
                    exon_count = int(gene_record[0]['Entrezgene_properties'][0]['Gene-commentary_text'])

                    # Get the Strand
                    strand = gene_record[0]['Entrezgene_locus'][0]['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_strand']['Na-strand'].attributes['value']
                    
                    # Get Human and Mouse Orthologs
                    orthologs_organism_list = []
                    orthologs = []     
                    if 'Entrezgene_comments' in gene_record[0]:
                        for Entrezgene_comments_LOOP in gene_record[0]['Entrezgene_comments']:
                            if 'Gene-commentary_comment' in Entrezgene_comments_LOOP:
                                for Gene_commentary_comment_LOOP in Entrezgene_comments_LOOP['Gene-commentary_comment']:
                                    if 'Gene-commentary_source' in Gene_commentary_comment_LOOP:
                                        if 'Object-id_id' in Gene_commentary_comment_LOOP['Gene-commentary_source'][0]['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']:
                                            orthologs.append(Gene_commentary_comment_LOOP['Gene-commentary_source'][0]['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']['Object-id_id'])
                                        else:
                                            orthologs.append(Gene_commentary_comment_LOOP['Gene-commentary_source'][0]['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']['Object-id_str'])
                                        orthologs_organism_list.append(Gene_commentary_comment_LOOP['Gene-commentary_source'][0]['Other-source_anchor'])                    
                    if 'human' in orthologs_organism_list:
                        human_ortholog_EntrezID = orthologs[orthologs_organism_list.index('human')]                                            
                        search_results_human = Entrez.esearch(db="gene", term=human_ortholog_EntrezID)     
                        record_human = Entrez.read(search_results_human)
                        gene_id_human = record_human["IdList"][0]
                        gene_info_human = Entrez.efetch(db="gene", id=gene_id_human, retmode="xml")           
                        gene_record_human = Entrez.read(gene_info_human)
                        gene_description = gene_record_human[0]['Entrezgene_summary']
                    else:
                        human_ortholog_EntrezID = []
                        gene_description = []
                    if 'mouse' in orthologs_organism_list:
                        mouse_ortholog_EntrezID = orthologs[orthologs_organism_list.index('mouse')]
                    else:
                        mouse_ortholog_EntrezID = []
                    
                    # Get GO terms
                    result = GO_terms[GO_terms['CHO GeneID'] == str(gene_EntrezID)]['GO_ids'].values
                    if result.size > 0:
                        go_terms = result[0]
                    else:
                        go_terms = []

                    # Get the COG terms
                    cogs = []

                gene_found = False
                search_results.close()
    
            # Handle exceptions
            except HTTPError as e:

                retries += 1
                time.sleep(3)
            except ssl.SSLError as e:

                retries += 1
                time.sleep(3)
            except ConnectionResetError as e:

                retries += 1
                time.sleep(3)
            except socket.error as e:

                retries += 1
                time.sleep(3)
        
    # Loop over the Genes
    print(counter, "|GENE: ", gene_symbol, "-----", gene_synonyms, "-----", gene_EntrezID, "-----", gene_name, "-----", gene_description[0:25], "... -----", go_terms[0:25], "... -----", cogs, "-----", exon_count, "-----", chromosome, "-----", strand, "-----", human_ortholog_EntrezID, "-----", mouse_ortholog_EntrezID)
    
    # Append information
    Gene_Symbol_List.append(gene_symbol)
    Gene_Synonyms_List.append(gene_synonyms)
    Gene_Name_List.append(gene_name)
    Gene_Description_List.append(gene_description)
    GO_Terms_List.append(go_terms)
    COGs_List.append(cogs)
    Exon_Count_List.append(exon_count)
    Chromosome_List.append(chromosome)
    Strand_List.append(strand)
    Human_Ortholog_Entrez_List.append(human_ortholog_EntrezID)
    Mouse_Ortholog_Entrez_List.append(mouse_ortholog_EntrezID)
    Assembly_Specific_Gene_Symbol_List.append([])
    Gene_References_List.append([])

    # Loop over the assembly specifics
    for assembly_specific_info in gene_record[0]['Entrezgene_locus']:    
        assembly_id = assembly_specific_info['Gene-commentary_heading']
        location_sequence = assembly_specific_info['Gene-commentary_accession']
        start_range = assembly_specific_info['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_from']
        end_range = assembly_specific_info['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']['Seq-interval_to']
        
        # Loop over the transcripts of the Assembly
        for assembly_specific_transcript in assembly_specific_info['Gene-commentary_products']:
            transcript_sequence_ID = assembly_specific_transcript['Gene-commentary_accession']
            print("ASSEMBLY SPECIFC GENE: S1, S2, ", gene_EntrezID, "-----", location_sequence, "-----", start_range, "-----", end_range, "-----", transcript_sequence_ID, "-----", assembly_id)

            # Loop over the Transcripts and proteins
            # for transcripts in assembly_specific_gene['Gene-commentary_products']:
            mRNA_handle = Entrez.efetch(db="nucleotide", id=transcript_sequence_ID, rettype="fasta")
            mRNA_record = mRNA_handle.read()
            mRNA_sequence = ''.join(mRNA_record.split('\n')[1:])           
            #
            # Get Exon information from the gff file 
            # Downloadable from the assembly page
            #
            for assembly_specific_transcript_products in assembly_specific_transcript['Gene-commentary_products']:
                protein_sequence_ID = assembly_specific_transcript_products['Gene-commentary_accession']
                print("TRANSCRIPTS: ", transcript_sequence_ID, "-----", protein_sequence_ID, "-----", mRNA_sequence[0:25], '-----', assembly_id)  
                get_exon_info(gff_file, genome_file, transcript_sequence_ID)


                for Entrez_Comments in gene_record[0]['Entrezgene_comments']:
                    if 'Gene-commentary_comment' in Entrez_Comments:
                        for comments in Entrez_Comments['Gene-commentary_comment']:
                            # print(comments)  
                            if 'Gene-commentary_products' in comments:
                                for product_per_assembly in comments['Gene-commentary_products']:                                
                                    if product_per_assembly['Gene-commentary_heading'] == 'mRNA Sequence':
                                        mRNA = product_per_assembly['Gene-commentary_accession']
                                        if mRNA == transcript_sequence_ID:  
                                            for protein_per_transcript_per_assemlby in product_per_assembly['Gene-commentary_products']:
                                                protein_sequence_ID = protein_per_transcript_per_assemlby['Gene-commentary_accession']
                                                protein_handle = Entrez.efetch(db="protein", id=protein_sequence_ID, rettype="fasta")
                                                protein_record = protein_handle.read()
                                                protein_sequence = ''.join(protein_record.split('\n')[1:])  
                                                uniprotID_list = []
                                                for protein_comments in protein_per_transcript_per_assemlby['Gene-commentary_comment']:
                                                    # print(protein_comments['Gene-commentary_heading'])
                                                    if protein_comments['Gene-commentary_heading'] == 'UniProtKB':
                                                        # print(protein_comments)
                                                        for uniprotID_protein_per_assembly in protein_comments['Gene-commentary_comment'][0]['Gene-commentary_source']:
                                                            protein_uniprot_id = uniprotID_protein_per_assembly['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']['Object-id_str']
                                                            uniprotID_list.append(protein_uniprot_id)
                                                subcellular_locations_list = []
                                                for UniprotID in uniprotID_list:
                                                    name, weight, length, annotation_score, existance, subcellular_locations_list, tissue_specificity = UniProt_Data_Request(UniprotID)   
                                                    print("PROTEINS: ", UniprotID, "-----", name, "-----", existance, "-----", annotation_score, "-----", length, "-----", protein_sequence[0:25], "-----", weight, "kDa -----", subcellular_locations_list, "-----", tissue_specificity, "-----", protein_sequence_ID)                         
                                    else:       
                                            
                                        if 'Gene-commentary_products' in product_per_assembly:
                                            # print(product_per_assembly)  
                                            for transcript_per_assembly in product_per_assembly['Gene-commentary_products']:      
                                                mRNA = transcript_per_assembly['Gene-commentary_accession']      
                                                # print(mRNA, transcript_sequence_ID, transcript_per_assembly)        
                                                if mRNA == transcript_sequence_ID:                                                  
                                                    for protein_per_transcript_per_assemlby in transcript_per_assembly['Gene-commentary_products']:
                                                        # print(2)
                                                        protein_sequence_ID = protein_per_transcript_per_assemlby['Gene-commentary_accession']
                                                        protein_handle = Entrez.efetch(db="protein", id=protein_sequence_ID, rettype="fasta")
                                                        protein_record = protein_handle.read()
                                                        protein_sequence = ''.join(protein_record.split('\n')[1:])  
                                                        uniprotID_list = []
                                                        for protein_comments in protein_per_transcript_per_assemlby['Gene-commentary_comment']:
                                                            
                                                            # print(protein_comments['Gene-commentary_heading'])
                                                            if protein_comments['Gene-commentary_heading'] == 'UniProtKB':
                                                                for uniprotID_protein_per_assembly in protein_comments['Gene-commentary_comment'][0]['Gene-commentary_source']:
                                                                    protein_uniprot_id = uniprotID_protein_per_assembly['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']['Object-id_str']
                                                                    uniprotID_list.append(protein_uniprot_id)
                                                        subcellular_locations_list = []
                                                        for UniprotID in uniprotID_list:
                                                            name, weight, length, annotation_score, existance, subcellular_locations_list, tissue_specificity = UniProt_Data_Request(UniprotID)   
                                                            print("PROTEINS: ", UniprotID, "-----", name, "-----", existance, "-----", annotation_score, "-----", length, "-----", protein_sequence[0:25], "-----", weight, "kDa -----", subcellular_locations_list, "-----", tissue_specificity, "-----", protein_sequence_ID)                         

    print("-----------------------------------------------------------------------------------------")
    counter += 1

    # if counter % 1000 == 0:
    #     data = {
    #         'NCBI_ID': ncbi_geneID_list,
    #         'Gene_Name': ncbi_geneName_list,
    #         'NP_Name': ncbi_protein_NP_weirdName,
    #         'Uniprot_ID': uniprot_IDs,
    #         'Ortholog_Genes': orthologs_genes,
    #         'Ortholog_Organisms': orthologs_organism,
    #         'Exon Number': ncbi_gene_exon_size,
    #         'Sequence': aa_sequences
    #     }
    #     df = pd.DataFrame(data)
    #     file_path = 'GeneIDs_' + str(counter) + '.xlsx'
    #     df.to_excel(file_path, index=False)


0 |GENE:  Scap ----- [] ----- 100689048 ----- SREBF chaperone ----- This gene encodes a prote ... ----- ['GO:0005789', 'GO:000013 ... ----- [] ----- 22 ----- 4 ----- plus ----- 22937 ----- 235623
ASSEMBLY SPECIFC GENE: S1, S2,  100689048 ----- NW_003614047 ----- 910892 ----- 941247 ----- NM_001244036 ----- Reference CriGri_1.0 Primary Assembly
TRANSCRIPTS:  NM_001244036 ----- NP_001230965 ----- GCGGCGGAGCGGGAGGGGAAAGGTA ----- Reference CriGri_1.0 Primary Assembly
Total matches: 22
Exon information saved to '../Notebooks/Website Data/NM_001244036-EXONS.txt'
PROTEINS:  G3HKR2 ----- Sterol regulatory element-binding protein cleavage-activating protein ----- 3: Inferred from homology ----- 2.0 ----- 1276 ----- MTLTERLREKISQAFYNHGLLCASY ----- 139528 kDa ----- ['Cytoplasmic vesicle, COPII-coated vesicle membrane', 'Endoplasmic reticulum membrane', 'Golgi apparatus membrane'] ----- [] ----- NP_001230965
PROTEINS:  P97260 ----- Sterol regulatory element-binding protein cleavage-activating prot

In [59]:
# Column name -> per-gene values for the exported "Genes" sheet.
# The *_List objects are defined elsewhere in the notebook (not in this cell);
# presumably they are equal-length lists with one entry per gene — confirm,
# since pd.DataFrame raises on list values of differing lengths.
# Keyword order below determines the column order in the output file.
data = dict(
    Gene_Symbol=Gene_Symbol_List,
    Gene_Synonyms=Gene_Synonyms_List,
    Gene_Name=Gene_Name_List,
    Gene_Description=Gene_Description_List,
    GO_Terms=GO_Terms_List,
    COGs=COGs_List,
    Exon_Count=Exon_Count_List,
    Chromosome=Chromosome_List,
    Strand=Strand_List,
    Human_Ortholog_Entrez=Human_Ortholog_Entrez_List,
    Mouse_Ortholog_Entrez=Mouse_Ortholog_Entrez_List,
    Assembly_Specific_Gene_Symbol=Assembly_Specific_Gene_Symbol_List,
    Gene_References=Gene_References_List,
)

# Build the table and write it to the 'Genes' sheet of the website workbook,
# omitting the DataFrame index column.
df = pd.DataFrame(data)
df.to_excel('../Notebooks/Website Data/website_data.xlsx', sheet_name='Genes', index=False)


#### Combine different DataFrames

In [9]:
import pandas as pd

# Merge the chunked gene exports 'GeneIDs_1000.xlsx' ... 'GeneIDs_34000.xlsx'
# (one file per step of 1000 in the range below) into a single workbook.
# NOTE(review): these files are presumably the checkpoints written by the
# (currently commented-out) 'GeneIDs_<counter>.xlsx' export in the retrieval
# loop — confirm they exist in the working directory before running.
#
# Every chunk is read once and collected in a list so pd.concat runs a single
# time. Concatenating onto a growing DataFrame inside the loop re-copies all
# previously loaded rows on each iteration and relies on concatenation with an
# empty frame, which recent pandas versions deprecate for dtype resolution.
gene_chunks = []
for i in range(1000, 35000, 1000):
    Genes_temp = pd.read_excel('GeneIDs_' + str(i) + '.xlsx')
    gene_chunks.append(Genes_temp)

# ignore_index=True renumbers the rows 0..n-1 across all chunks.
All_Genes = pd.concat(gene_chunks, ignore_index=True)
All_Genes.to_excel('All_Genes.xlsx', index=False)