# Genes
The following notebook retrieves all the genes involved in the reactions from the **"Rxns" Sheet** in Google Sheet file. Then, information regarding the genes is retrieved from different databases and a **"Genes" Sheet** is generated and updated.

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

from google_sheet import GoogleSheet
from utils import get_gene_info

### 1. Generate "rxns" and "genes" datasets
The "rxns" dataset contains all the reactions with their GPR associations. The "genes" dataset contains all the information of the genes/GPR involved in our reconstruction.

In [2]:
KEY_FILE_PATH = 'credentials.json'
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Initialize the GoogleSheet object
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from the Google Sheet
sheet_rxns = 'Rxns'
sheet_genes = 'Genes'

rxns = sheet.read_google_sheet(sheet_rxns)
genes = sheet.read_google_sheet(sheet_genes)

In [3]:
rxns

Unnamed: 0,Package,Curated,Reaction,Reaction Name,Reaction Formula,Subsystem,GPR_hef,GPR_fou,GPR_yeo,GPR_Recon3D,GPR_final,Conf. Score,Curation Notes,References
0,7,PD,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
1,7,PD,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
2,7,PD,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
3,7,PD,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
4,7,PD,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10513,22,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,"TRANSPORT, EXTRACELLULAR",,,100757617,100757617,100757617,,,
10514,22,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,TRANSPORT,,,100757617,100757617,100757617,,,
10515,22,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,TRANSPORT,,,,,,,,
10516,22,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,TRANSPORT,,,,,,,,


### 2. Create the a "gene_list"
Using a list of all the genes included in the dataset we can retrieve information from the NIH database regarding Gene Symbol, Gene Name, Gene Ensembl ID, and mRNA ID and protein ID

In [4]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

gene_list = []
for index, row in rxns.iterrows():
    if row['GPR_final'] != '':
        gpr = str(row['GPR_final'])
        num = re.findall(r'\d+', gpr)
        for n in num:
            gene_list.append(n)
        
gene_list = list(set(gene_list))

In [5]:
gene_list

['100762846',
 '100753048',
 '2939',
 '100765201',
 '100771827',
 '100773412',
 '100762222',
 '100772617',
 '100750622',
 '100754084',
 '5300',
 '100751131',
 '100750408',
 '100768923',
 '100755925',
 '100755696',
 '100759885',
 '100772279',
 '100762968',
 '100767894',
 '100770579',
 '100771233',
 '100756392',
 '100752487',
 '100757784',
 '100774215',
 '100771294',
 '100754256',
 '100753174',
 '100758683',
 '100752295',
 '100764390',
 '100769574',
 '113836769',
 '60559',
 '100755162',
 '100755033',
 '100752454',
 '10714',
 '100689347',
 '100765313',
 '100767329',
 '100761172',
 '100768340',
 '100774878',
 '103161516',
 '100765484',
 '100759330',
 '100753765',
 '100757188',
 '100764902',
 '100772409',
 '55190',
 '100751584',
 '100772029',
 '124',
 '100770570',
 '100754177',
 '100767969',
 '56903',
 '100754688',
 '100750756',
 '100761105',
 '100752162',
 '100768220',
 '100767138',
 '100761290',
 '100769331',
 '100765556',
 '100774248',
 '100758652',
 '9789',
 '100757917',
 '5793',
 '1007

In [6]:
import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Set the maximum number of retries

for g in tqdm(gene_list):
    if g not in list(genes['Gene Entrez ID']):
        retry = True
        retries = 0
        while retry:
            try:
                print(g)
                organism, gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
                if organism == 'Cricetulus griseus':
                    new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                                    'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                                    'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
                    new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
                    genes = pd.concat([genes, new_row_df])
                    retry = False
                elif organism == 'Homo sapiens':
                    continue
            except ValidationError:
                print(f'Gene {g} not found')
                retry = False
                continue
            except HTTPError:
                retries += 1
                if retries > max_retries:
                    print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                    break
                else:
                    print(f'HTTP Error, retrying with gene {g}')
                    time.sleep(1)

  0%|          | 0/3193 [00:00<?, ?it/s]

100759470
100771487
100762380
100771350
100763538
100768154
100750340
100767827
100752522
100759306
Gene 100759306 does not have PICR Ensembl ID
100755423
100751278
100770830
100758724
HTTP Error, retrying with gene 100758724
100758724


In [None]:
# Make a copy of the DataFrame to avoid changing the original while iterating
genes_copy = genes.copy()

for index, row in tqdm(genes.iterrows()):
    g = row['Gene Entrez ID']
    retry = True
    retries = 0
    while retry:
        try:
            organism, gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
            if organism == 'Homo sapiens':
                genes_copy = genes_copy.drop(index)
                print(f'Gene {g,gene_symbol} erased from dataset')
                retry = False
            else:
                retry = False
        except ValidationError:
            print(f'Gene {g} not found')
            retry = False
            continue
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            else:
                print(f'HTTP Error, retrying with gene {g}')
                time.sleep(1)

# Replace the original DataFrame with the updated one
#genes = genes_copy


0it [00:00, ?it/s]

HTTP Error, retrying with gene 100764709
Gene 100757367 does not have PICR Ensembl ID
Gene 100761245 does not have PICR Ensembl ID
Gene 100761523 does not have PICR Ensembl ID
Gene 100770009 does not have PICR Ensembl ID
Gene 100774679 does not have PICR Ensembl ID
Gene 7367 erased from dataset
HTTP Error, retrying with gene 100772976
HTTP Error, retrying with gene 100772976
Gene 100771298 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100753139
HTTP Error, retrying with gene 100751224
Gene 100753222 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100753783
HTTP Error, retrying with gene 100753783
Gene 387775 erased from dataset
Gene 6697 erased from dataset
Gene 3939 erased from dataset
HTTP Error, retrying with gene 100752211
Gene 6783 erased from dataset
Gene 100689294 does not have PICR Ensembl ID
Gene 641371 erased from dataset
Gene 100753389 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100772055
HTTP Error, retrying with gene 83549
Gene 

HTTP Error, retrying with gene 100762537
HTTP Error, retrying with gene 100752768
Gene 9085 erased from dataset
HTTP Error, retrying with gene 100767833
HTTP Error, retrying with gene 100756992
Gene 100766810 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100765445
HTTP Error, retrying with gene 100750655
Gene 100770231 does not have PICR Ensembl ID
Gene 2169 erased from dataset
Gene 100762146 does not have PICR Ensembl ID
Gene 3979178 does not have PICR Ensembl ID
Gene 3979178 does not have info on human ortholog description
Gene 100771969 does not have PICR Ensembl ID
Gene 4836 erased from dataset
Gene 103158988 does not have PICR Ensembl ID
Gene 5438 erased from dataset
Gene 7334 erased from dataset
Gene 100766236 does not have PICR Ensembl ID
Gene 100762236 does not have PICR Ensembl ID
Gene 2243 erased from dataset
Gene 389898 erased from dataset
Gene 279 erased from dataset
Gene 100757447 does not have PICR Ensembl ID
Gene 170685 erased from dataset
Gene 107977085 d

In [8]:
sheet.update_google_sheet(sheet_genes, genes)
print("Google Sheet updated.")

Google Sheet updated.


In [None]:
for g in tqdm(gene_list):
    if g not in list(genes['Gene Entrez ID']):
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
            new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                            'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                            'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
            new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
            genes = pd.concat([genes, new_row_df])
        except KeyError:
            print(f'Gene {g} not found')
            continue

In [None]:
# Fetch information from the NIH database


# Complete null or blank information in the already generated "Genes Sheet" dataset
for i,row in genes.iterrows():
    if row['Gene Entrez ID'] == '':
        for g in gene_list:
            gene_sheet_list = [str(x) for x in genes['Gene Entrez ID']]
            if g not in gene_sheet_list:
                gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
                row['Gene Entrez ID'] = g
                row['Gene Symbol'] = gene_symbol
                row['Gene Name'] = gene_name
                row['Gene Description'] = gene_description
                row['PICR Ensembl ID'] = picr_ensembl_id
                row['CHOK1GS Ensembl ID'] = chok1gs_ensembl_id
                row['Transcript ID'] = mRNA_ncbi_id
                row['Protein ID'] = protein_ncbi_id
                row['GO Terms'] = go_terms
                break          
    elif row['Gene Entrez ID'] != '' and (row['Gene Symbol'] == '' or row['Gene Name'] == '' or row['PICR Ensembl ID'] == '' or row['Transcript ID'] == '' or row['Protein ID'] == ''):
        row['Gene Symbol'] = gene_symbol
        row['Gene Name'] = gene_name
        row['Gene Description'] = gene_description
        row['PICR Ensembl ID'] = picr_ensembl_id
        row['CHOK1GS Ensembl ID'] = chok1gs_ensembl_id
        row['Transcript ID'] = mRNA_ncbi_id
        row['Protein ID'] = protein_ncbi_id
        row['GO Terms'] = go_terms
'''
# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for g in gene_list:
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue
            
'''

In [None]:
genes

In [None]:
get_gene_info('100750772')

In [None]:
from Bio import Entrez
Entrez.email = 'account1@theta-ocean-377718.iam.gserviceaccount.com'
handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
record = Entrez.read(handle)[0]

#gene_name = record['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
#gene_symbol = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

human_handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
human_record = Entrez.read(human_handle)[0]
if 'Entrezgene_comments' in human_record and 'Gene-commentary_comment' in human_record['Entrezgene_comments'][0]:
    human_gene_description = human_record['Entrezgene_comments'][0]['Gene-commentary_comment'][0]['String']
elif 'Entrezgene_summary' in human_record:
    human_gene_description = human_record['Entrezgene_summary']
else:
    human_gene_description = ''