# Genes
The following notebook retrieves all the genes involved in the reactions from the **"Rxns" Sheet** in Google Sheet file. Then, information regarding the genes is retrieved from different databases and a **"Genes" Sheet** is generated and updated.

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

from google_sheet import GoogleSheet
from utils import get_gene_info

### 1. Generate "rxns" and "genes" datasets
The "rxns" dataset contains all the reactions with their GPR associations. The "genes" dataset contains all the information of the genes/GPR involved in our reconstruction.

In [2]:
KEY_FILE_PATH = 'credentials.json'
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Initialize the GoogleSheet object
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from the Google Sheet
sheet_rxns = 'Rxns'
sheet_genes = 'Genes'

rxns = sheet.read_google_sheet(sheet_rxns)
genes = sheet.read_google_sheet(sheet_genes)

In [3]:
rxns

Unnamed: 0,Package,Curated,Reaction,Reaction Name,Reaction Formula,Subsystem,GPR_hef,GPR_fou,GPR_yeo,GPR_Recon3D,GPR_final,Conf. Score,Curation Notes,References
0,7,PD,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
1,7,PD,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
2,7,PD,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
3,7,PD,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
4,7,PD,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10513,22,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,"TRANSPORT, EXTRACELLULAR",,,100757617,100757617,100757617,,,
10514,22,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,TRANSPORT,,,100757617,100757617,100757617,,,
10515,22,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,TRANSPORT,,,,,,,,
10516,22,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,TRANSPORT,,,,,,,,


### 2. Create the a "gene_list" with all the genes involved in our reconstruction
We extract the gene IDs from the GPR annotations in the **"rxns" df**. This information is only retrieved from the **GPR_final** column.

In [4]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

gene_list = []
for index, row in rxns.iterrows():
    if row['GPR_final'] != '':
        gpr = str(row['GPR_final'])
        num = re.findall(r'\d+', gpr)
        for n in num:
            gene_list.append(n)
        
gene_list = list(set(gene_list))

In [5]:
gene_list

['100759929',
 '100770679',
 '100760583',
 '100755602',
 '100752040',
 '100764837',
 '100757439',
 '100754600',
 '1645',
 '100769124',
 '100766362',
 '100762222',
 '100772743',
 '100753866',
 '100772319',
 '100751672',
 '100767405',
 '100752689',
 '100762788',
 '100753123',
 '100773215',
 '100689187',
 '100754017',
 '100755384',
 '100763756',
 '100773901',
 '100766256',
 '100765865',
 '100760126',
 '100753294',
 '100765441',
 '100774781',
 '100756257',
 '100760162',
 '100772193',
 '100757273',
 '100771326',
 '100759885',
 '100769516',
 '100758520',
 '100774594',
 '253175',
 '100752731',
 '100774957',
 '100759109',
 '100767837',
 '100755154',
 '100758055',
 '100767722',
 '100762475',
 '92483',
 '5481',
 '100752019',
 '100775017',
 '100766578',
 '100771604',
 '100775000',
 '100754832',
 '100761589',
 '100767978',
 '100763034',
 '100762628',
 '100758629',
 '100766693',
 '9588',
 '100760728',
 '100765858',
 '100768097',
 '100761613',
 '100765887',
 '100766128',
 '100762505',
 '100756772',


### 3. Add the genes from the "gene_list" to the genes df
Using a list of all the genes included in the dataset we can retrieve information from the NIH database regarding Gene Symbol, Gene Name, Gene Ensembl ID, and mRNA ID and protein ID using the function get_gene_info().

In [6]:
import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Set the maximum number of retries

for g in tqdm(gene_list):
    if g not in list(genes['Gene Entrez ID']):
        retry = True
        retries = 0
        while retry:
            try:
                print(g)
                organism, gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
                if organism == 'Cricetulus griseus':
                    new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                                    'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                                    'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
                    new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
                    genes = pd.concat([genes, new_row_df])
                    retry = False
                elif organism == 'Homo sapiens':
                    print(f'Gene {g,gene_symbol} is a Human Gene)
                    continue
            except ValidationError:
                print(f'Gene {g} not found')
                retry = False
                continue
            except HTTPError:
                retries += 1
                if retries > max_retries:
                    print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                    break
                else:
                    print(f'HTTP Error, retrying with gene {g}')
                    time.sleep(1)

  0%|          | 0/3193 [00:00<?, ?it/s]

### 4. Eliminate unwanted genes
We iterate over the entire dataset to spot **Human genes** in order to eliminate them

In [7]:
# Make a copy of the DataFrame to avoid changing the original while iterating
genes_copy = genes.copy()

for index, row in tqdm(genes.iterrows(), total=genes.shape[0]):
    g = row['Gene Entrez ID']
    retry = True
    retries = 0
    while retry:
        try:
            organism, gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
            if organism == 'Homo sapiens':
                genes_copy = genes_copy.drop(index)
                print(f'Gene {g,gene_symbol} erased from dataset')
                retry = False
            else:
                retry = False
        except ValidationError:
            print(f'Gene {g} not found')
            retry = False
            continue
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            else:
                print(f'HTTP Error, retrying with gene {g}')
                time.sleep(1)

# Replace the original DataFrame with the updated one
genes = genes_copy


  0%|          | 0/3245 [00:00<?, ?it/s]

HTTP Error, retrying with gene 100774660
HTTP Error, retrying with gene 100754978
Gene 100757367 does not have PICR Ensembl ID
Gene 100761245 does not have PICR Ensembl ID
Gene 100761523 does not have PICR Ensembl ID
Gene 100770009 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100771661
Gene 100774679 does not have PICR Ensembl ID
Gene ('7367', 'UGT2B17') erased from dataset
HTTP Error, retrying with gene 100774732
Gene 100771298 does not have PICR Ensembl ID
Gene 100753222 does not have PICR Ensembl ID
Gene ('387775', 'SLC22A10') erased from dataset
Gene ('6697', 'SPR') erased from dataset
Gene ('3939', 'LDHA') erased from dataset
HTTP Error, retrying with gene 100768923
HTTP Error, retrying with gene 100769585
HTTP Error, retrying with gene 100759906
HTTP Error, retrying with gene 100752211
Gene ('6783', 'SULT1E1') erased from dataset
HTTP Error, retrying with gene 100751305
Gene 100689294 does not have PICR Ensembl ID
Gene ('641371', 'ACOT1') erased from dataset
Gene 

Gene ('5478', 'PPIA') erased from dataset
Gene 102724197 does not have PICR Ensembl ID
Gene ('102724197', 'LOC102724197') erased from dataset
HTTP Error, retrying with gene 100754861
Gene ('167127', 'UGT3A2') erased from dataset
Gene ('81888', 'HYI') erased from dataset
Gene ('2939', 'GSTA2') erased from dataset
HTTP Error, retrying with gene 100689402
Gene ('1734', 'DIO2') erased from dataset
Gene 100766959 does not have PICR Ensembl ID
Gene ('5794', 'PTPRH') erased from dataset
Gene ('7363', 'UGT2B4') erased from dataset
HTTP Error, retrying with gene 100756277
Gene 100761051 does not have PICR Ensembl ID
Gene ('55867', 'SLC22A11') erased from dataset
Gene ('8418', 'CMAHP') erased from dataset
Gene ('64219', 'PJA1') erased from dataset
Gene ('55149', 'MTPAP') erased from dataset
Gene ('1514', 'CTSL') erased from dataset
Gene 103164072 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100754380
Gene 100754380 does not have PICR Ensembl ID
Gene 100757365 does not have PICR E

Gene ('440138', 'ALG11') erased from dataset
HTTP Error, retrying with gene 100768417
HTTP Error, retrying with gene 100758721
Gene 103158809 does not have PICR Ensembl ID
Gene ('8287', 'USP9Y') erased from dataset
Gene ('25828', 'TXN2') erased from dataset
HTTP Error, retrying with gene 100757648
Gene 100762656 does not have PICR Ensembl ID
Gene 100765374 does not have PICR Ensembl ID
Gene 100758292 does not have PICR Ensembl ID
Gene ('158160', 'HSD17B7P2') erased from dataset
Gene ('79872', 'CBLL1') erased from dataset
Gene 113831071 does not have PICR Ensembl ID
Gene ('3176', 'HNMT') erased from dataset
Gene 100762645 does not have PICR Ensembl ID
Gene ('60496', 'AASDHPPT') erased from dataset
Gene ('79868', 'ALG13') erased from dataset
Gene 103161868 does not have PICR Ensembl ID
Gene ('60559', 'SPCS3') erased from dataset
Gene 103162274 does not have PICR Ensembl ID
Gene 100773441 does not have PICR Ensembl ID
Gene 100689435 does not have PICR Ensembl ID
Gene 100761532 does not ha

Gene ('10295', 'BCKDK') erased from dataset
Gene ('50484', 'RRM2B') erased from dataset
Gene 100757101 does not have PICR Ensembl ID
Gene 100761392 does not have PICR Ensembl ID
Gene 100763404 does not have PICR Ensembl ID
Gene ('8972', 'MGAM') erased from dataset
Gene 100773723 does not have PICR Ensembl ID
Gene ('10105', 'PPIF') erased from dataset
Gene 107977589 does not have PICR Ensembl ID
Gene 100764001 does not have PICR Ensembl ID
Gene 100765177 does not have PICR Ensembl ID
Gene ('220074', 'LRTOMT') erased from dataset
Gene 100766800 does not have PICR Ensembl ID
Gene 100757865 does not have PICR Ensembl ID
HTTP Error, retrying with gene 100759274
Gene ('79603', 'CERS4') erased from dataset
Gene ('5440', 'POLR2K') erased from dataset
Gene ('51619', 'UBE2D4') erased from dataset
Gene ('8644', 'AKR1C3') erased from dataset
HTTP Error, retrying with gene 100752584
Gene ('30', 'ACAA1') erased from dataset
Gene ('85363', 'TRIM5') erased from dataset
Gene 100756695 does not have PIC

In [8]:
genes

Unnamed: 0,Gene Entrez ID,Gene Symbol,Gene Name,Gene Description,PICR Ensembl ID,CHOK1GS Ensembl ID,Transcript ID,Protein ID,GO Terms,Uniprot ID
0,100774298,Slco1b3,solute carrier organic anion transporter famil...,This gene encodes a liver-specific member of t...,ENSCGRG00015012813,ENSCGRG00001021568,XM_035456357,XP_035312248,"['GO:0055085', 'GO:0022857', 'GO:0005515', 'GO...",
1,100754260,Hsd3b7,"hydroxy-delta-5-steroid dehydrogenase, 3 beta-...",This gene encodes an enzyme which is involved ...,ENSCGRG00015024104,ENSCGRG00001010375,XM_003510980,XP_003511028,"['GO:0016020', 'GO:0016491', 'GO:0016616', 'GO...",G3I7C5
2,100769024,LOC100769024,aromatase,This gene encodes a member of the cytochrome P...,ENSCGRG00015026385,ENSCGRG00001015094,XM_003508783,XP_003508831,"['GO:0016020', 'GO:0046872', 'GO:0016491', 'GO...",A0A061IC63
3,100756356,Atp2a3,ATPase sarcoplasmic/endoplasmic reticulum Ca2+...,This gene encodes one of the SERCA Ca(2+)-ATPa...,ENSCGRG00015000763,ENSCGRG00001017225,XM_035458322,XP_035314213,"['GO:0005783', 'GO:0016529', 'GO:0044325', 'GO...",
4,100774660,Pgls,6-phosphogluconolactonase,Enables 6-phosphogluconolactonase activity. In...,ENSCGRG00015004513,ENSCGRG00001024276,XM_027428449,XP_027284250,"['GO:0016787', 'GO:0005975', 'GO:0017057', 'GO...",
...,...,...,...,...,...,...,...,...,...,...
3240,100759306,LOC100759306,UDP-glucose 4-epimerase,,,,XM_003515945,XP_003515993,,
3241,100755423,LOC100755423,UDP-glucuronosyltransferase 1-6,,,,XM_016980970,XP_016836459,,
3242,100751278,Slc35d1,solute carrier family 35 member D1,,,,XM_007644949,XP_007643139,,
3243,100770830,Csgalnact2,chondroitin sulfate N-acetylgalactosaminyltran...,,,,XM_035457056,XP_035312947,,


In [9]:
sheet.update_google_sheet(sheet_genes, genes)
print("Google Sheet updated.")

Google Sheet updated.


In [None]:
for g in tqdm(gene_list):
    if g not in list(genes['Gene Entrez ID']):
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
            new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                            'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                            'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
            new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
            genes = pd.concat([genes, new_row_df])
        except KeyError:
            print(f'Gene {g} not found')
            continue

In [None]:
# Fetch information from the NIH database


# Complete null or blank information in the already generated "Genes Sheet" dataset
for i,row in genes.iterrows():
    if row['Gene Entrez ID'] == '':
        for g in gene_list:
            gene_sheet_list = [str(x) for x in genes['Gene Entrez ID']]
            if g not in gene_sheet_list:
                gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
                row['Gene Entrez ID'] = g
                row['Gene Symbol'] = gene_symbol
                row['Gene Name'] = gene_name
                row['Gene Description'] = gene_description
                row['PICR Ensembl ID'] = picr_ensembl_id
                row['CHOK1GS Ensembl ID'] = chok1gs_ensembl_id
                row['Transcript ID'] = mRNA_ncbi_id
                row['Protein ID'] = protein_ncbi_id
                row['GO Terms'] = go_terms
                break          
    elif row['Gene Entrez ID'] != '' and (row['Gene Symbol'] == '' or row['Gene Name'] == '' or row['PICR Ensembl ID'] == '' or row['Transcript ID'] == '' or row['Protein ID'] == ''):
        row['Gene Symbol'] = gene_symbol
        row['Gene Name'] = gene_name
        row['Gene Description'] = gene_description
        row['PICR Ensembl ID'] = picr_ensembl_id
        row['CHOK1GS Ensembl ID'] = chok1gs_ensembl_id
        row['Transcript ID'] = mRNA_ncbi_id
        row['Protein ID'] = protein_ncbi_id
        row['GO Terms'] = go_terms
'''
# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for g in gene_list:
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue
            
'''

In [None]:
genes

In [None]:
get_gene_info('100750772')

In [None]:
from Bio import Entrez
Entrez.email = 'account1@theta-ocean-377718.iam.gserviceaccount.com'
handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
record = Entrez.read(handle)[0]

#gene_name = record['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
#gene_symbol = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

human_handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
human_record = Entrez.read(human_handle)[0]
if 'Entrezgene_comments' in human_record and 'Gene-commentary_comment' in human_record['Entrezgene_comments'][0]:
    human_gene_description = human_record['Entrezgene_comments'][0]['Gene-commentary_comment'][0]['String']
elif 'Entrezgene_summary' in human_record:
    human_gene_description = human_record['Entrezgene_summary']
else:
    human_gene_description = ''