# Whole Cell Network Reconstruction in CHO Cells
The following notebook retrieves and updates information in the "Whole Cell Network Reconstruction for CHO Cells" Google Sheet file.

### 1. Access and retrieve information from the Google Sheet file through the Google Sheet API
Using the gspread library we can access the Google Sheet file and create a pandas df to visualize it.

In [1]:
import gspread
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

from google_sheet import GoogleSheet
from utils import get_gene_info



In [2]:
KEY_FILE_PATH = 'credentials.json'
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Initialize the GoogleSheet object
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from the Google Sheet
sheet_rxns = 'Rxns'
sheet_genes = 'Genes'

rxns = sheet.read_google_sheet(sheet_rxns)
genes = sheet.read_google_sheet(sheet_genes)

In [3]:
rxns

Unnamed: 0,Curated,Reaction,Reaction Name,Reaction Formula,Subsystem,GPR_hef,GPR_fou,GPR_yeo,GPR_Recon3D,GPR_final,Conf. Score,Curation Notes,References
0,PD,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
1,PD,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
2,PD,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
3,PD,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
4,PD,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11379,Than,SPHINGStg,Sphingosine Intracellular Transport,sphings_c <=> sphings_g,"TRANSPORT, GOLGI APPARATUS",,,,,,,,
11380,PD,XOLEST226tl,Intracellular Transport of Cholesteryl Docosah...,xolest226_hs_c <=> xolest226_hs_l,"Transport, lysosomal",,,,,,,,
11381,PD,XOLEST226_HSte,"Transport of Cholesteryl Docosahexanoate, Chol...",xolest226_hs_e <=> xolest226_hs_c,"Transport, extracellular",,,,,,,,
11382,PD,XOLEST205tl,Intracellular Transport of 1-Timnodnoyl-Choles...,xolest205_hs_c <=> xolest205_hs_l,"Transport, lysosomal",,,,,,,,


### 2. Add information to the "Genes" sheet

Using a list of all the genes included in the dataset we can retrieve information from the NIH database regarding Gene Symbol, Gene Name, Gene Ensembl ID, and mRNA ID and protein ID

In [4]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

gene_list = []
for index, row in rxns.iterrows():
    if row['GPR_final'] != '':
        gpr = str(row['GPR_final'])
        num = re.findall(r'\d+', gpr)
        for n in num:
            gene_list.append(n)
        
gene_list = list(set(gene_list))

In [5]:
gene_list

['100752960',
 '100770451',
 '100757447',
 '100761807',
 '100764470',
 '170685',
 '107977085',
 '100752414',
 '100767265',
 '103158533',
 '100760366',
 '100769119',
 '100753214',
 '100755147',
 '100689330',
 '100753201',
 '100770447',
 '100773432',
 '103158535',
 '100761287',
 '100762873',
 '100769028',
 '100774016',
 '100751275',
 '100764351',
 '100754654',
 '100760370',
 '100751157',
 '100758490',
 '100752162',
 '100753172',
 '100761001',
 '100760805',
 '100772629',
 '100689022',
 '100753680',
 '100767681',
 '100769024',
 '9426',
 '3100755534',
 '100770562',
 '103159036',
 '100751831',
 '100773213',
 '100770177',
 '100288072',
 '100759203',
 '100753866',
 '100767554',
 '100773089',
 '100760870',
 '100766001',
 '100764752',
 '100750766',
 '100755812',
 '100752583',
 '100758565',
 '100763073',
 '84265',
 '100770206',
 '100756898',
 '100760429',
 '100774338',
 '100764110',
 '100758873',
 '100761516',
 '6259',
 '100756607',
 '100770104',
 '100767615',
 '100762876',
 '100763332',
 '100689

In [7]:
import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Set the maximum number of retries

for g in tqdm(gene_list):
    if g not in list(genes['Gene Entrez ID']):
        retry = True
        retries = 0
        while retry:
            try:
                print(g)
                gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
                new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                                'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                                'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
                new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
                genes = pd.concat([genes, new_row_df])
                retry = False
            except ValidationError:
                print(f'Gene {g} not found')
                retry = False
                continue
            except HTTPError:
                retries += 1
                if retries > max_retries:
                    print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                    break
                else:
                    print(f'HTTP Error, retrying with gene {g}')
                    time.sleep(1)

  0%|          | 0/3235 [00:00<?, ?it/s]

3100755534
Gene 3100755534 not found
12100769644
Gene 12100769644 not found
7100762626
Gene 7100762626 not found
75100768098
Gene 75100768098 not found
33100766715
Gene 33100766715 not found
28100761226
Gene 28100761226 not found
0
HTTP Error, retrying with gene 0
0
HTTP Error, retrying with gene 0
0
HTTP Error, retrying with gene 0
0
HTTP Error, retrying with gene 0
0
HTTP Error, retrying with gene 0
0
HTTP Error, reached maximum retries (5) for gene 0
100763623
100757874
100759316
100762475
Gene 100762475 does not have PICR Ensembl ID
4056
100689446
100751465
Gene 100751465 does not have PICR Ensembl ID
100757047
100689424
100765354
100750910
100768660
100756363
100759847
100769695
100771995
100759026
103159088
Gene 103159088 does not have PICR Ensembl ID
100760125
100762088
7381
100757481
100762673
5319
158584
100757928
100773612
100773003
100774095
HTTP Error, retrying with gene 100774095
100774095
100750488
100760329
113830817
Gene 113830817 does not have PICR Ensembl ID
100689237

100756945
100758921
100772148
23411
2525
100755205
100760922
66002
100755684
Gene 100755684 does not have PICR Ensembl ID
100761551
100770087
100750559
124
100765772
100767370
100762619
100756769
100772118
100763633
100771275
100760683
100756336
100754771
100774416
653505
100774301
100758107
100772907
100771266
100771250
100767305
100772790
100750718
56267
23318
100752729
100761747
100770363
100756929
100751979
100767638
100756958
100762089
100765874
100752094
100689242
103159960
Gene 103159960 does not have PICR Ensembl ID
Gene 103159960 does not have mRNA id and protein id
100760691
Gene 100760691 does not have PICR Ensembl ID
100761486
100689382
Gene 100689382 does not have PICR Ensembl ID
100757710
100689360
100771257
Gene 100771257 does not have PICR Ensembl ID
100772304
100762326
100764171
100771287
100772409
100760215
Gene 100760215 does not have PICR Ensembl ID
100752307
79966
100772418
100756894
100752265
100767789
100751506
Gene 100751506 does not have PICR Ensembl ID
1007581

100760271
100756840
100762955
100774284
100773935
100758803
100764866
100751194
100762975
100762480
100764538
100768251
100774594
7332
100767270
100758937
100689351
100764208
2940
100754784
100753443
100754286
100774348
55089
143162
100757350
8908
133688
100772776
Gene 100772776 does not have PICR Ensembl ID
9917
1735
100766128
100689076
100765660
100762297
55284
100772932
100751113
29082
100753200
100764519
100751432
100765960
100773624
100773287
100770099
100753249
100766179
100755025
100755838
100765870
100752040
Gene 100752040 does not have PICR Ensembl ID
5568
3979179
Gene 3979179 does not have PICR Ensembl ID
Gene 3979179 does not have info on human ortholog description
100758512
100770352
100752064
100769018
2280
100757349
100765193
100765192
100760513
100768067
100771156
8681
100762536
100767598
100763383
100753123
Gene 100753123 does not have PICR Ensembl ID
100773810
103159043
Gene 103159043 does not have PICR Ensembl ID
100755521
100768978
100772999
100772985
100764792
92483

In [8]:
genes

Unnamed: 0,Gene Entrez ID,Gene Symbol,Gene Name,Gene Description,PICR Ensembl ID,CHOK1GS Ensembl ID,Transcript ID,Protein ID,GO Terms
0,100774298,Slco1b3,solute carrier organic anion transporter famil...,This gene encodes a liver-specific member of t...,ENSCGRG00015012813,ENSCGRG00001021568,XM_035456357,XP_035312248,"['GO:0055085', 'GO:0022857', 'GO:0005515', 'GO..."
1,100754260,Hsd3b7,"hydroxy-delta-5-steroid dehydrogenase, 3 beta-...",This gene encodes an enzyme which is involved ...,ENSCGRG00015024104,ENSCGRG00001010375,XM_003510980,XP_003511028,"['GO:0016020', 'GO:0016491', 'GO:0016616', 'GO..."
2,100769024,LOC100769024,aromatase,This gene encodes a member of the cytochrome P...,ENSCGRG00015026385,ENSCGRG00001015094,XM_003508783,XP_003508831,"['GO:0016020', 'GO:0046872', 'GO:0016491', 'GO..."
3,100756356,Atp2a3,ATPase sarcoplasmic/endoplasmic reticulum Ca2+...,This gene encodes one of the SERCA Ca(2+)-ATPa...,ENSCGRG00015000763,ENSCGRG00001017225,XM_035458322,XP_035314213,"['GO:0005783', 'GO:0016529', 'GO:0044325', 'GO..."
4,100774660,Pgls,6-phosphogluconolactonase,Enables 6-phosphogluconolactonase activity. In...,ENSCGRG00015004513,ENSCGRG00001024276,XM_027428449,XP_027284250,"['GO:0016787', 'GO:0005975', 'GO:0017057', 'GO..."
...,...,...,...,...,...,...,...,...,...
3226,100757245,Ndufv1,NADH:ubiquinone oxidoreductase core subunit V1,The mitochondrial respiratory chain provides e...,ENSCGRG00015019323,ENSCGRG00001015739,XM_003509924,XP_003509972,"['GO:0016020', 'GO:0046872', 'GO:0022900', 'GO..."
3227,2527,FUT5,fucosyltransferase 5,Enables 3-galactosyl-N-acetylglucosaminide 4-a...,ENSG00000130383,,NM_002034,NP_002025,
3228,100764156,Slc39a6,solute carrier family 39 member 6,Zinc is an essential cofactor for hundreds of ...,ENSCGRG00015022771,ENSCGRG00001014525,XM_003500808,XP_003500856,"['GO:0005886', 'GO:0005783', 'GO:0005385', 'GO..."
3229,348158,ACSM2B,acyl-CoA synthetase medium chain family member 2B,Enables benzoate-CoA ligase activity. Predicte...,ENSG00000066813,,NM_001410902,NP_001397831,


In [9]:
sheet.update_google_sheet(sheet_genes, genes)
print("Google Sheet updated.")

Google Sheet updated.


In [None]:
for g in tqdm(gene_list):
    if g not in list(genes['Gene Entrez ID']):
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
            new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                            'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                            'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
            new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
            genes = pd.concat([genes, new_row_df])
        except KeyError:
            print(f'Gene {g} not found')
            continue

In [None]:
# Fetch information from the NIH database


# Complete null or blank information in the already generated "Genes Sheet" dataset
for i,row in genes.iterrows():
    if row['Gene Entrez ID'] == '':
        for g in gene_list:
            gene_sheet_list = [str(x) for x in genes['Gene Entrez ID']]
            if g not in gene_sheet_list:
                gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(g)
                row['Gene Entrez ID'] = g
                row['Gene Symbol'] = gene_symbol
                row['Gene Name'] = gene_name
                row['Gene Description'] = gene_description
                row['PICR Ensembl ID'] = picr_ensembl_id
                row['CHOK1GS Ensembl ID'] = chok1gs_ensembl_id
                row['Transcript ID'] = mRNA_ncbi_id
                row['Protein ID'] = protein_ncbi_id
                row['GO Terms'] = go_terms
                break          
    elif row['Gene Entrez ID'] != '' and (row['Gene Symbol'] == '' or row['Gene Name'] == '' or row['PICR Ensembl ID'] == '' or row['Transcript ID'] == '' or row['Protein ID'] == ''):
        row['Gene Symbol'] = gene_symbol
        row['Gene Name'] = gene_name
        row['Gene Description'] = gene_description
        row['PICR Ensembl ID'] = picr_ensembl_id
        row['CHOK1GS Ensembl ID'] = chok1gs_ensembl_id
        row['Transcript ID'] = mRNA_ncbi_id
        row['Protein ID'] = protein_ncbi_id
        row['GO Terms'] = go_terms
'''
# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for g in gene_list:
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue
            
'''

In [None]:
genes

In [None]:
get_gene_info('100750772')

In [None]:
from Bio import Entrez
Entrez.email = 'account1@theta-ocean-377718.iam.gserviceaccount.com'
handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
record = Entrez.read(handle)[0]

#gene_name = record['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
#gene_symbol = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

human_handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
human_record = Entrez.read(human_handle)[0]
if 'Entrezgene_comments' in human_record and 'Gene-commentary_comment' in human_record['Entrezgene_comments'][0]:
    human_gene_description = human_record['Entrezgene_comments'][0]['Gene-commentary_comment'][0]['String']
elif 'Entrezgene_summary' in human_record:
    human_gene_description = human_record['Entrezgene_summary']
else:
    human_gene_description = ''