# Whole Cell Network Reconstruction in CHO Cells

The following notebook retrieves and updates information in the "Whole Cell Network Reconstruction for CHO Cells" Google Sheet file.

### 1. Access and retrieve information from the Google Sheet file through the Google Sheet API

Using the gspread library we can access the Google Sheet file and create a pandas df to visualize it.

In [1]:
import gspread
import pandas as pd
import numpy as np

In [2]:
# give service account details to gspread
sa = gspread.service_account(filename='credentials.json')

# sa is a gspread client, which can be used for connecting to the sheets
# by using the open method and the sheet name.
cho_recon = sa.open('temporary')

# we also need to specify the page name before getting the data. In this case we use the Rxns sheet.
rxns_sheet = cho_recon.worksheet('Rxns')

In [3]:
# visualization of all the sheets in our dataset
for sheets in cho_recon:
    print(sheets)

<Worksheet 'Info' id:0>
<Worksheet 'Rxns' id:1966089892>
<Worksheet 'Attributes' id:745769606>
<Worksheet 'Added Rxns' id:1377582373>
<Worksheet 'Genes' id:239167986>


In [4]:
# We can extract the data using the get_all_records method and create a pd DataFrame
df = pd.DataFrame(rxns_sheet.get_all_records())
df = df.set_index('Index')
df

Unnamed: 0_level_0,Curated,Reaction,Reaction Name,Reaction Formula,Subsystem,GPR_hef,GPR_fou,GPR_yeo,GPR_Recon3D,GPR_final,GPR_Final_Thanasis,Conf. Score,Curation Notes,References
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,PD,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,"TRANSPORT, LYSOSOMAL",,,,,,,1,No information available in the literature abo...,
1,PD,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,,1,No information available in the literature abo...,
2,PD,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,"TRANSPORT, LYSOSOMAL",,,,,,,1,No information available in the literature abo...,
3,PD,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,,1,No information available in the literature abo...,
4,PD,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,"TRANSPORT, LYSOSOMAL",,,,,,,1,No information available in the literature abo...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8195,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,"TRANSPORT, EXTRACELLULAR",,,100757617,100757617,,100757617,,,
8196,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,Transport,,,100757617,100757617,,100757617,,,
8197,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,Transport,,,,,,,,,
8198,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,Transport,,,,,,,,,


### 2. Add information to the "Genes" sheet

Using a list of all the genes included in the dataset we can retrieve information from the NIH database regarding Gene Symbol, Gene Name, Gene Ensembl ID, and mRNA ID and protein ID

In [5]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

gene_list = []
for index, row in df.iterrows():
    if row['GPR_Final_Thanasis'] != '':
        gpr = str(row['GPR_Final_Thanasis'])
        num = re.findall(r'\d+', gpr)
        for n in num:
            gene_list.append(n)
        
gene_list = list(set(gene_list))

In [6]:
gene_list

['100752066',
 '100761051',
 '100752162',
 '100761516',
 '100772003',
 '100752019',
 '100768444',
 '100761511',
 '103160118',
 '100774025',
 '100755501',
 '100689038',
 '100767954',
 '100689348',
 '100750761',
 '100689317',
 '100689054',
 '100770474',
 '100753910',
 '100774698',
 '100752718',
 '100689022',
 '100769994',
 '100763738',
 '100770939',
 '7366',
 '100759730',
 '100762127',
 '100757273',
 '100773805',
 '100773059',
 '100761706',
 '373156',
 '100769495',
 '100763159',
 '100756772',
 '100750893',
 '100750667',
 '100761682',
 '100766881',
 '100758244',
 '100772668',
 '100756944',
 '100757439',
 '100767831',
 '100758951',
 '100766362',
 '100773414',
 '100766778',
 '100750530',
 '100757875',
 '100766024',
 '100770347',
 '100769905',
 '100770943',
 '100758702',
 '100767620',
 '100770052',
 '100763673',
 '10965',
 '100757923',
 '100774858',
 '100762894',
 '100682532',
 '100756894',
 '100760295',
 '100751282',
 '100751648',
 '100765057',
 '100754867',
 '100759864',
 '100758336',
 '10

In [None]:
# Fetch information from the NIH database
import time
from utils import get_gene_info

# Open the Genes excel Sheet
cho_temporary = sa.open('temporary')
genes_sheet = cho_temporary.worksheet('Genes')
df = pd.DataFrame(genes_sheet.get_all_records())
df = df.set_index('Index')

# Complete null or blank information in the already generated "Genes Sheet" dataset
for i,row in df.iterrows():
    if row['Gene Entrez ID'] == '':
        for gene in gene_list:
            gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
            if gene not in gene_sheet_list:
                print(i)
                gene_symbol, gene_name, gene_description, gene_ensembl_id, mRNA_ncbi_id, protein_ncbi_id = get_gene_info(gene)
                genes_sheet.update_cell(i+1,1,i)
                time.sleep(5)
                genes_sheet.update_cell(i+1,2,gene)
                time.sleep(5)
                genes_sheet.update_cell(i+1,3,gene_symbol)
                time.sleep(5)
                genes_sheet.update_cell(i+1,4,gene_name)
                time.sleep(5)
                genes_sheet.update_cell(i+1,5,gene_description)
                time.sleep(5)
                genes_sheet.update_cell(i+1,6,gene_ensembl_id)
                time.sleep(5)
                genes_sheet.update_cell(i+1,7,mRNA_ncbi_id)
                time.sleep(5)
                genes_sheet.update_cell(i+1,8,protein_ncbi_id)
                break
    elif row['Gene Entrez ID'] != '' and (row['Gene Symbol'] == '' or row['Gene Name'] == '' or row['Gene Ensembl ID'] == '' or row['Transcript ID'] == '' or row['Protein ID'] == ''):
        print(i)
        gene_symbol, gene_name, gene_description, gene_ensembl_id, mRNA_ncbi_id, protein_ncbi_id = get_gene_info(row['Gene Entrez ID'])
        genes_sheet.update_cell(i+1,3,gene_symbol)
        time.sleep(5)
        genes_sheet.update_cell(i+1,4,gene_name)
        time.sleep(5)
        genes_sheet.update_cell(i+1,5,gene_description)
        time.sleep(5)
        genes_sheet.update_cell(i+1,6,gene_ensembl_id)
        time.sleep(5)
        genes_sheet.update_cell(i+1,7,mRNA_ncbi_id)
        time.sleep(5)
        genes_sheet.update_cell(i+1,8,protein_ncbi_id)

# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for gene in gene_list:
    df = pd.DataFrame(genes_sheet.get_all_records())
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, gene_ensembl_id, mRNA_ncbi_id, protein_ncbi_id = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,gene_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,protein_ncbi_id)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue

438
439
Google API quota exceeded
440
441
442
443
444
445
Google API quota exceeded
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
Google API quota exceeded
471
472
473
474
475
476
477
478
479
480
481
Google API quota exceeded
Google API quota exceeded
482
483
484
485
486
487
488
489
490
491
492
493
Google API quota exceeded
Google API quota exceeded
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
Google API quota exceeded
509
510
511
512
Google API quota exceeded
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
Google API quota exceeded
536
537
538
539
540
541
Google API quota exceeded
542
543
