# Whole Cell Network Reconstruction in CHO Cells
The following notebook retrieves and updates information in the "Whole Cell Network Reconstruction for CHO Cells" Google Sheet file.

### 1. Access and retrieve information from the Google Sheet file through the Google Sheet API
Using the gspread library we can access the Google Sheet file and create a pandas df to visualize it.

In [1]:
import gspread
import pandas as pd
import numpy as np

In [2]:
# give service account details to gspread
sa = gspread.service_account(filename='credentials.json')

# sa is a gspread client, which can be used for connecting to the sheets
# by using the open method and the sheet name.
cho_recon = sa.open('CHO Network Reconstruction')

# we also need to specify the page name before getting the data. In this case we use the Rxns sheet.
rxns_sheet = cho_recon.worksheet('Rxns')

In [3]:
# visualization of all the sheets in our dataset
for sheets in cho_recon:
    print(sheets)

<Worksheet 'Info' id:0>
<Worksheet 'Rxns' id:1966089892>
<Worksheet 'Attributes' id:745769606>
<Worksheet 'Added Rxns' id:1377582373>
<Worksheet 'Genes' id:239167986>
<Worksheet 'Metabolites' id:1367015881>


In [4]:
# We can extract the data using the get_all_records method and create a pd DataFrame
df = pd.DataFrame(rxns_sheet.get_all_records())
df

Unnamed: 0,Curated,Reaction,Reaction Name,Reaction Formula,Subsystem,GPR_hef,GPR_fou,GPR_yeo,GPR_Recon3D,GPR_final,Conf. Score,Curation Notes,References
0,PD,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
1,PD,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
2,PD,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
3,PD,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,"TRANSPORT, MITOCHONDRIAL",,,,,,1,No information available in the literature abo...,
4,PD,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,"TRANSPORT, LYSOSOMAL",,,,,,1,No information available in the literature abo...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8184,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,Transport,,,,,,,,
8185,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,Transport,,,,,,,,
8186,,r2539,Postulated transport reaction,L2aadp6sa_c + L2aadp_m <=> L2aadp6sa_m + L2aadp_c,Transport,,,,,,,,
8187,PD,ALLTTtm,"Allantoate transport via diffusion, mitochondria",alltt_c <=> alltt_m,"Transport, mitochondria",,,,,,1,The transport of Allantoate from the cytoplasm...,


### 2. Add information to the "Genes" sheet

Using a list of all the genes included in the dataset we can retrieve information from the NIH database regarding Gene Symbol, Gene Name, Gene Ensembl ID, and mRNA ID and protein ID

In [5]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

gene_list = []
for index, row in df.iterrows():
    if row['GPR_final'] != '':
        gpr = str(row['GPR_final'])
        num = re.findall(r'\d+', gpr)
        for n in num:
            gene_list.append(n)
        
gene_list = list(set(gene_list))

In [6]:
gene_list

['100754867',
 '100754740',
 '100769905',
 '100763642',
 '7367',
 '100755324',
 '100774930',
 '100767691',
 '100774638',
 '100763379',
 '100770009',
 '100765292',
 '100759818',
 '100773026',
 '100763166',
 '100760763',
 '100689433',
 '100755696',
 '100757464',
 '100763617',
 '100762635',
 '100762480',
 '100755697',
 '100751269',
 '100755742',
 '100774853',
 '100765390',
 '100753542',
 '100750518',
 '100766630',
 '100752960',
 '100775027',
 '100751461',
 '100771604',
 '100760747',
 '100764152',
 '100755933',
 '100760429',
 '100771787',
 '100770567',
 '100771181',
 '100763623',
 '100762304',
 '100772668',
 '100773723',
 '100766881',
 '100750644',
 '100764967',
 '100774594',
 '100769715',
 '100771257',
 '100751376',
 '100765077',
 '100767211',
 '100764570',
 '100765986',
 '100755684',
 '100769920',
 '100750622',
 '100767929',
 '100751272',
 '100689280',
 '100764594',
 '100763658',
 '100755485',
 '100760375',
 '100757130',
 '100761494',
 '100764751',
 '100752019',
 '100765639',
 '100757710

In [None]:
# Fetch information from the NIH database
import time
from utils import get_gene_info

# Open the Genes excel Sheet
cho_temporary = sa.open('CHO Network Reconstruction')
genes_sheet = cho_temporary.worksheet('Genes')
df = pd.DataFrame(genes_sheet.get_all_records())
df = df.set_index('Index')

# Complete null or blank information in the already generated "Genes Sheet" dataset
for i,row in df.iterrows():
    if row['Gene Entrez ID'] == '':
        for gene in gene_list:
            gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
            if gene not in gene_sheet_list:
                print(i)
                gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
                genes_sheet.update_cell(i+1,1,i)
                time.sleep(5)
                genes_sheet.update_cell(i+1,2,gene)
                time.sleep(5)
                genes_sheet.update_cell(i+1,3,gene_symbol)
                time.sleep(5)
                genes_sheet.update_cell(i+1,4,gene_name)
                time.sleep(5)
                genes_sheet.update_cell(i+1,5,gene_description)
                time.sleep(5)
                genes_sheet.update_cell(i+1,6,picr_ensembl_id)
                time.sleep(5)
                genes_sheet.update_cell(i+1,7,chok1gs_ensembl_id)
                time.sleep(5)
                genes_sheet.update_cell(i+1,8,mRNA_ncbi_id)
                time.sleep(5)
                genes_sheet.update_cell(i+1,9,protein_ncbi_id)
                time.sleep(5)
                genes_sheet.update_cell(i+1,10,go_terms)
                break
    elif row['Gene Entrez ID'] != '' and (row['Gene Symbol'] == '' or row['Gene Name'] == '' or row['PICR Ensembl ID'] == '' or row['Transcript ID'] == '' or row['Protein ID'] == ''):
        print(i)
        gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(row['Gene Entrez ID'])
        genes_sheet.update_cell(i+1,3,gene_symbol)
        time.sleep(5)
        genes_sheet.update_cell(i+1,4,gene_name)
        time.sleep(5)
        genes_sheet.update_cell(i+1,5,gene_description)
        time.sleep(5)
        genes_sheet.update_cell(i+1,6,picr_ensembl_id)
        time.sleep(5)
        genes_sheet.update_cell(i+1,7,chok1gs_ensembl_id)
        time.sleep(5)
        genes_sheet.update_cell(i+1,8,mRNA_ncbi_id)
        time.sleep(5)
        genes_sheet.update_cell(i+1,9,protein_ncbi_id)
        time.sleep(5)
        genes_sheet.update_cell(i+1,10,go_terms)

# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for gene in gene_list:
    df = pd.DataFrame(genes_sheet.get_all_records())
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue

1229
Google API quota exceeded
Google API quota exceeded
1230
Google API quota exceeded
1231
1232
1233
Google API quota exceeded
Google API quota exceeded
1234
Google API quota exceeded
Google API quota exceeded
Google API quota exceeded
Google API quota exceeded
1235
1236
Google API quota exceeded
1237
1238
Google API quota exceeded
1239
Google API quota exceeded
1240
1241
Google API quota exceeded
Google API quota exceeded
Google API quota exceeded
Google API quota exceeded
1242
1243
1244
Google API quota exceeded
Google API quota exceeded
Google API quota exceeded
1245
Google API quota exceeded
1246
1247
Google API quota exceeded
1248
1249
Google API quota exceeded
1250
Google API quota exceeded
1251
Google API quota exceeded
Google API quota exceeded
1252
Google API quota exceeded
1253
1254
Google API quota exceeded
1255
1256
1257
Google API quota exceeded
1258
Google API quota exceeded
1259
Google API quota exceeded
1260
1261
1262
1263
1264
Google API quota exceeded
1265
1266
1267

### 3 GPR Rules Final

In [None]:
# Read the google excel spreadsheet
sa = gspread.service_account(filename='credentials.json')
cho_temporary = sa.open('temporary')
rxns_sheet = cho_temporary.worksheet('Rxns')
rxns_sheet_dataframe = pd.DataFrame(rxns_sheet.get_all_records())
rxns_sheet_dataframe = rxns_sheet_dataframe.set_index('Index')

# Create dataframes
Final_Genes = rxns_sheet_dataframe.iloc[:,9]
Final_Than_Genes = rxns_sheet_dataframe.iloc[:,10]

In [None]:
# Check for empty cells and replace
counter = 0 
for g in Final_Genes:
    if g =="":
        Final_Genes[counter] = Final_Than_Genes[counter]
    counter += 1
    
# Write the lists to a txt file
with open('OutputGPR_Final.txt', 'w') as f:
    # write each item in the list to the file
    for item in Final_Genes:
        f.write(str(item) + '\n')