# Whole Cell Network Reconstruction in CHO Cells
The following notebook retrieves and updates information in the "Whole Cell Network Reconstruction for CHO Cells" Google Sheet file.

### 1. Access and retrieve information from the Google Sheets file through the Google Sheets API
Using the gspread library, we can access the Google Sheets file and load it into a pandas DataFrame to visualize it.

In [1]:
import gspread
import pandas as pd
import numpy as np

In [2]:
# Authenticate with Google using the service-account key stored in credentials.json
sa = gspread.service_account(filename='credentials.json')

# sa is a gspread client; its open() method connects to a spreadsheet
# by title. The spreadsheet is opened here under the title 'CHO Network Reconstruction'.
cho_recon = sa.open('CHO Network Reconstruction')

# A spreadsheet holds several worksheets (tabs), so the one to read has to be
# selected by name before any data can be fetched. Here it is the 'Rxns' worksheet.
rxns_sheet = cho_recon.worksheet('Rxns')

In [None]:
# Print every worksheet (tab) contained in the spreadsheet
for worksheet in cho_recon:
    print(worksheet)

In [None]:
# Fetch every record of the Rxns worksheet with get_all_records and
# load the result into a pandas DataFrame for inspection
rxns_records = rxns_sheet.get_all_records()
df = pd.DataFrame(rxns_records)
df

### 2. Add information to the "Genes" sheet

Using a list of all the genes included in the dataset, we can retrieve the Gene Symbol, Gene Name, Gene Ensembl ID, mRNA ID, and protein ID of each gene from the NIH database.

In [5]:
# Generation of gene_list from all the genes in the "Whole Cell Network Reconstruction in CHO Cells" dataset
import re

# Gene IDs are taken to be every run of digits found inside a GPR rule.
GENE_ID_PATTERN = re.compile(r'\d+')

# Collect each distinct ID once. The column is read directly instead of through
# iterrows(), which builds a Series per row and may change the value types.
# str() guards against cells whose value is not text.
gene_ids = set()
for gpr in df['GPR_final']:
    if gpr != '':
        gene_ids.update(GENE_ID_PATTERN.findall(str(gpr)))

# A plain list(set(...)) has an arbitrary order that can change between kernel
# sessions; sorting numerically makes the order in which genes are later
# processed reproducible.
gene_list = sorted(gene_ids, key=int)

In [None]:
# Fetch information from the NIH database
import time

from gspread.cell import Cell
from gspread.exceptions import APIError
from utils import get_gene_info

# Seconds to pause after each write request so the Google Sheets API quota is respected.
WRITE_PAUSE_SECONDS = 5
# A row that already has an Entrez ID is re-fetched when any of these columns is blank.
REQUIRED_COLUMNS = ('Gene Symbol', 'Gene Name', 'PICR Ensembl ID', 'Transcript ID', 'Protein ID')
# Sheet columns 1 and 2 hold the index and the Entrez ID; the fetched fields start at column 3.
INFO_FIRST_COL = 3


def _gene_info_values(gene):
    """Return the eight fields of get_gene_info(gene) as a list, in sheet-column order."""
    (gene_symbol, gene_name, gene_description, picr_ensembl_id,
     chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(gene)
    return [gene_symbol, gene_name, gene_description, picr_ensembl_id,
            chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms]


def _write_row(sheet, row_number, first_col, values):
    """Write `values` into consecutive cells of one row with a single API request.

    Args:
        sheet: gspread worksheet to write to.
        row_number: 1-based sheet row.
        first_col: 1-based column that receives values[0].
        values: values for first_col, first_col + 1, ...
    """
    cells = [Cell(row_number, first_col + offset, value) for offset, value in enumerate(values)]
    # update_cell() enters values as USER_ENTERED; keep the same input mode for the batch.
    sheet.update_cells(cells, value_input_option='USER_ENTERED')
    time.sleep(WRITE_PAUSE_SECONDS)


def _fill_gene_row(sheet, row_number, gene, index=None):
    """Fetch the info for `gene` and write it to `row_number`; return True on success.

    When `index` is given the whole row is written (index, Entrez ID, info fields);
    otherwise only the info fields (columns 3-10) are refreshed. Failures are
    reported and skipped so that one bad gene does not stop the whole run.
    """
    try:
        info = _gene_info_values(gene)
    except Exception as err:  # the failure modes of get_gene_info are not visible from here
        print(f'Could not fetch info for gene {gene}: {err}')
        return False

    if index is None:
        first_col, values = INFO_FIRST_COL, info
    else:
        first_col, values = 1, [index, gene] + info

    try:
        _write_row(sheet, row_number, first_col, values)
    except APIError as err:
        # Typically a quota error; back off before moving on to the next gene.
        print(f'Google API error while writing gene {gene}: {err}')
        time.sleep(WRITE_PAUSE_SECONDS)
        return False
    return True


# Open the Genes excel Sheet
cho_temporary = sa.open('CHO Network Reconstruction')
genes_sheet = cho_temporary.worksheet('Genes')
df = pd.DataFrame(genes_sheet.get_all_records())

if df.empty:
    # No data rows yet: start right below the header row.
    known_ids = set()
    next_row = 2
else:
    df = df.set_index('Index')
    # Entrez IDs already present in the sheet, as strings so they compare with gene_list.
    known_ids = {str(x) for x in df['Gene Entrez ID'] if x != ''}
    # Index i is written to sheet row i + 1 (row 1 is the header), so the first
    # free row is max(Index) + 2. An unusable Index raises here instead of
    # silently falling back to row 2 and overwriting existing data.
    next_row = int(max(df.index)) + 2

# Complete null or blank information in the already generated "Genes Sheet" dataset
missing_genes = iter([gene for gene in gene_list if gene not in known_ids])
for i, row in df.iterrows():
    entrez_id = row['Gene Entrez ID']
    if entrez_id == '':
        # Blank row: give it the next gene that is not in the sheet yet. Pulling
        # from an iterator guarantees two blank rows never receive the same gene.
        gene = next(missing_genes, None)
        if gene is None:
            continue
        print(i)
        if _fill_gene_row(genes_sheet, i + 1, gene, index=i):
            known_ids.add(gene)
    elif any(row[column] == '' for column in REQUIRED_COLUMNS):
        # Entrez ID is known but some descriptive field is blank: refresh the info columns.
        print(i)
        _fill_gene_row(genes_sheet, i + 1, entrez_id)

# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for gene in gene_list:
    if gene in known_ids:
        continue
    print(next_row - 1)
    # The row pointer only advances after a successful write, so a failed gene
    # leaves no gap and never causes a later gene to overwrite an earlier one.
    if _fill_gene_row(genes_sheet, next_row, gene, index=next_row - 1):
        known_ids.add(gene)
        next_row += 1

### 3. GPR Rules Final

In [None]:
# Connect to the 'temporary' spreadsheet and load its Rxns worksheet,
# using the 'Index' column as the DataFrame index
sa = gspread.service_account(filename='credentials.json')
cho_temporary = sa.open('temporary')
rxns_sheet = cho_temporary.worksheet('Rxns')
rxns_sheet_dataframe = pd.DataFrame(rxns_sheet.get_all_records()).set_index('Index')

# Select the two columns at positions 9 and 10 (counted after 'Index' became the index)
# NOTE(review): the columns are picked by position, not by name - confirm which headers these are
Final_Genes = rxns_sheet_dataframe.iloc[:, 9]
Final_Than_Genes = rxns_sheet_dataframe.iloc[:, 10]

In [None]:
# Check for empty cells and replace
# Wherever Final_Genes is blank, take the value of the same row in Final_Than_Genes.
# This is done with a row-aligned mask rather than a running counter: indexing a
# Series with a 0-based counter is a label lookup when the index holds integers,
# so it can address a different row than the one being inspected (or raise KeyError).
Final_Genes = Final_Genes.mask(Final_Genes == "", Final_Than_Genes)

# Write the lists to a txt file
with open('OutputGPR_Final.txt', 'w') as f:
    # one item per line
    f.writelines(str(item) + '\n' for item in Final_Genes)

### Curate GPRs

In [83]:
def balance_parentheses(string):
    """Check that every parenthesis in `string` is properly matched.

    Returns:
        The input string itself when the parentheses are balanced, otherwise
        False. Note that a balanced empty string is returned as '' and is
        therefore falsy.
    """
    open_count = 0  # number of '(' seen so far that are still unclosed
    for char in string:
        if char == "(":
            open_count += 1
        elif char == ")":
            # A ')' with nothing left to close means the string is unbalanced
            if open_count == 0:
                return False
            open_count -= 1
    # Any '(' left open also makes the string unbalanced
    return False if open_count > 0 else string

df = pd.DataFrame(rxns_sheet.get_all_records())

Final_Genes = []

# Only the first 300 GPR rules are curated
for raw_gpr in df['GPR_final'][0:300]:
    rule = str(raw_gpr)
    checked = balance_parentheses(rule)
    if checked:
        # Balanced: keep the rule as returned by the check
        Final_Genes.append(checked)
    else:
        # Unbalanced (or empty): drop every parenthesis from the rule
        Final_Genes.append(rule.replace('(', '').replace(')', ''))

# One curated rule per line
with open('CuratedGPRs.txt', 'w') as f:
    f.writelines(str(item) + '\n' for item in Final_Genes)