# Genes
This notebook retrieves all the genes involved in the reactions listed in the **"Rxns" Sheet** of the Google Sheet file. Information about each gene is then retrieved from different databases, and the **"Genes" Sheet** is generated and updated.

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

from google_sheet import GoogleSheet
from utils import get_gene_info

### 1. Generate "rxns" and "genes" datasets
The "rxns" dataset contains all the reactions with their GPR associations. The "genes" dataset contains all the information of the genes/GPR involved in our reconstruction.

In [2]:
# Worksheet names used throughout the notebook
sheet_rxns = 'Rxns'
sheet_genes = 'Genes'

# Connection settings: credentials file and the ID of the target spreadsheet
KEY_FILE_PATH = 'credentials.json'
SPREADSHEET_ID = '1MlBXeHIKw8k8fZyXm-sN__AHTRSunJxar_-bqvukZws'

# Open the spreadsheet through the project's GoogleSheet helper
sheet = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Load both worksheets: reactions first, then the existing genes table
rxns = sheet.read_google_sheet(sheet_rxns)
genes = sheet.read_google_sheet(sheet_genes)

### 2. Create a "gene_list" with all the genes involved in our reconstruction
We extract the gene IDs from the GPR annotations in the **"rxns" df**. This information is only retrieved from the **GPR_final** column.

In [3]:
# Build gene_list (CHO Entrez IDs) and human_gene_list (IDs written with an
# 'h'/'H' prefix) from the GPR_final column of the "Rxns" dataset
import re

gene_list = []
human_gene_list = []
for gpr_rule in rxns['GPR_final']:
    if gpr_rule == '':
        continue  # reaction without a GPR annotation
    # A token is a run of digits, optionally prefixed with 'h' or 'H'
    for token in re.findall(r'\b[hH]?\d+\b', str(gpr_rule)):
        if token[0] in 'hH':
            # Strip the prefix and keep the numeric ID
            human_gene_list.append(int(token[1:]))
        else:
            gene_list.append(int(token))

# Drop duplicates; the resulting order is whatever the set yields
gene_list = list(set(gene_list))
len(gene_list)

3045

In [4]:
# Load the CHO <-> human ortholog table and index it by human gene ID,
# so that each human ID maps to its CHO gene ID
orthologs = pd.read_csv('../Data/Orthologs/cho2human_mapping.tsv', sep='\t')
human_to_cho_dict = {
    human_id: cho_id
    for human_id, cho_id in zip(orthologs['HUMAN_ID'], orthologs['CHO_ID'])
}

def replace_human_ids_with_cho_ids(s, mapping=None):
    """Replace 'h'-prefixed human gene IDs in a GPR string with CHO gene IDs.

    Only whole tokens are replaced: substituting ``h12`` never touches
    ``h123``. A plain ``str.replace`` on the token text would corrupt any
    longer ID that merely starts with the same characters.

    Parameters
    ----------
    s : object
        GPR rule. Anything that is not a ``str`` is returned unchanged.
    mapping : dict, optional
        Human gene ID (int) -> CHO gene ID. Defaults to the module-level
        ``human_to_cho_dict``.

    Returns
    -------
    tuple
        ``(new_s, replaced)`` where ``replaced`` maps each original token
        (prefix included, e.g. ``'h12'``) to the CHO ID that replaced it.
        Human IDs without a known ortholog are left as they are.
    """
    replaced_dict = {}
    if not isinstance(s, str):
        return s, replaced_dict
    if mapping is None:
        mapping = human_to_cho_dict

    def _swap(match):
        token = match.group(0)
        human_id = int(token[1:])  # drop the 'h'/'H' prefix
        if human_id not in mapping:
            return token  # no ortholog known: keep the human ID
        cho_id = mapping[human_id]
        replaced_dict[token] = cho_id
        return str(cho_id)

    # \b on both sides anchors the match to a complete token
    return re.sub(r'\b[hH]\d+\b', _swap, s), replaced_dict

# Accumulates every substitution made across all reactions:
# original token (e.g. 'h1234') -> CHO gene ID that replaced it
replacements = {}

# Iterate over the actual index labels rather than range(len(rxns)):
# .at is label-based, so positional integers are only correct when rxns
# carries a default 0..n-1 index
for i in rxns.index:
    rxns.at[i, 'GPR_final'], replacements_dict = replace_human_ids_with_cho_ids(rxns.at[i, 'GPR_final'])
    replacements.update(replacements_dict)

# Show the substitutions (an empty dict means nothing was changed)
replacements

{}

In [5]:
# ------------------------------------------------------------------
# Update the "Rxns" sheet
# ------------------------------------------------------------------
# Write back only when at least one human gene ID was actually replaced
if not replacements:
    print("No changes to the original Rxns Sheet")
else:
    sheet.update_google_sheet(sheet_rxns, rxns)
    print("Google Sheet updated.")

No changes to the original Rxns Sheet


### 3. Add the genes from the "gene_list" to the genes df
Using the list of all the genes in the reconstruction, we retrieve each gene's symbol, name, description, Ensembl IDs, mRNA ID, protein ID and GO terms from the NIH (NCBI) database with the function `get_gene_info()`. Genes that are already present in the **"Genes" Sheet** are skipped.

In [15]:
import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Maximum number of retries per gene after an HTTP error

# Entrez IDs already in the table, normalised to strings. gene_list holds ints,
# and rows added below also store ints, so comparing string forms keeps this
# cell from fetching (and duplicating) the same gene when it is re-run.
known_ids = set(genes['Gene Entrez ID'].astype(str))

# New rows are collected here and appended in one go instead of growing the
# DataFrame with one concat per gene
new_rows = []

try:
    for g in tqdm(gene_list):
        if str(g) in known_ids:
            continue
        retries = 0
        while True:
            try:
                print(g)
                (organism, gene_symbol, gene_name, gene_description,
                 picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id,
                 protein_ncbi_id, go_terms) = get_gene_info(g)
            except ValidationError:
                # Unknown gene: skip it
                print(f'Gene {g} not found')
                break
            except HTTPError:
                retries += 1
                if retries > max_retries:
                    print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                    break
                print(f'HTTP Error, retrying with gene {g}')
                time.sleep(1)
                continue

            if organism == 'Cricetulus griseus':
                new_rows.append({
                    'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name,
                    'Gene Description': gene_description, 'PICR Ensembl ID': picr_ensembl_id,
                    'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                    'Protein ID': protein_ncbi_id, 'GO Terms': go_terms,
                })
                known_ids.add(str(g))
            else:
                # Only CHO genes are added to the table
                print(f'Gene {g,gene_symbol} is a {organism} Gene')
            break
finally:
    # Runs even if the loop is interrupted, so genes fetched so far are kept
    if new_rows:
        new_rows_df = pd.DataFrame(new_rows, index=range(len(genes), len(genes) + len(new_rows)))
        genes = pd.concat([genes, new_rows_df])

  0%|          | 0/3045 [00:00<?, ?it/s]

100769795
100761603
100770004
100770100
100761930
100753739
Gene 100753739 does not have PICR Ensembl ID
100770138
100753824
100770274
100753962
103162531
Gene 103162531 does not have PICR Ensembl ID
100754136
100762374
100754196
100754357
100762553
100754451
100770876
100770922
Gene 100770922 does not have PICR Ensembl ID
100771070
100754734
100771270
103163344
Gene 103163344 does not have PICR Ensembl ID
100763215
100763252
100771461
100771582
100763403
103163675
Gene 103163675 does not have PICR Ensembl ID
100763430
100763474
100755288
100771780
100763595
100755641
Gene 100755641 does not have PICR Ensembl ID
100755739
100755837
100755863
100764057
Gene 100764057 does not have PICR Ensembl ID
100764142
Gene 100764142 does not have PICR Ensembl ID
Gene 100764142 does not have mRNA id and protein id
100772358
100755998
100764216
100764593
100764809
Gene 100764809 does not have PICR Ensembl ID
100773010
Gene 100773010 does not have PICR Ensembl ID
100764848
100764916
100765169
10077341

In [16]:
# Inspect the current contents of the genes DataFrame
genes

Unnamed: 0,Gene Entrez ID,Gene Symbol,Gene Name,Gene Description,PICR Ensembl ID,CHOK1GS Ensembl ID,Transcript ID,Protein ID,GO Terms,Uniprot ID
0,100774298,Slco1b3,solute carrier organic anion transporter famil...,This gene encodes a liver-specific member of t...,ENSCGRG00015012813,ENSCGRG00001021568,XM_035456357,XP_035312248,"['GO:0055085', 'GO:0022857', 'GO:0005515', 'GO...",
1,100754260,Hsd3b7,"hydroxy-delta-5-steroid dehydrogenase, 3 beta-...",This gene encodes an enzyme which is involved ...,ENSCGRG00015024104,ENSCGRG00001010375,XM_003510980,XP_003511028,"['GO:0016020', 'GO:0016491', 'GO:0016616', 'GO...",G3I7C5
2,100769024,LOC100769024,aromatase,This gene encodes a member of the cytochrome P...,ENSCGRG00015026385,ENSCGRG00001015094,XM_003508783,XP_003508831,"['GO:0016020', 'GO:0046872', 'GO:0016491', 'GO...",A0A061IC63
3,100756356,Atp2a3,ATPase sarcoplasmic/endoplasmic reticulum Ca2+...,This gene encodes one of the SERCA Ca(2+)-ATPa...,ENSCGRG00015000763,ENSCGRG00001017225,XM_035458322,XP_035314213,"['GO:0005783', 'GO:0016529', 'GO:0044325', 'GO...",
4,100774660,Pgls,6-phosphogluconolactonase,Enables 6-phosphogluconolactonase activity. In...,ENSCGRG00015004513,ENSCGRG00001024276,XM_027428449,XP_027284250,"['GO:0016787', 'GO:0005975', 'GO:0017057', 'GO...",
...,...,...,...,...,...,...,...,...,...,...
3177,100769454,LOC100769454,phosphatidylinositol N-acetylglucosaminyltrans...,,,,XM_003515280,XP_003515328,,
3178,100761300,LOC100761300,histamine N-methyltransferase,,,,XM_003509092,XP_003509140,,
3179,100761353,Ptprg,protein tyrosine phosphatase receptor type G,,ENSCGRG00015014841,,XM_035454322,XP_035310213,,
3180,100753164,LOC100753164,bile salt sulfotransferase 1-like,,ENSCGRG00015042115,,XM_035437148,XP_035293039,,


### 4. Eliminate unwanted genes
We iterate over the entire dataset to spot **Human genes** in order to eliminate them

In [None]:
# Rows are dropped from a copy, so the frame being iterated is never modified
genes_copy = genes.copy()

for index, row in tqdm(genes.iterrows(), total=genes.shape[0]):
    g = row['Gene Entrez ID']
    retries = 0
    while True:
        try:
            (organism, gene_symbol, gene_name, gene_description,
             picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id,
             protein_ncbi_id, go_terms) = get_gene_info(g)
        except ValidationError:
            # Unknown gene: leave the row in place and move on
            print(f'Gene {g} not found')
            break
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            print(f'HTTP Error, retrying with gene {g}')
            time.sleep(1)
        else:
            # Successful lookup: remove the row only when the gene is human
            if organism == 'Homo sapiens':
                genes_copy = genes_copy.drop(index)
                print(f'Gene {g,gene_symbol} erased from dataset')
            break

# Swap in the filtered table
genes = genes_copy


In [None]:
# Push the genes table to the "Genes" worksheet through the GoogleSheet helper
sheet.update_google_sheet(sheet_genes, genes)
print("Google Sheet updated.")

In [None]:
# Add every gene from gene_list that is not yet in the genes table,
# without filtering by organism.

# gene_list holds ints (built with int(...)), while IDs read from the sheet
# may be strings, so membership is tested on the string form of both.
known_ids = set(genes['Gene Entrez ID'].astype(str))
new_rows = []

try:
    for g in tqdm(gene_list):
        if str(g) in known_ids:
            continue
        try:
            # get_gene_info returns nine values, the organism first; it is
            # unpacked here so the call does not fail with a ValueError
            (organism, gene_symbol, gene_name, gene_description,
             picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id,
             protein_ncbi_id, go_terms) = get_gene_info(g)
        except KeyError:
            print(f'Gene {g} not found')
            continue
        new_rows.append({
            'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name,
            'Gene Description': gene_description, 'PICR Ensembl ID': picr_ensembl_id,
            'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
            'Protein ID': protein_ncbi_id, 'GO Terms': go_terms,
        })
        known_ids.add(str(g))
finally:
    # Append once, and keep already-fetched rows if the loop is interrupted
    if new_rows:
        new_rows_df = pd.DataFrame(new_rows, index=range(len(genes), len(genes) + len(new_rows)))
        genes = pd.concat([genes, new_rows_df])

In [None]:
# Fetch information from the NIH database and complete null or blank
# information in the already generated "Genes Sheet" dataset.
#
# Values are written with genes.at[...]: the row objects yielded by
# iterrows() are copies, so assigning to them never changes the DataFrame.

# Columns filled from get_gene_info, in the order the function returns them
# (after the leading organism value)
info_columns = ['Gene Symbol', 'Gene Name', 'Gene Description', 'PICR Ensembl ID',
                'CHOK1GS Ensembl ID', 'Transcript ID', 'Protein ID', 'GO Terms']
# A row with any of these blank is re-fetched
required_columns = ['Gene Symbol', 'Gene Name', 'PICR Ensembl ID', 'Transcript ID', 'Protein ID']

# Genes from gene_list that are not in the table yet. gene_list holds ints,
# so both sides are compared as strings.
known_ids = {str(x) for x in genes['Gene Entrez ID']}
missing_genes = iter([g for g in gene_list if str(g) not in known_ids])

# Materialise the rows first so the frame is not mutated while being iterated
for i, row in list(genes.iterrows()):
    if row['Gene Entrez ID'] == '':
        # Blank row: use it for the next gene that is still missing
        g = next(missing_genes, None)
        if g is None:
            continue  # nothing left to add
    elif any(row[col] == '' for col in required_columns):
        # Incomplete row: look up this row's own gene
        g = row['Gene Entrez ID']
    else:
        continue

    # get_gene_info returns nine values; the organism is not stored here
    _organism, *info = get_gene_info(g)
    genes.at[i, 'Gene Entrez ID'] = g
    for col, value in zip(info_columns, info):
        genes.at[i, col] = value
# Disabled legacy code, kept inside a string literal so that it never runs.
# It added missing genes by writing each field to the sheet one cell at a time,
# sleeping between writes. It refers to names (df, gene, genes_sheet) that are
# not defined in the cells shown in this notebook.
'''
# Add genes from the gene_list that are not yet in the "Genes Sheet" dataset
for g in gene_list:
    # the first try/except is to avoid overwritting data in case there already some info in the dataset
    try:
        gene_sheet_list = [str(x) for x in df['Gene Entrez ID']]
        id = max(df['Index']) + 2
    except:
        gene_sheet_list = []
        id = 2
    if gene not in gene_sheet_list:
        try:
            gene_symbol, gene_name, gene_description, picr_ensembl_id, chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms = get_gene_info(gene)
            print(id-1)
            genes_sheet.update_cell(id,1,id-1)
            time.sleep(5)
            genes_sheet.update_cell(id,2,gene)
            time.sleep(5)
            genes_sheet.update_cell(id,3,gene_symbol)
            time.sleep(5)
            genes_sheet.update_cell(id,4,gene_name)
            time.sleep(5)
            genes_sheet.update_cell(id,5,gene_description)
            time.sleep(5)
            genes_sheet.update_cell(id,6,picr_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,7,chok1gs_ensembl_id)
            time.sleep(5)
            genes_sheet.update_cell(id,8,mRNA_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,9,protein_ncbi_id)
            time.sleep(5)
            genes_sheet.update_cell(id,10,go_terms)
        except:
            print('Google API quota exceeded')
            time.sleep(5)
            continue
            
'''

In [None]:
# Inspect the current contents of the genes DataFrame
genes

In [None]:
# Spot check: look up a single Entrez gene ID and display what get_gene_info returns
get_gene_info('100750772')

In [None]:
from Bio import Entrez
Entrez.email = 'account1@theta-ocean-377718.iam.gserviceaccount.com'

# Scratch check of how a gene description is pulled out of a raw Entrez record.
# The record is requested once and the handle is always closed afterwards.
handle = Entrez.efetch(db='gene', id='3979190', retmode='xml')
try:
    record = Entrez.read(handle)[0]
finally:
    handle.close()

#gene_name = record['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
#gene_symbol = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

# Same database and same ID as above, so the record is reused instead of
# issuing a second, identical request
human_record = record

# Prefer the text of the first comment entry, then fall back to the summary.
# An empty comment list is handled explicitly so that [0] cannot raise.
comments = human_record.get('Entrezgene_comments') or []
if comments and 'Gene-commentary_comment' in comments[0]:
    human_gene_description = comments[0]['Gene-commentary_comment'][0]['String']
elif 'Entrezgene_summary' in human_record:
    human_gene_description = human_record['Entrezgene_summary']
else:
    human_gene_description = ''