# Genes
The following notebook retrieves all the genes involved in the reactions from the **"Rxns" Sheet** of the iCHO3K Excel workbook. Then, information regarding the genes is retrieved from different databases, and a **"Genes" Sheet** is generated and updated.

In [None]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

import sys, os
sys.path.append(os.path.abspath(".."))
from Utils.utils import get_gene_info

### 1. Generate "rxns" and "genes" datasets
The "rxns" dataset contains all the reactions with their GPR associations. The "genes" dataset contains all the information of the genes/GPR involved in our reconstruction.

In [None]:
##### ----- Load the iCHO3K reaction and gene tables ----- #####

# Location of the iCHO3K Excel workbook, relative to this notebook
FILE_PATH = '../../iCHO3K/Dataset/iCHO3K.xlsx'

# Worksheet names inside the workbook
sheet_rxns = 'Rxns'
sheet_genes = 'Genes'

# One DataFrame per worksheet, read in the order (reactions, genes)
rxns, genes = (
    pd.read_excel(FILE_PATH, sheet_name=worksheet)
    for worksheet in (sheet_rxns, sheet_genes)
)

In [None]:
# Entrez IDs as stored in the Genes sheet, plus an int-coerced version for set comparisons
gene_sheet_list = [entry for entry in genes['Gene Entrez ID']]
gene_sheet_list_int = [int(entry) for entry in gene_sheet_list]

### 2. Create a "gene_list" with all the genes involved in our reconstruction
We extract the gene IDs from the GPR annotations in the **"rxns" df**. This information is only retrieved from the **GPR_final** column.

In [None]:
# Collect every gene ID referenced by the GPR rules of the reactions table.
# Tokens prefixed with 'h'/'H' go to human_gene_list (prefix stripped); bare numbers go to gene_list.
import re

gene_list = []
human_gene_list = []
for _, rxn in rxns.iterrows():
    if rxn['GPR_final'] == '':
        continue  # nothing to parse for an empty rule
    for token in re.findall(r'\b[hH]?\d+\b', str(rxn['GPR_final'])):
        has_human_prefix = token.lower().startswith('h')  # lower-casing catches both 'h' and 'H'
        if has_human_prefix:
            human_gene_list.append(int(token[1:]))
        else:
            gene_list.append(int(token))

# Keep each non-prefixed gene ID only once and show how many there are
gene_list = list(set(gene_list))
len(gene_list)

In [None]:
# set1: IDs from the Genes sheet; set2: IDs parsed from the reactions' GPR rules
set1 = set(gene_sheet_list_int)
set2 = set(gene_list)

# IDs that appear on one side only
unique_to_list1 = set1.difference(set2)
unique_to_list2 = set2.difference(set1)

# Everything that is not shared by both sides
unique_elements = set1.symmetric_difference(set2)

# Report the mismatches
print(f"Elements unique to list1: {unique_to_list1}")
print(f"Elements unique to list2: {unique_to_list2}")

### 3. Replace remaining Human Gene IDs in our dataset with CHO Gene IDs
Using the **cho2human_mapping** file, we replace the Human gene IDs with those of CHO. These IDs were not spotted when we first added the Recon3D reactions.

In [None]:
# Load the ortholog table and index it as {human gene ID: CHO gene ID}
orthologs = pd.read_csv('../../Data/Orthologs/cho2human_mapping.tsv', sep='\t')
human_to_cho_dict = {
    human_id: cho_id
    for human_id, cho_id in zip(orthologs['HUMAN_ID'], orthologs['CHO_ID'])
}

def replace_human_ids_with_cho_ids(s, mapping=None):
    """Swap 'h'/'H'-prefixed human gene IDs in a GPR string for their CHO gene IDs.

    Only whole tokens (an 'h' or 'H' followed by digits, delimited by word
    boundaries) are replaced, so a short ID such as ``h12`` can never corrupt
    a longer one such as ``h123``.

    Parameters
    ----------
    s : object
        The GPR rule. Anything that is not a ``str`` is returned unchanged.
    mapping : dict, optional
        Lookup of human gene ID (int) -> CHO gene ID. Defaults to the
        notebook-level ``human_to_cho_dict``.

    Returns
    -------
    tuple
        ``(new_s, replaced_dict)`` where ``replaced_dict`` maps each replaced
        token (prefix included, e.g. ``'h123'``) to the CHO ID that replaced it.
        Tokens without an entry in ``mapping`` are left untouched.
    """
    if mapping is None:
        mapping = human_to_cho_dict

    replaced_dict = {}
    if not isinstance(s, str):
        return s, replaced_dict

    def _swap(match):
        token = match.group(0)
        human_id = int(token[1:])  # drop the 'h'/'H' prefix
        if human_id not in mapping:
            return token  # no known ortholog: keep the original token
        cho_id = mapping[human_id]
        replaced_dict[token] = cho_id
        return str(cho_id)

    # A regex substitution replaces exact tokens only; a plain str.replace would
    # also rewrite the token wherever it occurs inside a longer ID.
    return re.sub(r'\b[hH]\d+\b', _swap, s), replaced_dict

# Accumulates every substitution made across all reactions: {original token: CHO gene ID}
replacements = {}

# Rewrite the GPR rule of each reaction in place and record what was swapped.
# NOTE(review): gene_list was collected before this substitution, so CHO IDs
# introduced here are not part of it - confirm whether it should be rebuilt.
for row_pos in range(len(rxns)):
    updated_gpr, swapped = replace_human_ids_with_cho_ids(rxns.at[row_pos, 'GPR_final'])
    rxns.at[row_pos, 'GPR_final'] = updated_gpr
    replacements.update(swapped)

# Display the full substitution record
replacements

### 3. Add the genes from the "gene_list" to the genes df
Using a list of all the genes included in the dataset, we can retrieve information from the NIH database regarding Gene Symbol, Gene Name, Gene Ensembl ID, mRNA ID, and protein ID using the function `get_gene_info()`.

In [None]:
# Need to provide a valid email address for this code to run
email = ""

import time
from urllib.error import HTTPError
from Bio.Entrez.Parser import ValidationError

max_retries = 5  # Maximum number of retries per gene after an HTTP error
c = 0  # Number of genes appended to the Genes table in this cell

# Entrez IDs already present in the Genes table, normalized to int and built once.
# Comparing str(g) against the raw column never matches when pandas reads the
# IDs as numbers, which makes every gene get fetched and appended again.
known_gene_ids = set(
    pd.to_numeric(genes['Gene Entrez ID'], errors='coerce').dropna().astype(int).tolist()
)

for g in tqdm(gene_list):
    if g in known_gene_ids:
        continue  # already documented in the Genes table

    retries = 0
    while True:
        try:
            print(g)
            (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
             chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g, email)
        except ValidationError:
            # The lookup could not be validated: report and move on to the next gene
            print(f'Gene {g} not found')
            break
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            print(f'HTTP Error, retrying with gene {g}')
            time.sleep(1)  # brief pause before asking again
            continue

        if organism == 'Cricetulus griseus':
            new_row_data = {'Gene Entrez ID': g, 'Gene Symbol': gene_symbol, 'Gene Name': gene_name, 'Gene Description': gene_description,
                            'PICR Ensembl ID': picr_ensembl_id, 'CHOK1GS Ensembl ID': chok1gs_ensembl_id, 'Transcript ID': mRNA_ncbi_id,
                            'Protein ID': protein_ncbi_id, 'GO Terms': go_terms}
            # Appended immediately so progress survives an interrupted run
            new_row_df = pd.DataFrame(new_row_data, index=[len(genes)])
            genes = pd.concat([genes, new_row_df])
            known_gene_ids.add(g)
            c += 1
        else:
            # Only Chinese hamster genes are added; anything else is just reported
            print(f'Gene {g,gene_symbol} is a {organism} Gene')
        break

print(f'A total of {c} Genes were added to the dataset')

### 4. Eliminate unwanted genes
We iterate over the entire dataset to spot **Human genes** in order to eliminate them

In [None]:
# Rows are dropped from this separate copy, so the frame being iterated stays intact
genes_copy = genes.copy()
erased = 0  # counter of rows removed from genes_copy

In [None]:
# Look up every gene of the Genes table and drop the ones reported as human from genes_copy
for index, row in tqdm(genes.iterrows(), total=genes.shape[0]):
    g = row['Gene Entrez ID']
    retries = 0
    while True:
        try:
            (organism, gene_symbol, gene_name, gene_description, picr_ensembl_id,
             chok1gs_ensembl_id, mRNA_ncbi_id, protein_ncbi_id, go_terms) = get_gene_info(g, email)
        except ValidationError:
            # Lookup could not be validated: report and go to the next gene
            print(f'Gene {g} not found')
            break
        except HTTPError:
            retries += 1
            if retries > max_retries:
                print(f'HTTP Error, reached maximum retries ({max_retries}) for gene {g}')
                break
            print(f'HTTP Error, retrying with gene {g}')
            time.sleep(1)
            continue

        if organism == 'Homo sapiens':
            genes_copy = genes_copy.drop(index)
            print(f'Gene {g,gene_symbol} erased from dataset')
            erased += 1
        break  # lookup succeeded, no retry needed

# Summary of how many human genes were removed from genes_copy
print(f'A total of {erased} Human Genes were identifed from the original dataset')

### 5. Eliminate genes that are not in the model
Next, we make sure that the genes in the **GPR_final** column of the Reactions Sheet match those of the **Gene Entrez ID** column of the Genes Sheet. If there are genes in the Genes Sheet that are not in the **GPR_final** column of the Reactions Sheet, we eliminate them.

In [None]:
# Drop from genes_copy every gene whose Entrez ID is not referenced in gene_list
for row_label, gene_row in genes_copy.iterrows():
    entrez_id = int(gene_row['Gene Entrez ID'])
    if entrez_id in gene_list:
        continue  # still used by the reactions, keep it
    genes_copy = genes_copy.drop(row_label)
    erased += 1
    print(entrez_id)


# Running total of removed rows (the counter is shared with the human-gene step)
print(f'A total of {erased} Genes were erased from the original dataset')

In [None]:
# Adopt the filtered copy as the working Genes table from here on
genes = genes_copy

In [None]:
### Cross-check the Genes table against the Rxns table: both should hold the same gene IDs
genes_sheet_list = {int(item) for item in genes['Gene Entrez ID']}
gene_list = set(gene_list)

# IDs referenced by the reactions but absent from the Genes table
missing_in_genes_sheet = gene_list.difference(genes_sheet_list)
print("Missing in Genes Sheet:", missing_in_genes_sheet)

# IDs present in the Genes table but not referenced by the reactions
missing_in_rxns_sheet = genes_sheet_list.difference(gene_list)
print("Missing in Rxns Sheet:", missing_in_rxns_sheet)

In [None]:
def find_duplicates(lst):
    """Return the set of items that occur more than once in ``lst``.

    Items must be hashable. An item repeated several times is reported once.
    """
    first_sightings = set()
    repeated = set()
    for element in lst:
        if element not in first_sightings:
            first_sightings.add(element)
            continue
        # Already seen at least once before: it is a duplicate
        repeated.add(element)
    return repeated

# Report any Entrez ID that appears more than once in the Genes table
my_list = [entry for entry in genes['Gene Entrez ID']]
duplicated_ids = find_duplicates(my_list)
print(duplicated_ids)  # Shows the duplicated items