# Annotation

## Overview

This Jupyter notebook standardizes the gene names in a model by looking up each node name in the HUGO Gene Nomenclature Committee (HGNC) database. The search results must be reviewed manually before any names are changed.

## Usage

1. **Input format:** Network in SBML-qual format.
2. **Gene name mapping:** Using the returned table, manually review and correct any errors.
3. **Update names:** Update the names in the network.
4. **Save to file:** Network in SBML-qual format with standardized gene names.

In [1]:
# Base name of the model: used to build the input path "../Models/<model_name>.sbml"
# and the names of the CSV and updated SBML files written by the cells below.
model_name = "Ikonomi2020"

In [2]:
import ginsim
import biolqm
import maboss
import pypint
import pandas as pd
import numpy as np
from colomoto_jupyter import tabulate # for fixpoint table display
import matplotlib.pyplot as plt # for modifying plots
import seaborn as sns # for heatmap visualization
import requests # for access the HGNC API
import xml.etree.ElementTree as ET # for parse the SBML file

This notebook has been executed using the docker image `colomoto/colomoto-docker:2024-01-01`

In [3]:
def _query_hgnc(endpoint, term, timeout):
    """Call one HGNC REST endpoint and return the ``response`` part of its JSON reply.

    Returns ``None`` when the request cannot be completed, the HTTP status is
    not 200, or the body is not the expected JSON, so the caller can treat all
    of these cases as "no usable answer".
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # Percent-encode the term so characters such as spaces or '#' cannot
    # change the request path.
    url = f"https://rest.genenames.org/{endpoint}/{quote(term, safe='')}"
    try:
        reply = requests.get(
            url,
            headers={"Accept": "application/json"},
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
        if reply.status_code != 200:
            return None
        return reply.json()["response"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return None


def fetch_hgnc_data(gene_list, timeout=30):
    """Look up every name in ``gene_list`` in the HGNC REST service.

    For each name, only the part before the first ``/`` and the first ``_`` is
    searched. The lookup first tries an exact symbol fetch and, if that yields
    nothing, falls back to the general search endpoint, preferring a hit whose
    symbol or alias equals the searched term over the first hit.

    Args:
        gene_list: Iterable of node/gene names (strings).
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A ``pandas.DataFrame`` with one row per input name. The first columns
        are always ``original_gene``, ``searched_gene``, ``source`` and
        ``symbol_flag``; they are followed by whatever fields HGNC returned,
        plus ``status`` for names that were not found or whose request failed.
        An empty ``gene_list`` yields an empty frame with the four lead columns.
    """
    lead_columns = ['original_gene', 'searched_gene', 'source', 'symbol_flag']
    results = []

    for gene in gene_list:
        # Keep only the part before the first '/' and the first '_'
        searched_gene = gene.split('/')[0].split('_')[0]
        symbol_flag = 'contains_symbol' if searched_gene != gene else 'no_symbol'

        # 1) Exact symbol fetch
        symbol_data = _query_hgnc("fetch/symbol", searched_gene, timeout)
        symbol_docs = (symbol_data.get("docs") or []) if symbol_data else []
        if symbol_data and symbol_data.get("numFound", 0) > 0 and symbol_docs:
            doc = symbol_docs[0]
            doc['original_gene'] = gene
            doc['searched_gene'] = searched_gene
            doc['source'] = 'direct_symbol_fetch'
            doc['symbol_flag'] = symbol_flag
            results.append(doc)
            continue

        # 2) Fall back to the general search endpoint
        search_data = _query_hgnc("search", searched_gene, timeout)
        if search_data is None:
            results.append({
                "original_gene": gene,
                "searched_gene": searched_gene,
                "source": "api_error",
                "status": "Error in API request",
                "symbol_flag": symbol_flag,
            })
            continue

        search_docs = search_data.get("docs") or []
        if search_data.get("numFound", 0) > 0 and search_docs:
            # Prefer an exact match on symbol or alias; otherwise take the first hit
            exact_match = next(
                (
                    hit for hit in search_docs
                    if hit.get("symbol") == searched_gene
                    or searched_gene in hit.get("alias_symbol", [])
                ),
                None,
            )
            doc = exact_match or search_docs[0]
            doc['original_gene'] = gene
            doc['searched_gene'] = searched_gene
            doc['source'] = 'general_search'
            doc['symbol_flag'] = symbol_flag
            results.append(doc)
        else:
            results.append({
                "original_gene": gene,
                "searched_gene": searched_gene,
                "source": "general_search",
                "status": "Not found",
                "symbol_flag": symbol_flag,
            })

    # Put the bookkeeping columns first. reindex (rather than df[...]) also
    # works when there are no rows, where the frame has no columns at all.
    df = pd.DataFrame(results)
    other_columns = [col for col in df.columns if col not in lead_columns]
    return df.reindex(columns=lead_columns + other_columns)

In [5]:
def getnodes(model_name):
    """Return the IDs of all qualitative species in ``../Models/<model_name>.sbml``.

    The IDs are read from the ``id`` attribute in the SBML-qual (Level 3
    Version 1) namespace, in document order. Species without a non-empty ID
    are skipped. The resulting list is also printed.
    """
    qual_ns = 'http://www.sbml.org/sbml/level3/version1/qual/version1'
    species_tag = '{' + qual_ns + '}qualitativeSpecies'
    id_attribute = '{' + qual_ns + '}id'

    # Parse the SBML file and collect the ID attribute of every species element
    root = ET.parse("../Models/" + model_name + ".sbml").getroot()
    found_ids = [node.attrib.get(id_attribute) for node in root.findall('.//' + species_tag)]

    # Drop missing or empty IDs
    qual_species_ids = [sid for sid in found_ids if sid]

    print(qual_species_ids)
    return qual_species_ids

['External_quiescence', 'External_cycling', 'PI3K', 'TSC1_2', 'mTORC1', 'FOXO3A', 'ATM', 'ROS', 'Mitochondria', 'Autophagy', 'RAS', 'ETS', 'MEF', 'GSK3b', 'CTNNB1', 'cMYC', 'BMI1', 'MDM2', 'TP53', 'CDKN1C', 'CDKN1A', 'CDKN1B', 'GFI1', 'RB', 'E2F', 'CCND1', 'CCNE1', 'S_phase', 'AKT', 'CDKN2D', 'CDKN2A', 'Pro_apoptotic_proteins', 'Anti_apoptotic_proteins', 'CYCS', 'Apoptosis', 'Senescence']


In [4]:
# Import the SBML-qual model with bioLQM
model = biolqm.load(f"../Models/{model_name}.sbml")

# Convert it to a GINsim graph and display it
model_lrg = biolqm.to_ginsim(model)
ginsim.show(model_lrg)

In [6]:
# Get the node IDs declared in the model
qual_species_ids = getnodes(model_name)

# Fetch HGNC data for these components
hgnc_data = fetch_hgnc_data(qual_species_ids)

# Wrap in a DataFrame and print it for review
hgnc_df = pd.DataFrame(hgnc_data)
print(hgnc_df)

# Save the table as "<model_name>_hgnc_results.csv" so it can be reviewed by hand
# (the original line had a stray quote before model_name, which was a syntax error)
hgnc_df.to_csv(model_name + "_hgnc_results.csv", index=False)

              original_gene searched_gene               source  \
0       External_quiescence      External       general_search   
1          External_cycling      External       general_search   
2                      PI3K          PI3K       general_search   
3                    TSC1_2          TSC1  direct_symbol_fetch   
4                    mTORC1        mTORC1       general_search   
5                    FOXO3A        FOXO3A       general_search   
6                       ATM           ATM  direct_symbol_fetch   
7                       ROS           ROS       general_search   
8              Mitochondria  Mitochondria       general_search   
9                 Autophagy     Autophagy       general_search   
10                      RAS           RAS       general_search   
11                      ETS           ETS       general_search   
12                      MEF           MEF       general_search   
13                    GSK3b         GSK3b  direct_symbol_fetch   
14        

  
  
  
## After manually checking the search results, we can use them to map the names.


    

In [7]:
import re  # used only in this cell, for whole-identifier replacement

# Read the manually reviewed csv file
hgnc_df = pd.read_csv(model_name + "_hgnc_results_checked.csv")

# Build the original-name -> HGNC-symbol mapping from the reviewed data.
# Rows without a symbol (read back as NaN) are skipped, because str-based
# replacement cannot take NaN; rows whose symbol already equals the original
# name are skipped because there is nothing to rename.
name_mapping = {}
if 'symbol' in hgnc_df.columns:
    for original_name, new_symbol in zip(hgnc_df['original_gene'], hgnc_df['symbol']):
        if pd.isna(original_name) or pd.isna(new_symbol):
            continue
        original_name = str(original_name).strip()
        new_symbol = str(new_symbol).strip()
        if original_name and new_symbol and original_name != new_symbol:
            name_mapping[original_name] = new_symbol

# Read the SBML file as text
with open("../Models/" + model_name + ".sbml", "r", encoding="utf-8") as file:
    sbml_content = file.read()

# Replace gene names based on the mapping.
# - Only whole identifiers are replaced (no letter, digit or underscore directly
#   before or after the match), so a short name is never rewritten inside a
#   longer one. Identifiers that merely embed a name (e.g. "tr_<name>") are
#   therefore left unchanged.
# - All names are replaced in a single pass, so a freshly inserted symbol is
#   never renamed again by a later mapping entry.
if name_mapping:
    alternatives = "|".join(
        re.escape(name) for name in sorted(name_mapping, key=len, reverse=True)
    )
    identifier_pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
    sbml_content = identifier_pattern.sub(
        lambda match: name_mapping[match.group(0)], sbml_content
    )

# Write the modified content to a new file
with open("../Models/" + model_name + "_updated.sbml", "w", encoding="utf-8") as file:
    file.write(sbml_content)

In [8]:
# Load the renamed model and display it
updated_model = biolqm.load(f"../Models/{model_name}_updated.sbml")
updated_model_lrg = biolqm.to_ginsim(updated_model)
ginsim.show(updated_model_lrg)