In [10]:
%pip install ace_tools

Note: you may need to restart the kernel to use updated packages.


In [8]:
%pip install -U pip setuptools

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-75.6.0-py3-none-any.whl.metadata (6.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB 653.6 kB/s eta 0:00:03
   -- ------------------------------------- 0.1/1.8 MB 1.1 MB/s eta 0:00:02
   ---- ----------------------------------- 0.2/1.8 MB 1.4 MB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.8 MB 1.6 MB/s eta 0:00:01
   -------- ------------------------------- 0.4/1.8 MB 1.7 MB/s eta 0:00:01
   -------------- ------------------------- 0.7/1.8 MB 2.3 MB/s eta 0:00:01
   ---------------------- ----------------- 1.0/1.8 MB 3.1 MB/s eta 0:00:01
   --------------------------------- ------ 1.5/1.8 MB 4.0 MB/s eta 0:00:01
   ------------------------------------

In [13]:
import pandas as pd

# Load the TSV file
file_path = 'gene-aging-mechanisms.tsv'

# Read the file as a DataFrame
data = pd.read_csv(file_path, sep="\t", header=None, quoting=3, engine='python')

# Rename columns for clarity
data.columns = ["Gene", "Categories"]

# Clean and split categories
data["Gene"] = data["Gene"].str.strip('"')  # Remove double quotes from gene names
data["Categories"] = data["Categories"].str.strip("'")  # Remove outer single quotes from categories
data["Categories"] = data["Categories"].str.split("','")  # Split multiple categories

# Remove unnecessary symbols from category entries
data["Categories"] = data["Categories"].apply(
    lambda x: [cat.strip("'\"") for cat in x] if isinstance(x, list) else x
)

data


Unnamed: 0,Gene,Categories
0,GHR,[INS/IGF-1 pathway dysregulation]
1,GHRH,"[INS/IGF-1 pathway dysregulation, intercellula..."
2,SHC1,[INS/IGF-1 pathway dysregulation]
3,POU1F1,[transcriptional alterations]
4,PROP1,[transcriptional alterations]
...,...,...
2400,PMF1,[transcriptional alterations]
2401,RP1,[]
2402,SLC41A1,[]
2403,CLTC-IT1,[]


In [21]:
data['Categories'].nunique

<bound method IndexOpsMixin.nunique of 0                       [INS/IGF-1 pathway dysregulation]
1       [INS/IGF-1 pathway dysregulation, intercellula...
2                       [INS/IGF-1 pathway dysregulation]
3                           [transcriptional alterations]
4                           [transcriptional alterations]
                              ...                        
2400                        [transcriptional alterations]
2401                                                   []
2402                                                   []
2403                                                   []
2404                         [SIRT pathway dysregulation]
Name: Categories, Length: 2405, dtype: object>

In [22]:
# Explode the 'Categories' column to create one row per category
exploded_data = data.explode("Categories")

# Group by the 'Categories' column and collect the genes for each category
grouped_by_category = exploded_data.groupby("Categories")["Gene"].apply(list).reset_index()

# Rename columns for clarity
grouped_by_category.columns = ["Category", "Genes"]

# Display grouped data
print(grouped_by_category)


                                             Category  \
0                                                       
1                          AMPK pathway dysregulation   
2                     INS/IGF-1 pathway dysregulation   
3                          SIRT pathway dysregulation   
4                           TOR pathway dysregulation   
5             accumulation of reactive oxygen species   
6                      alterations in DNA methylation   
7                alterations in histone modifications   
8       changes in the extracellular matrix structure   
9                                chromatin remodeling   
10                 degradation of proteolytic systems   
11                            disabled macroautophagy   
12       impairment of proteins folding and stability   
13  impairment of the mitochondrial integrity and ...   
14             intercellular communication impairment   
15                      mitochondrial DNA instability   
16                            n

In [23]:
# Explode the 'Categories' column to create one row per category
exploded_data = data.explode("Categories")

# Group by the 'Categories' column and count the number of genes in each category
genes_per_category = exploded_data.groupby("Categories")["Gene"].count().reset_index()

# Rename columns for clarity
genes_per_category.columns = ["Category", "Gene Count"]

# Display grouped and counted data
print(genes_per_category)


                                             Category  Gene Count
0                                                            1147
1                          AMPK pathway dysregulation          11
2                     INS/IGF-1 pathway dysregulation          54
3                          SIRT pathway dysregulation           1
4                           TOR pathway dysregulation          31
5             accumulation of reactive oxygen species         165
6                      alterations in DNA methylation          20
7                alterations in histone modifications          83
8       changes in the extracellular matrix structure          59
9                                chromatin remodeling          61
10                 degradation of proteolytic systems         193
11                            disabled macroautophagy          37
12       impairment of proteins folding and stability          20
13  impairment of the mitochondrial integrity and ...         110
14        

In [24]:
# Explode the 'Categories' column to create one row per category
exploded_data = data.explode("Categories")

# Group by 'Gene' and count the number of unique categories for each gene
gene_category_counts = exploded_data.groupby("Gene")["Categories"].nunique().reset_index()

# Filter genes that are present in multiple categories
genes_in_multiple_categories = gene_category_counts[gene_category_counts["Categories"] > 1]

# Rename columns for clarity
genes_in_multiple_categories.columns = ["Gene", "Number of Categories"]

# Display the genes that belong to multiple categories
print(genes_in_multiple_categories)


          Gene  Number of Categories
1          A2M                     2
3         AASS                     2
20        ACAN                     2
21         ACE                     2
41      ADAM10                     3
...        ...                   ...
2342     ZBTB1                     3
2360     ZFP36                     2
2368      ZHX2                     2
2370     ZMIZ1                     2
2372  ZMPSTE24                     8

[451 rows x 2 columns]


In [25]:
# Explode the 'Categories' column to create one row per category
exploded_data = data.explode("Categories")

# Count the number of genes in each category
category_gene_counts = exploded_data.groupby("Categories")["Gene"].count().reset_index()
category_gene_counts.columns = ["Category", "Gene Count"]

# Merge the category counts back into the exploded data
exploded_data = exploded_data.merge(category_gene_counts, left_on="Categories", right_on="Category")

# Sort by gene name and category gene count
exploded_data = exploded_data.sort_values(by=["Gene", "Gene Count"])

# Keep only the first occurrence of each gene (assigned to the smallest category)
assigned_genes = exploded_data.drop_duplicates(subset=["Gene"])

# Create a cleaned dataset with unique gene-category assignments
cleaned_data = assigned_genes[["Gene", "Category"]].sort_values(by="Category").reset_index(drop=True)

# Display the cleaned dataset
print(cleaned_data)


        Gene                     Category
0                                        
1      LIPT1                             
2      LINS1                             
3      LIMK2                             
4     LIMCH1                             
...      ...                          ...
2400     TOX  transcriptional alterations
2401  POU1F1  transcriptional alterations
2402   GABPA  transcriptional alterations
2403     FRK  transcriptional alterations
2404    SAFB  transcriptional alterations

[2405 rows x 2 columns]


In [28]:
import os

# Function to sanitize category names for file paths
def sanitize_category_name(category):
    # Replace problematic characters with underscores
    return "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in category).strip().replace(' ', '_')

# Create a directory to store the files
output_dir = "genes_by_category"
os.makedirs(output_dir, exist_ok=True)

# Save genes into separate files by category
for category, group in cleaned_data.groupby("Category"):
    sanitized_category = sanitize_category_name(category)  # Sanitize the category name
    file_name = f"{sanitized_category}_genes.txt"
    file_path = os.path.join(output_dir, file_name)
    
    # Save only the gene names
    group["Gene"].to_csv(file_path, index=False, header=False)
    print(f"Saved genes for category '{category}' to {file_path}")



Saved genes for category '' to genes_by_category\_genes.txt
Saved genes for category 'AMPK pathway dysregulation' to genes_by_category\AMPK_pathway_dysregulation_genes.txt
Saved genes for category 'INS/IGF-1 pathway dysregulation' to genes_by_category\INS_IGF-1_pathway_dysregulation_genes.txt
Saved genes for category 'SIRT pathway dysregulation' to genes_by_category\SIRT_pathway_dysregulation_genes.txt
Saved genes for category 'TOR pathway dysregulation' to genes_by_category\TOR_pathway_dysregulation_genes.txt
Saved genes for category 'accumulation of reactive oxygen species' to genes_by_category\accumulation_of_reactive_oxygen_species_genes.txt
Saved genes for category 'alterations in DNA methylation' to genes_by_category\alterations_in_DNA_methylation_genes.txt
Saved genes for category 'alterations in histone modifications' to genes_by_category\alterations_in_histone_modifications_genes.txt
Saved genes for category 'changes in the extracellular matrix structure' to genes_by_category\

In [1]:
import pandas as pd

# Load the data
genes_file = "_genes.txt"
criteria_file = "gene-criteria1.tsv"

# Read the gene list
with open(genes_file, "r") as file:
    gene_list = [line.strip() for line in file if line.strip()]

# Read the selection criteria file
criteria_df = pd.read_csv(criteria_file, sep="\t", header=None, names=["Gene", "Category"])

# Map genes to categories
categories = []
for gene in gene_list:
    matches = criteria_df[criteria_df["Gene"] == gene]
    if not matches.empty:
        categories.append((gene, matches["Category"].iloc[0]))
    else:
        categories.append((gene, "Uncategorized"))

# Create a new DataFrame for the results
result_df = pd.DataFrame(categories, columns=["Gene", "Category"])

# Save to a new file
output_file = "genes_with_categories.csv"
result_df.to_csv(output_file, index=False)
print(f"Genes with updated categories saved to {output_file}")


Genes with updated categories saved to genes_with_categories.csv


In [2]:
cleaned_data1 = pd.read_csv('genes_with_categories.csv')

In [3]:
cleaned_data1.head()

Unnamed: 0,Gene,Category
0,"""""",Uncategorized
1,LIPT1,"'Age-related changes in gene expression, methy..."
2,LINS1,"'Age-related changes in gene expression, methy..."
3,LIMK2,"'Age-related changes in gene expression, methy..."
4,LIMCH1,'Association of genetic variants and gene expr...


In [4]:
import os

# Function to sanitize category names for file paths
def sanitize_category_name(category):
    # Replace problematic characters with underscores
    return "".join(c if c.isalnum() or c in (' ', '-') else '_' for c in category).strip().replace(' ', '_')

# Create a directory to store the files
output_dir = "genes_by_category"
os.makedirs(output_dir, exist_ok=True)

# Save genes into separate files by category
for category, group in cleaned_data1.groupby("Category"):
    sanitized_category = sanitize_category_name(category)  # Sanitize the category name
    file_name = f"{sanitized_category}_genes.txt"
    file_path = os.path.join(output_dir, file_name)
    
    # Save only the gene names
    group["Gene"].to_csv(file_path, index=False, header=False)
    print(f"Saved genes for category '{category}' to {file_path}")



Saved genes for category ''Age-related changes in gene expression, methylation or protein activity in humans'' to genes_by_category\_Age-related_changes_in_gene_expression__methylation_or_protein_activity_in_humans__genes.txt
Saved genes for category ''Age-related changes in gene expression, methylation or protein activity in non-mammals'' to genes_by_category\_Age-related_changes_in_gene_expression__methylation_or_protein_activity_in_non-mammals__genes.txt
Saved genes for category ''Age-related changes in gene expression, methylation or protein activity'' to genes_by_category\_Age-related_changes_in_gene_expression__methylation_or_protein_activity__genes.txt
Saved genes for category ''Association of genetic variants and gene expression levels with longevity'' to genes_by_category\_Association_of_genetic_variants_and_gene_expression_levels_with_longevity__genes.txt
Saved genes for category ''Association of the gene with accelerated aging in humans'' to genes_by_category\_Association_of

In [1]:
from Bio import Entrez
from http.client import IncompleteRead
import time


# Input file containing gene names
gene_file = "genes_by_category\impairment_of_the_mitochondrial_integrity_and_biogenesis_genes.txt"

# Output FASTA file for large genes (protein sequences)
output_fasta = "genes_by_category/opengenes1_proteins.fasta"

# Function to fetch sequences robustly with retries
def robust_fetch(fetch_function, retries=3, wait_time=5):
    for attempt in range(retries):
        try:
            return fetch_function()
        except IncompleteRead as e:
            print(f"IncompleteRead error: {e}. Retrying {attempt + 1}/{retries}...")
            time.sleep(wait_time)
    raise Exception("Failed to fetch data after multiple retries")

# Fetch protein sequences from NCBI
def fetch_protein_sequence(gene_name):
    try:
        # Search for the gene in the NCBI protein database (restricted to Homo sapiens)
        query = f"{gene_name}[Gene] AND Homo sapiens[Organism]"
        handle = Entrez.esearch(db="protein", term=query, retmax=1)
        record = Entrez.read(handle)
        handle.close()

        # Fetch the protein sequence by ID
        if record["IdList"]:
            seq_id = record["IdList"][0]
            print(f"Fetching protein sequence for {gene_name} (ID: {seq_id})...")
            handle = robust_fetch(
                lambda: Entrez.efetch(
                    db="protein", id=seq_id, rettype="fasta", retmode="text"
                )
            )
            sequence = handle.read()
            handle.close()
            return sequence
        else:
            print(f"No protein sequence found for {gene_name}")
            return None
    except Exception as e:
        print(f"Error fetching {gene_name}: {e}")
        return None

# Read gene names and fetch protein sequences
with open(gene_file, "r") as file:
    genes = [line.strip() for line in file if line.strip()]

with open(output_fasta, "w") as fasta_out:
    for gene in genes:
        protein_sequence = fetch_protein_sequence(gene)
        if protein_sequence:
            fasta_out.write(protein_sequence)

print(f"Protein sequences saved to: {output_fasta}")

  gene_file = "genes_by_category\impairment_of_the_mitochondrial_integrity_and_biogenesis_genes.txt"
            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Fetching protein sequence for ALKBH7 (ID: 14150066)...
Fetching protein sequence for NDUFV3 (ID: 20178326)...
Fetching protein sequence for RAB3A (ID: 131801)...
Fetching protein sequence for PMAIP1 (ID: 6919841)...
Fetching protein sequence for SOD2 (ID: 1018191640)...
Fetching protein sequence for SIRT5 (ID: 1776945293)...
Fetching protein sequence for CEP89 (ID: 50083293)...
Fetching protein sequence for SIRT4 (ID: 1880340295)...
Fetching protein sequence for BAK1 (ID: 4502363)...
Fetching protein sequence for IMMT (ID: 2181405828)...
Fetching protein sequence for TIMM8A (ID: 6014944)...
Fetching protein sequence for BAD (ID: 14670388)...
Fetching protein sequence for HTRA2 (ID: 1012282651)...
Fetching protein sequence for NOS3 (ID: 231571356)...
Fetching protein sequence for MTFP1 (ID: 52783151)...
Fetching protein sequence for DNM1L (ID: 1058916509)...
Fetching protein sequence for SNCA (ID: 1768365570)...
Fetching protein sequence for FEZ1 (ID: 13431526)...
Fetching protein seque

In [1]:
input_file = "realmergedprots_filtered.faa"
output_file = "realmergedprots_noen.faa"

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    write = True
    for line in infile:
        if line.startswith(">en"):
            write = False  # Skip writing this header line
        elif line.startswith(">"):
            write = True  # Reset for the next header
        if write:
            outfile.write(line)

print(f"Filtered file saved to {output_file}")


Filtered file saved to realmergedprots_noen.faa


mmseqs/bin/mmseqs easy-search impairment_of_the_mitochondrial_integrity_and_biogenesis_genes.fasta realmergedprots_filtered.faa mitimpair.m8 -c 0.2 tmp --min-seq-id 0.3 --alt-ali 25

import Biopython

In [1]:
import Bio

In [2]:
%pip freeze

ace_tools==0.0Note: you may need to restart the kernel to use updated packages.

aiohttp @ file:///C:/b/abs_8860tt1424/croot/aiohttp_1715108828392/work
aiosignal @ file:///tmp/build/80754af9/aiosignal_1637843061372/work
anaconda-anon-usage @ file:///C:/b/abs_c3w_h1zzjg/croot/anaconda-anon-usage_1710965204622/work
anaconda-catalogs @ file:///C:/Users/dev-admin/perseverance-python-buildout/croot/anaconda-catalogs_1701816586117/work
anaconda-client @ file:///C:/b/abs_34txutm0ue/croot/anaconda-client_1708640705294/work
anaconda-cloud-auth @ file:///C:/b/abs_b02evi84gh/croot/anaconda-cloud-auth_1713991445770/work
anaconda-navigator @ file:///C:/b/abs_d8d4a02c3t/croot/anaconda-navigator_1713464063970/work
anaconda-project @ file:///C:/b/abs_95s0l9dwvd/croot/anaconda-project_1706049257687/work
annotated-types @ file:///C:/b/abs_0dmaoyhhj3/croot/annotated-types_1709542968311/work
anyio @ file:///C:/b/abs_847uobe7ea/croot/anyio_1706220224037/work
appdirs==1.4.4
archspec @ file:///croot/archspec

In [1]:
import os

def count_genes_in_categories(folder_path='genes_by_category'):
    """
    Counts and displays the number of genes (lines) in each .txt file
    in the given folder. Assumes each line in the file represents one gene.
    """
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            # Derive category name from the file name (strip '.txt')
            category_name = os.path.splitext(filename)[0]
            
            # Construct full path to the file
            file_path = os.path.join(folder_path, filename)
            
            # Read all lines and count them
            with open(file_path, 'r') as f:
                lines = [line.strip() for line in f if line.strip()]  # strip out empty lines
            num_genes = len(lines)
            
            print(f"Category: {category_name}, Number of genes: {num_genes}")


if __name__ == "__main__":
    count_genes_in_categories()


Category: accumulation_of_reactive_oxygen_species_genes, Number of genes: 126
Category: alterations_in_DNA_methylation_genes, Number of genes: 18
Category: alterations_in_histone_modifications_genes, Number of genes: 45
Category: AMPK pathway dysregulation_genes, Number of genes: 11
Category: AMPK_pathway_dysregulation_genes, Number of genes: 11
Category: changes_in_the_extracellular_matrix_structure_genes, Number of genes: 57
Category: chromatin_remodeling_genes, Number of genes: 39
Category: degradation_of_proteolytic_systems_genes, Number of genes: 114
Category: disabled_macroautophagy_genes, Number of genes: 34
Category: impairment_of_proteins_folding_and_stability_genes, Number of genes: 20
Category: impairment_of_the_mitochondrial_integrity_and_biogenesis_genes, Number of genes: 79
Category: INS_IGF-1_pathway_dysregulation_genes, Number of genes: 45
Category: intercellular_communication_impairment_genes, Number of genes: 118
Category: mitochondrial_DNA_instability_genes, Number o

In [2]:
import os
import pandas as pd

def generate_genes_df(folder_path='genes_by_category'):
    """
    Generates a DataFrame with two columns: 'category' and 'num_genes'.
    Each row corresponds to a .txt file in the folder, 
    where 'category' is the file name (without .txt) 
    and 'num_genes' is the count of non-empty lines (genes).
    """
    data = []
    
    # Loop over all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            category_name = os.path.splitext(filename)[0]  # Remove the '.txt' extension
            file_path = os.path.join(folder_path, filename)
            
            # Count the number of non-empty lines (genes)
            with open(file_path, 'r') as f:
                lines = [line.strip() for line in f if line.strip()]  
            num_genes = len(lines)
            
            # Append to our data list
            data.append((category_name, num_genes))
    
    # Create a DataFrame from our data
    df = pd.DataFrame(data, columns=['category', 'num_genes'])
    return df

if __name__ == "__main__":
    genes_df = generate_genes_df()
    print(genes_df)
    # Optionally, to save this DataFrame to a CSV file:
    # genes_df.to_csv("genes_by_category_summary.csv", index=False)


                                             category  num_genes
0       accumulation_of_reactive_oxygen_species_genes        126
1                alterations_in_DNA_methylation_genes         18
2          alterations_in_histone_modifications_genes         45
3                    AMPK pathway dysregulation_genes         11
4                    AMPK_pathway_dysregulation_genes         11
5   changes_in_the_extracellular_matrix_structure_...         57
6                          chromatin_remodeling_genes         39
7            degradation_of_proteolytic_systems_genes        114
8                       disabled_macroautophagy_genes         34
9   impairment_of_proteins_folding_and_stability_g...         20
10  impairment_of_the_mitochondrial_integrity_and_...         79
11              INS_IGF-1_pathway_dysregulation_genes         45
12       intercellular_communication_impairment_genes        118
13                mitochondrial_DNA_instability_genes          5
14              nuclear_a