OPENGENES

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
import tabulate

# Combine the two datasets into one list of dictionaries.
# Missing or ambiguous PSG counts are set to 0 by default,
# except where a note indicates a value (e.g., "4" for near-significance).
data = [
    # First dataset (Hallmark categories)
    {"Category": "accumulation_of_reactive_oxygen_species_genes", "Total": 126, "PSGs": 2},
    {"Category": "alterations_in_DNA_methylation_genes", "Total": 18, "PSGs": 1},
    {"Category": "alterations_in_histone_modifications_genes", "Total": 45, "PSGs": 0},  # missing count set to 0
    {"Category": "AMPK pathway dysregulation_genes", "Total": 11, "PSGs": 0},             # missing count set to 0
    {"Category": "changes_in_the_extracellular_matrix_structure_...", "Total": 57, "PSGs": 0},  # missing count set to 0
    {"Category": "chromatin_remodeling_genes", "Total": 39, "PSGs": 1},
    {"Category": "degradation_of_proteolytic_systems_genes", "Total": 114, "PSGs": 4},
    {"Category": "disabled_macroautophagy_genes", "Total": 34, "PSGs": 0},               # missing count set to 0
    {"Category": "impairment_of_proteins_folding_and_stability_g...", "Total": 20, "PSGs": 0},  # missing count set to 0
    {"Category": "impairment_of_the_mitochondrial_integrity_and_...", "Total": 79, "PSGs": 2},
    {"Category": "INS_IGF-1_pathway_dysregulation_genes", "Total": 45, "PSGs": 1},
    {"Category": "intercellular_communication_impairment_genes", "Total": 118, "PSGs": 4},
    {"Category": "mitochondrial_DNA_instability_genes", "Total": 5, "PSGs": 0},            # missing count set to 0
    {"Category": "nuclear_architecture_impairment_genes", "Total": 24, "PSGs": 0},         # missing count set to 0
    {"Category": "nuclear_DNA_instability_genes", "Total": 82, "PSGs": 1},
    {"Category": "senescent_cells_accumulation_genes", "Total": 33, "PSGs": 0},            # missing count set to 0
    {"Category": "SIRT_pathway_dysregulation_genes", "Total": 1, "PSGs": 0},
    {"Category": "stem_cell_exhaustion_genes", "Total": 38, "PSGs": 6},
    {"Category": "sterile_inflammation_genes", "Total": 99, "PSGs": 1},
    {"Category": "telomere_attrition_genes", "Total": 30, "PSGs": 0},                      # missing count set to 0
    {"Category": "TOR_pathway_dysregulation_genes", "Total": 26, "PSGs": 1},
    {"Category": "transcriptional_alterations_genes", "Total": 214, "PSGs": 3},            # note: (+2 not surv. FDR) ignored

    # Second dataset
    {"Category": "Age-related changes in gene expression methylation or protein activity in humans", "Total": 932, "PSGs": 6},
    {"Category": "Age-related changes in gene expression methylation or protein activity in non-mammals", "Total": 2, "PSGs": 0},  # missing PSG count set to 0
    {"Category": "Age-related changes in gene expression methylation or protein activity", "Total": 54, "PSGs": 1},
    {"Category": "Association of genetic variants and gene expression levels with longevity", "Total": 112, "PSGs": 4},  # note: 4 indicated as nearly significant
    {"Category": "Association of the gene with accelerated aging in humans", "Total": 2, "PSGs": 0},
    {"Category": "Changes in gene activity enhance age-related deterioration", "Total": 1, "PSGs": 0},
    {"Category": "Changes in gene activity extend mammalian lifespan", "Total": 12, "PSGs": 0},
    {"Category": "Changes in gene activity extend non-mammalian lifespan", "Total": 8, "PSGs": 0},
    {"Category": "Changes in gene activity protect against age-related impairment", "Total": 3, "PSGs": 0},
    {"Category": "Changes in gene activity reduce mammalian lifespan", "Total": 11, "PSGs": 0},
    {"Category": "Regulation of  associated with aging", "Total": 8, "PSGs": 0}  # missing PSG count set to 0
]

# Convert the data into a pandas DataFrame for ease of manipulation.
df = pd.DataFrame(data)

# Calculate overall totals for the combined data.
total_genes = df['Total'].sum()
total_psgs = df['PSGs'].sum()

print("Overall total genes:", total_genes)
print("Overall total PSGs:", total_psgs)

# Initialize a list for storing results from each category.
results_list = []

# Loop over each category row, build the contingency table, and perform Fisher's exact test.
for index, row in df.iterrows():
    a = row['PSGs']                     # PSGs in the category
    b = row['Total'] - a                # Non-PSGs in the category
    c = total_psgs - a                  # PSGs outside the category
    # Non-PSGs outside category: remaining genes after excluding both total in category and PSGs outside category.
    d = total_genes - row['Total'] - c
    
    # Create the 2x2 contingency table:
    #         In Category    |  Outside Category
    # PSGs         a          |         c
    # Non-PSGs     b          |         d
    contingency_table = np.array([[a, b],
                                    [c, d]])
    
    # Perform Fisher's exact test.
    odds_ratio, p_value = fisher_exact(contingency_table, alternative='two-sided')
    
    results_list.append({
        "Category": row['Category'],
        "Total": row['Total'],
        "PSGs": a,
        "Odds_Ratio": odds_ratio,
        "P_value": p_value
    })

# Convert the results into a DataFrame.
results_df = pd.DataFrame(results_list)

# Adjust for multiple comparisons using Bonferroni correction.
adjusted = multipletests(results_df['P_value'], method='bonferroni')
results_df['P_value_adj'] = adjusted[1]

# Sort the results by the adjusted p-value.
results_df = results_df.sort_values(by='P_value_adj')

# Print the results
print("\nFisher's Exact Test Results for Each Category:")
print(results_df)

# Optionally, display the results in a nicely formatted table.
print("\nFormatted Results:")
print(tabulate.tabulate(results_df, headers='keys', tablefmt='github', showindex=False))


Overall total genes: 2403
Overall total PSGs: 38

Fisher's Exact Test Results for Each Category:
                                             Category  Total  PSGs  \
17                         stem_cell_exhaustion_genes     38     6   
22  Age-related changes in gene expression methyla...    932     6   
0       accumulation_of_reactive_oxygen_species_genes    126     2   
30  Changes in gene activity protect against age-r...      3     0   
29  Changes in gene activity extend non-mammalian ...      8     0   
28  Changes in gene activity extend mammalian life...     12     0   
27  Changes in gene activity enhance age-related d...      1     0   
26  Association of the gene with accelerated aging...      2     0   
25  Association of genetic variants and gene expre...    112     4   
24  Age-related changes in gene expression methyla...     54     1   
23  Age-related changes in gene expression methyla...      2     0   
21                  transcriptional_alterations_genes    214   

ECM

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
import tabulate

# Define the data (ignoring isoforms)
data = [
    {"Category": "ECM glycoproteins", "Total": 195, "PSGs": 3},
    {"Category": "Collagens", "Total": 44, "PSGs": 1},
    {"Category": "Proteoglycans", "Total": 35, "PSGs": 1},
    {"Category": "ECM-affiliated proteins", "Total": 171, "PSGs": 0},  # assumed PSG count = 0
    {"Category": "ECM regulators", "Total": 238, "PSGs": 3},
    {"Category": "Secreted factors", "Total": 344, "PSGs": 16},
]

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Calculate overall totals
total_genes = df['Total'].sum()
total_psgs = df['PSGs'].sum()

print("Overall total genes:", total_genes)
print("Overall total PSGs:", total_psgs)

# Initialize a list to store Fisher's test results for each category
results_list = []

# Loop through each category in the DataFrame to construct the contingency table and run Fisher's exact test.
for index, row in df.iterrows():
    # a = PSGs in the category
    a = row['PSGs']
    # b = Non-PSGs in the category
    b = row['Total'] - a
    # c = PSGs outside the category (overall PSGs minus the ones in the category)
    c = total_psgs - a
    # d = Non-PSGs outside the category = total genes outside category - PSGs outside category
    d = total_genes - row['Total'] - c
    
    # Create the 2x2 contingency table:
    #                In Category   |  Outside Category
    # PSGs             a           |  c
    # Non-PSGs         b           |  d
    contingency_table = np.array([[a, b],
                                    [c, d]])
    
    # Perform Fisher's exact test (two-sided)
    odds_ratio, p_value = fisher_exact(contingency_table, alternative='two-sided')
    
    results_list.append({
        "Category": row['Category'],
        "Total": row['Total'],
        "PSGs": a,
        "Odds_Ratio": odds_ratio,
        "P_value": p_value
    })

# Convert the results into a DataFrame
results_df = pd.DataFrame(results_list)

# Adjust p-values for multiple testing using the Bonferroni correction
adjusted = multipletests(results_df['P_value'], method='bonferroni')
results_df['P_value_adj'] = adjusted[1]

# Sort the results by the adjusted p-value to highlight the most significant categories
results_df = results_df.sort_values(by='P_value_adj')

# Display the results
print("\nFisher's Exact Test Results for Each ECM-Related Category:")
print(results_df)

# Optionally, display a nicely formatted table
print("\nFormatted Results:")
print(tabulate.tabulate(results_df, headers='keys', tablefmt='github', showindex=False))


Overall total genes: 1027
Overall total PSGs: 24

Fisher's Exact Test Results for Each ECM-Related Category:
                  Category  Total  PSGs  Odds_Ratio   P_value  P_value_adj
5         Secreted factors    344    16    4.115854  0.001457     0.008742
3  ECM-affiliated proteins    171     0    0.000000  0.022744     0.136464
0        ECM glycoproteins    195     3    0.603423  0.598917     1.000000
1                Collagens     44     1    0.970677  1.000000     1.000000
2            Proteoglycans     35     1    1.239130  0.569070     1.000000
4           ECM regulators    238     3    0.466869  0.325617     1.000000

Formatted Results:
| Category                |   Total |   PSGs |   Odds_Ratio |   P_value |   P_value_adj |
|-------------------------|---------|--------|--------------|-----------|---------------|
| Secreted factors        |     344 |     16 |     4.11585  |  0.001457 |      0.008742 |
| ECM-affiliated proteins |     171 |      0 |     0        |  0.022744 |   