# Statistical description of functional modules

In this notebook, we'll statistically describe the contents of the modules identified using WGCNA analysis. Instead of using enrichment analysis, we'll use Mann-Whitney U tests to see which pathways rank highest within each module, i.e. which pathways contain the KOs that are most strongly correlated with the module eigengene.
The python functions needed for this notebook are in the `scripts/analysis/module_mwu.py` file. The functions are imported below.

To do this analysis, we need:
- Lists of KO's in each module, ranked by correlation with the module eigengene (but not filtered by p-value or a cutoff!). These are in the `data/analysis/WGCNA/information/` directory. The files are called MEname_content.txt and contain a column of KO's and a column called moduleMembership, which is the correlation between a KO and the module eigengene.
- For all KO's, a list with annotation information. Here these are pathway IDs to which these KOs belong. These are parsed from the KEGG API.

Afterwards, we want to describe, for certain pathways in a module, how many of their original transcripts are annotated as a certain species.

In [1]:
from module_mwu import qvals_bh, enrich_mwu
import pandas as pd
import requests
import os
import concurrent.futures
import csv
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from ast import literal_eval
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1 import make_axes_locatable

### Annotate KO IDs

In [None]:
# Load every KO term found in the dataset (the file is read without a header row)
ko_terms = pd.read_csv(
    '../../data/analysis/WGCNA/information/KEGG_ID_list.txt',
    header=None,
)

# Accumulator that maps each KO ID to the list of pathway IDs linked to it
ko_pathway_dict = defaultdict(list)

# Define a function to get the pathway information for a single KO ID
def get_pathways(ko_id, ko_pathway_dict=ko_pathway_dict):
    """Look up the KEGG pathways linked to one KO ID and record them.

    Parameters
    ----------
    ko_id : str
        KO identifier; it is appended to the KEGG ``link/pathway`` URL.
    ko_pathway_dict : dict of list, optional
        Mapping that is updated in place. Defaults to the notebook-level
        ``ko_pathway_dict`` that exists when this cell is executed.

    Returns
    -------
    None
        The result is stored under ``ko_pathway_dict[ko_id]``: the linked
        pathway IDs containing ``'map'``, or the single string ``'None'`` when
        no such pathway is linked.

    Raises
    ------
    requests.HTTPError
        If KEGG answers with an error status other than 404.
    requests.Timeout
        If KEGG does not answer within 30 seconds.
    """
    # A timeout prevents one stalled connection from blocking the whole loop
    response = requests.get(f"https://rest.kegg.jp/link/pathway/{ko_id}", timeout=30)

    if response.status_code == 404:
        # KEGG documents 404 as "not found"; treat it as "no linked pathways"
        lines = []
    else:
        # Any other error (e.g. rate limiting) must not be parsed as if it were data
        response.raise_for_status()
        lines = [line for line in response.text.split("\n") if line]

    # Each line looks like "ko:K00001<TAB>path:map00010"; the pathway ID is the
    # text after the last colon. Only the reference ('map') pathways are kept.
    pathways = [line.rsplit(":", 1)[-1] for line in lines]
    pathways = [x for x in pathways if 'map' in x]

    if pathways:
        # Add all pathways to the dictionary for this KO ID
        ko_pathway_dict[ko_id].extend(pathways)
    else:
        # No (remaining) pathway for this KO ID: store the 'None' marker so the
        # KO is filtered out downstream instead of becoming an empty list
        ko_pathway_dict[ko_id].append('None')

# Send one request to the KEGG API for every distinct KO ID in the dataset.
# Iterating over the DataFrame itself would yield its column labels (here the
# single label 0) rather than the KO IDs, so iterate over the values of the
# first column instead.
for ko_id in tqdm(ko_terms.iloc[:, 0].dropna().unique(), desc='Getting pathways...'):
    get_pathways(ko_id)

In [None]:
# Completeness check: the number of distinct KO terms in the input ...
print(np.unique(ko_terms).size)
# ... next to the number of KO IDs that received an entry in the dictionary
print(len(ko_pathway_dict))

In [None]:
# Store the KO -> pathway mapping as a two-column CSV file.
# The pathway list of each KO is written as its Python list representation.
output_file = "../../data/analysis/WGCNA/ko_pathway_info.csv"
with open(output_file, "w", newline="") as handle:
    csv_writer = csv.writer(handle)
    csv_writer.writerow(["KEGG_ID", "Pathway"])
    csv_writer.writerows([ko, linked] for ko, linked in ko_pathway_dict.items())

## Combine annotation data with modules & perform MWU tests

In [None]:
# Build the annotation table passed to enrich_mwu: one row per (KO, pathway)
# pair, with KOs that carry the 'None' marker removed.
KO_info = (
    pd.read_csv("../../data/analysis/WGCNA/ko_pathway_info.csv")
    # The Pathway column holds the text form of a Python list; parse it back
    .assign(Pathway=lambda table: table['Pathway'].apply(literal_eval))
    .explode('Pathway')
    .loc[lambda table: table['Pathway'] != 'None']
    .set_axis(['KEGG_ID', 'annotation'], axis=1)
)

input_dir = "../../data/analysis/WGCNA/information/"

# Run the MWU test for every module content file in the folder
for filename in tqdm(os.listdir(input_dir), desc="Processing modules..."):
    if not filename.endswith("_content.txt"):
        continue

    # The module name is the part of the file name before the first underscore
    module = filename.split("_")[0]
    module_data = pd.read_table(os.path.join(input_dir, filename))

    # Rank the KOs by the column named after the module, highest first
    module_data = module_data.sort_values(by=[module], ascending=False)

    # Run MWU and save the results next to the input files
    res = enrich_mwu(module_data, KO_info, min_size=1, print_progress=True)
    res.to_csv(f"{input_dir}{module}_MWU_results.csv", index=False)

## Add pathway description to MWU test results

In [None]:
# For every significantly higher ranking pathway, get more annotation information and store it in a new file
for filename in tqdm(os.listdir(input_dir), desc="Analyzing modules..."):
    if not filename.endswith("_MWU_results.csv"):
        continue

    input_file = os.path.join(input_dir, filename)
    module = filename.split("_")[0]
    res = pd.read_csv(input_file)

    # Keep the pathways passing both thresholds. The explicit copy makes sure
    # the columns added below go into an independent frame rather than into a
    # slice of the unfiltered results (avoids SettingWithCopyWarning).
    res = res[(res['q_val'] < 0.10) & (res['estimate'] > 0.5)].copy()

    # Get more annotation information for the pathways
    ## Add empty columns to the dataframe
    res['pathway_name'] = ''
    res['pathway_class'] = ''
    for pathway in tqdm(res['annotation'].unique(), desc="Adding pathway information..."):
        # Send a GET request to the KEGG API to get the pathway information
        response = requests.get(f"https://rest.kegg.jp/get/path:{pathway}", timeout=30)

        # Parse the pathway name and class from the response; both stay empty
        # when the request failed or the entry has no such field
        pathway_name = ""
        pathway_class = ""
        if response.ok:
            for line in response.text.split("\n"):
                # The field label is followed by a variable amount of padding,
                # so drop the label and strip the surrounding whitespace
                if line.startswith("NAME"):
                    pathway_name = line[len("NAME"):].strip()
                elif line.startswith("CLASS"):
                    pathway_class = line[len("CLASS"):].strip()

        # Add the pathway name and class to the dataframe
        is_current = res['annotation'] == pathway
        res.loc[is_current, 'pathway_name'] = pathway_name
        res.loc[is_current, 'pathway_class'] = pathway_class

    # Save the results
    res.to_csv(f"{input_dir}{module}_MWU_results_annotated.csv", index=False)

## Relating taxonomy and module pathways
For a given pathway within a certain module, I want to obtain the species that is associated most with the KOs within that pathway.
Approach:
1. For a given pathway X, obtain all KOs associated with that pathway that are included in the WGCNA analysis
2. For that list of KOs, extract all transcript names annotated as one of these KOs
3. For the extracted list of transcript names:
    - Plot the TPM sums per month
    - Plot the fraction of transcripts taxonomically annotated as Z
    - Give the mean percentage of this set of KOs annotated as Z

In [2]:
## Load the necessary data

def _strip_orf_suffix(ids):
    """Return the part of every identifier in `ids` that precedes its first '.'."""
    # n=1 keeps this working for identifiers with no dot or with several dots
    return ids.str.split(".", n=1).str[0]


def _load_eukprot_hits(m8_path, taxonomy):
    """Read a headerless 12-column hit table and attach the taxonomy of each hit.

    Parameters
    ----------
    m8_path : str
        Path of the tab-separated hit table.
    taxonomy : pandas.DataFrame
        Table with an 'EukProt_ID' column; its columns are added to each hit.

    Returns
    -------
    pandas.DataFrame
        One row per hit with 'query_id', 'p_ident', 'alnlen', 'mismatch' and
        all columns of `taxonomy` (left join, so unmatched hits are kept).
    """
    hits = pd.read_table(m8_path, header=None)
    hits.columns = ['query_id', 'target_id', 'p_ident', 'alnlen', 'mismatch', 'gapopen',
                    'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']
    # Make the transcript names equal to the identifiers used in the count files
    hits['query_id'] = _strip_orf_suffix(hits['query_id'])
    # The EukProt ID is the part of the target name before the first underscore
    hits['target_id'] = hits['target_id'].str.split("_", n=1).str[0]
    hits = hits.merge(taxonomy, left_on='target_id', right_on='EukProt_ID', how='left')
    # Drop the columns that are not needed
    return hits.drop(columns=['target_id', 'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits'])


### KOs and their associated pathways that have been included in the WGCNA analysis
KO_info = pd.read_csv("../../data/analysis/WGCNA/ko_pathway_info.csv")

### The transcript functional annotation data
annotation = pd.read_csv('../../data/annotation/functional_eggnog/functional_annotation.emapper.annotations', sep = '\t', engine = 'pyarrow')

#### Fix transcript names in the first column so that they equal the transcript identifiers in the count files
#### This is necessary because TransDecoder adds .p2 or .p1 to the sequence identifiers
annotation.iloc[:, 0] = _strip_orf_suffix(annotation.iloc[:, 0])

### The transcript counts
counts = pd.read_csv('../../data/kallisto/tpm.csv', sep=',', index_col=0, engine='pyarrow')

## The EukProt taxonomy table (read once and shared by both hit tables below)
eukprot_taxonomy = pd.read_table('../../data/annotation/taxonomy_eukprot/EukProt_included_data_sets.v03.2021_11_22.txt')

# Drop the columns that are not needed
eukprot_taxonomy = eukprot_taxonomy.drop(columns=['Previous_Names', 'Replaces_EukProt_ID', 'Data_Source_URL', 'Data_Source_Name', 'Paper_DOI', 'Actions_Prior_to_Use',
       'Data_Source_Type', 'Notes', 'Columns_Modified_Since_Previous_Version', 'Merged_Strains',
       'Alternative_Strain_Names', '18S_Sequence_GenBank_ID', '18S_Sequence',
       '18S_Sequence_Source', '18S_Sequence_Other_Strain_GenBank_ID',
       '18S_Sequence_Other_Strain_Name', '18S_and_Taxonomy_Notes'])

# Swap the _ to a space in the Name_to_Use column
eukprot_taxonomy['Name_to_Use'] = eukprot_taxonomy['Name_to_Use'].str.replace('_', ' ')

## The EukProt taxonomic annotation data ("90plus" hit table)
eukprot_annotation = _load_eukprot_hits('../../data/annotation/taxonomy_eukprot/eukprot_DB.firsthit.90plus_alnscore.m8', eukprot_taxonomy)

## The shallow EukProt taxonomic annotation data ("60plus" hit table)
eukprot_annotation_60 = _load_eukprot_hits('../../data/annotation/taxonomy_eukprot/eukprot_DB.firsthit.60plus_alnscore.m8', eukprot_taxonomy)

# load metadata
meta = pd.read_csv('../../samples.csv', sep=';', index_col=0)
## Set the order of the months in the metadata
month_order = ["July_2020", "August_2020", "September_2020", "November_2020", 
                "December_2020", "January_2021", "February_2021", "April_2021", 
                "May_2021", "June_2021", "July_2021"]

In [3]:
def plot_pathway_by_genus(KO_info, annotation, counts, eukprot_annotation, meta, module, pathway, genus, module_dir='../../data/analysis/WGCNA/information/'):
    """Plot the monthly TPM of a pathway's transcripts in one module, split by taxonomy.

    Parameters
    ----------
    KO_info : pandas.DataFrame
        Needs the columns 'KEGG_ID' and 'Pathway'; 'Pathway' is searched as text.
    annotation : pandas.DataFrame
        Functional annotation with the columns '#query' and 'KEGG_ko'.
    counts : pandas.DataFrame
        TPM values, one column per sample, indexed by transcript; the index
        must be named 'target_id'.
    eukprot_annotation : pandas.DataFrame
        Taxonomic annotation with the columns 'query_id', 'Genus_UniEuk' and
        'Taxogroup2_UniEuk'.
    meta : pandas.DataFrame
        Sample metadata indexed by sample name, with a 'month' column.
    module : str
        Module name without the 'ME' prefix; 'ME<module>_content.txt' is read
        from `module_dir` and must contain a 'KEGG_ID' column.
    pathway : str
        Pathway ID to look for in ``KO_info['Pathway']``.
    genus : str
        Genus (value of 'Genus_UniEuk') to highlight.
    module_dir : str, optional
        Directory holding the module content files.

    Returns
    -------
    tuple
        ``(fig, num_KOs, num_transcripts_KOs, num_transcripts_genus,
        TPM_fraction_genus, num_transcripts_KOs_genus,
        num_transcripts_annotated)``. The two counts 'num_transcripts_genus'
        and 'num_transcripts_annotated' are computed over the whole of
        `eukprot_annotation`, not only over the pathway's transcripts.

    Raises
    ------
    IndexError
        If `genus` does not occur in ``eukprot_annotation['Genus_UniEuk']``.
    KeyError
        If a selected transcript is missing from the index of `counts`.
    """
    # Extract KO values for the given pathway
    ko_values = KO_info.loc[KO_info['Pathway'].str.contains(pathway, na=False), 'KEGG_ID'].tolist()

    # Read the module data
    filename = os.path.join(module_dir, "ME" + module + "_content.txt")
    module_data = pd.read_table(filename)
    
    # Retain all KOs for a given pathway that are also found in the module
    ko_values = list(set(ko_values) & set(module_data['KEGG_ID']))

    # Extract relevant transcripts.
    # An empty KO list would give the empty regex '', which matches EVERY row;
    # in that case no transcript may be selected at all. na=False treats rows
    # without a KO annotation as non-matching.
    if ko_values:
        ko_mask = annotation['KEGG_ko'].str.contains('|'.join(ko_values), na=False)
        transcript_names = annotation.loc[ko_mask, '#query'].tolist()
    else:
        transcript_names = []
    # NOTE(review): a transcript name that occurs more than once in
    # `transcript_names` is also selected more than once here, so its TPM is
    # counted repeatedly - confirm whether names should be de-duplicated.
    counts_subset = counts.loc[transcript_names].reset_index().melt(id_vars='target_id', var_name='sample', value_name='TPM')

    # Annotate with taxonomic information
    annotated = eukprot_annotation.loc[eukprot_annotation['Genus_UniEuk'] == genus, 'query_id']
    ## Set of transcripts that are annotated, but not as the given genus
    other = set(eukprot_annotation['query_id']) - set(annotated)
    
    # Extract the Taxogroup2_UniEuk of the given genus (taken from its first hit)
    taxo_group_genus = eukprot_annotation[eukprot_annotation['Genus_UniEuk'] == genus]['Taxogroup2_UniEuk'].iloc[0]

    # Merge the DataFrames to have taxonomic information directly in counts_subset
    counts_subset = counts_subset.merge(eukprot_annotation[['query_id', 'Taxogroup2_UniEuk', 'Genus_UniEuk']], 
                                left_on='target_id', 
                                right_on='query_id', 
                                how='left')

    # Label every row, most specific label first:
    # the genus itself > same taxogroup as the genus > any other taxon > unannotated
    counts_subset['annotation'] = 'unannotated'
    counts_subset.loc[counts_subset['Genus_UniEuk'] == genus, 'annotation'] = genus
    counts_subset.loc[
        (counts_subset['annotation'] == 'unannotated') & 
        (counts_subset['Taxogroup2_UniEuk'] == taxo_group_genus), 'annotation'] = taxo_group_genus
    counts_subset.loc[
        (counts_subset['annotation'] == 'unannotated') & 
        (counts_subset['Taxogroup2_UniEuk'] != taxo_group_genus) & 
        (counts_subset['Taxogroup2_UniEuk'].notna()), 'annotation'] = 'other'

    # Add month information
    plot_df = pd.merge(counts_subset, meta, left_on='sample', right_index=True)
    # Ensure all months are represented in the data
    all_months = ["July_2020", "August_2020", "September_2020", "November_2020", 
                  "December_2020", "January_2021", "February_2021", "April_2021", 
                  "May_2021", "June_2021", "July_2021"]
    # Set the 'month' column as a categorical type with predefined categories
    plot_df['month'] = pd.Categorical(plot_df['month'], categories=all_months, ordered=True)

    # Compute metrics
    num_KOs = len(ko_values)
    num_transcripts_KOs = len(transcript_names)
    num_transcripts_genus = len(annotated)
    num_transcripts_annotated = len(other) + len(annotated)

    # Compute the number of transcripts for the associated KOs that are also annotated as the given genus
    num_transcripts_KOs_genus = len(set(transcript_names).intersection(set(annotated)))
    
    # Compute the fraction of the total TPM sum for the involved transcripts in a month attributed to the genus.
    # observed=False keeps every month category in the result, also months without data.
    total_TPM = plot_df.groupby('month', observed=False)['TPM'].sum()
    genus_TPM = plot_df[plot_df['annotation'] == genus].groupby('month', observed=False)['TPM'].sum()
    TPM_fraction_genus = (genus_TPM / total_TPM).fillna(0)

    # Plot
    ## Convert width and height from cm to pixels
    width_px = 7 * 37.795275591 
    height_px = 5 * 37.795275591
    fig = px.histogram(plot_df.sort_values("month"), 
                       x="month", 
                       y="TPM",
                       color="annotation",
                       histfunc="sum",  # Sum the TPM values for each month
                       barmode="relative",  # Stack the annotation groups on top of each other (absolute TPM, not normalised)
                       color_discrete_map={genus: 'green', taxo_group_genus: 'lightblue', 'other': 'lightcoral', 'unannotated': 'grey'},  # Fixed colour per annotation group
                       category_orders={"month": all_months, "annotation": [genus, taxo_group_genus, 'other', 'unannotated']})  # Specify the order of the months and of the stacked groups

    # Modify x-axis labels to show only the first letter of each month
    fig.update_xaxes(
        tickvals=list(range(len(all_months))),
        ticktext=[month[0] for month in all_months]
    )

    fig.update_layout(
        title=f"Module: {module}, Pathway: {pathway}",
        font=dict(
            family="Times New Roman, serif",
            size=8,
            color="black"
        ),
        autosize=False,
        width=width_px,
        height=height_px,
        margin=dict(l=0, r=25, b=25, t=25),
        yaxis_title_text='TPM',
        xaxis_title_text='Month'
    )
    
    return fig, num_KOs, num_transcripts_KOs, num_transcripts_genus, TPM_fraction_genus, num_transcripts_KOs_genus, num_transcripts_annotated

In [4]:
# Manually iterate over all modules' KO map IDs identified in MWU and all genera that significantly correlate with a given module

## Still need to do M4, for some reason it takes forever
# List of pathways
pathways = [
"map00946",
"map00903"
]

# Genus and module
genera = ['Noctiluca']
module = 'darkorange'

# Directory to save the figures
base_figures_dir = '../../figures/WGCNA/module_pathways_genus_annotation'

for genus in genera:
    # Create a new directory for each module_genus if it doesn't exist
    figures_dir = os.path.join(base_figures_dir, f"{module}_{genus}")
    os.makedirs(figures_dir, exist_ok=True)

    # Open a file to store the printed results
    with open(os.path.join(figures_dir, "results.txt"), "w") as result_file:
        # Iterate over pathways
        for pathway in pathways:
            # Run the function
            fig, num_KOs, num_transcripts_KOs, num_transcripts_genus, TPM_fraction_genus, num_transcripts_KOs_genus, num_transcripts_annotated = plot_pathway_by_genus(KO_info, annotation, counts, eukprot_annotation, meta, module, pathway, genus)

            fig.show()
            # Save the figure as .svg
            fig_path = os.path.join(figures_dir, f"{pathway}_plot.svg")
            fig.write_image(fig_path)

            # Print the results to the file
            print(f"Results for pathway: {pathway}", file=result_file)
            print("Number of KOs:", num_KOs, file=result_file)
            print("Number of Transcripts Annotated as these KOs:", num_transcripts_KOs, file=result_file)
            print("Number of Transcripts Annotated as the Given Genus:", num_transcripts_genus, file=result_file)
            print("Number of Transcripts Annotated as these KOs and the Given Genus:", num_transcripts_KOs_genus, file=result_file)
            # This value counts every transcript in the taxonomic annotation table
            # (the given genus plus all others); it is not restricted to these KOs,
            # so the label states exactly that.
            print("Number of Transcripts with a Taxonomic Annotation (this genus or any other):", num_transcripts_annotated, file=result_file)
            print("Fraction of these transcripts' total TPM that can be attributed to this genus by month:", TPM_fraction_genus, file=result_file)
            print("\n", file=result_file)  # Add a newline for better readability

#### Identify tax groups with highest absolute abundances in transcripts associated to a pathway

In [None]:
# First, create a function that obtains a name and class for a given KO map ID
def fetch_pathway_info(pathway_ids):
    """Fetch the name and class of each KEGG pathway.

    Parameters
    ----------
    pathway_ids : iterable of str
        Pathway IDs; each one is appended to the KEGG ``get`` URL.

    Returns
    -------
    pandas.DataFrame
        Columns 'pathway_id', 'name' and 'class', one row per distinct ID.
        'name' / 'class' are 'N/A' when the request failed or the entry does
        not contain the corresponding field.
    """
    pathway_info = {}
    for pathway_id in tqdm(pathway_ids, desc="Fetching Pathway Info"):
        # A timeout prevents one stalled connection from blocking the whole loop
        response = requests.get(f"https://rest.kegg.jp/get/{pathway_id}", timeout=30)
        
        pathway_name = 'N/A'  # Default value
        pathway_class = 'N/A'  # Default value
        
        # Only parse successful answers; failures keep the 'N/A' defaults
        if response.ok:
            for line in response.text.split("\n"):
                # The field label is followed by a variable amount of padding,
                # so drop the label and strip the surrounding whitespace
                if line.startswith("NAME"):
                    pathway_name = line[len("NAME"):].strip()
                elif line.startswith("CLASS"):
                    pathway_class = line[len("CLASS"):].strip()

        pathway_info[pathway_id] = {'name': pathway_name, 'class': pathway_class}
        
    # Convert pathway_info to a DataFrame
    pathway_df = pd.DataFrame.from_dict(pathway_info, orient='index', columns=['name', 'class'])
    # Reset the index to have the pathway id as a separate column
    pathway_df = pathway_df.reset_index().rename(columns={'index': 'pathway_id'})
    
    return pathway_df

# Fetch the descriptors for every pathway that occurs in KO_info. At this point
# the name `pathways` still refers to the short hand-picked list of the plotting
# cell above; using it would leave almost all pathways without a name and class
# in the analyses further down.
all_pathway_ids = KO_info['Pathway'].str.extractall(r"'(map\d+)'")[0].unique()
pathway_info = fetch_pathway_info(all_pathway_ids)

# Store the pathway info as a csv file
pathway_info.to_csv('../../data/analysis/WGCNA/pathway_info.csv')

In [None]:
# Collect every distinct pathway ID of the form 'map<digits>' mentioned in KO_info
pathway_matches = KO_info['Pathway'].str.extractall(r"'(map\d+)'")
pathways = pathway_matches[0].unique()

# Load the stored pathway names and classes (first CSV column is the index)
pathway_info = pd.read_csv(
    '../../data/analysis/WGCNA/pathway_info.csv',
    index_col=0,
)

In [None]:
def pathway_abundance_analysis(KO_info, annotation, counts, eukprot_annotation, pathways, pathway_info, tax_level='Name_to_Use', result_path='../../data/analysis/WGCNA/pathway_taxonomic_analysis.csv'):
    """Find, per pathway, the taxon with the most annotated transcripts and write a CSV.

    Parameters
    ----------
    KO_info : pandas.DataFrame
        Needs the columns 'KEGG_ID' and 'Pathway'; 'Pathway' is searched as text.
    annotation : pandas.DataFrame
        Functional annotation with the columns '#query' and 'KEGG_ko'.
    counts : pandas.DataFrame
        TPM values indexed by transcript name, one column per sample.
    eukprot_annotation : pandas.DataFrame
        Taxonomic annotation with a 'query_id' column and the `tax_level` column.
    pathways : iterable of str
        Pathway IDs to analyse.
    pathway_info : pandas.DataFrame
        Columns 'pathway_id', 'name' and 'class'.
    tax_level : str, optional
        Column of `eukprot_annotation` used to group the transcripts.
    result_path : str, optional
        Path of the CSV file that is written.

    Returns
    -------
    None
        One row per pathway is written to `result_path`. 'TPM_Percentage' is
        the share of the pathway's total TPM (all samples) that belongs to the
        transcripts of the biggest contributor.

    Raises
    ------
    KeyError
        If a selected transcript is missing from the index of `counts`.
    """
    results = []

    # Look-up table for pathway name/class, built once instead of per pathway.
    # The first row wins when a pathway ID occurs more than once.
    info_by_id = pathway_info.drop_duplicates('pathway_id').set_index('pathway_id')

    for pathway in tqdm(pathways, desc="Analyzing Pathways"):
        # Filtering KO_info for rows where the Pathway column contains the current pathway
        ko_values = KO_info[KO_info['Pathway'].str.contains(pathway, na=False)]['KEGG_ID'].tolist()
        
        # Transcripts whose KEGG_ko entry mentions any of these KOs.
        # An empty KO list would give the empty regex '', which matches EVERY
        # row, so that case selects nothing. na=False treats rows without a KO
        # annotation as non-matching.
        if ko_values:
            ko_mask = annotation['KEGG_ko'].str.contains('|'.join(ko_values), na=False)
            transcript_names = annotation.loc[ko_mask, '#query'].tolist()
        else:
            transcript_names = []
        num_func_annotated_transcripts = len(transcript_names)

        # Get taxonomic annotations of these transcripts
        annotated_transcripts = eukprot_annotation[eukprot_annotation['query_id'].isin(transcript_names)]
        num_tax_annotated_transcripts = len(annotated_transcripts)
        
        # Biggest Contributor: the taxon with the largest number of transcripts
        tax_counts = annotated_transcripts.groupby(tax_level).size()
        
        if not tax_counts.empty:
            biggest_contributor, biggest_contributor_count = tax_counts.idxmax(), tax_counts.max()
            
            # Calculate TPM contribution
            tpm_counts = counts.loc[transcript_names]
            total_tpm = tpm_counts.sum().sum()
            contributor_ids = annotated_transcripts.loc[annotated_transcripts[tax_level] == biggest_contributor, 'query_id']
            genus_tpm = tpm_counts.loc[contributor_ids].sum().sum()
            tpm_percentage = (genus_tpm / total_tpm) * 100 if total_tpm != 0 else 0
        else:
            biggest_contributor, biggest_contributor_count, tpm_percentage = 'N/A', 0, 0
        
        # Extract Pathway_Name and Pathway_Class from pathway_info
        if pathway in info_by_id.index:
            pathway_name = info_by_id.at[pathway, 'name']
            pathway_class = info_by_id.at[pathway, 'class']
        else:
            pathway_name = pathway_class = 'N/A'

        results.append({
            'Pathway_ID': pathway,
            'Pathway_Name': pathway_name,
            'Pathway_Class': pathway_class,
            'Func_Annotated_Transcripts': num_func_annotated_transcripts,
            'Tax_Annotated_Transcripts': num_tax_annotated_transcripts,
            'Biggest_Contributor': biggest_contributor,
            'Contributor_Count': biggest_contributor_count,
            'TPM_Percentage': tpm_percentage
        })
    
    # Create DataFrame and save as CSV
    result_df = pd.DataFrame(results)
    result_df.to_csv(result_path, index=False)

In [None]:
# Run the abundance analysis once per taxonomic resolution:
# (taxonomic annotation table, taxonomy column, output file)
abundance_runs = [
    # deep taxonomic level
    (eukprot_annotation, 'Name_to_Use', '../../data/analysis/WGCNA/pathway_deep_taxonomic_analysis.csv'),
    # genus level
    (eukprot_annotation, 'Genus_UniEuk', '../../data/analysis/WGCNA/pathway_genus_taxonomic_analysis.csv'),
    # shallow taxonomic level (uses the 60plus annotation table)
    (eukprot_annotation_60, 'Taxogroup2_UniEuk', '../../data/analysis/WGCNA/pathway_shallow_taxonomic_analysis.csv'),
]

for tax_table, level, out_path in abundance_runs:
    pathway_abundance_analysis(KO_info, annotation, counts, tax_table, pathways, pathway_info, tax_level=level, result_path=out_path)

#### Identify tax groups that contribute most to the expression of a given pathway

In [None]:
def grouped_pathway_analysis(KO_info, annotation, counts, eukprot_annotation, pathways, pathway_info, meta, group_by='month', tax_level='Name_to_Use', result_path='../../data/analysis/WGCNA/pathway_analysis.csv'):
    """Find, per pathway and sample group, the taxon contributing the most TPM and write a CSV.

    Parameters
    ----------
    KO_info : pandas.DataFrame
        Needs the columns 'KEGG_ID' and 'Pathway'; 'Pathway' is searched as text.
    annotation : pandas.DataFrame
        Functional annotation with the columns '#query' and 'KEGG_ko'.
    counts : pandas.DataFrame
        TPM values indexed by transcript name, one column per sample.
    eukprot_annotation : pandas.DataFrame
        Taxonomic annotation with a 'query_id' column and the `tax_level` column.
    pathways : iterable of str
        Pathway IDs to analyse.
    pathway_info : pandas.DataFrame
        Columns 'pathway_id', 'name' and 'class'.
    meta : pandas.DataFrame
        Sample metadata indexed by sample name (the names are used as column
        labels of `counts`); must contain the `group_by` column(s).
    group_by : str or list of str, optional
        Metadata column(s) that define the sample groups.
    tax_level : str, optional
        Column of `eukprot_annotation` used to group the transcripts.
    result_path : str, optional
        Path of the CSV file that is written.

    Returns
    -------
    None
        One row per (pathway, group) is written to `result_path`.
        'TPM_Fraction_of_annotated' is a percentage relative to the TPM of all
        taxonomically annotated transcripts of the pathway in that group.

    Raises
    ------
    KeyError
        If a selected transcript is missing from the index of `counts`.
    """
    results = []
    
    # Group the metadata: group key -> list of sample names
    grouped_meta = meta.groupby(group_by).apply(lambda x: x.index.tolist())

    # Look-up table for pathway name/class, built once instead of per result row.
    # The first row wins when a pathway ID occurs more than once.
    info_by_id = pathway_info.drop_duplicates('pathway_id').set_index('pathway_id')
    
    # Loop over the given pathways
    for pathway in tqdm(pathways, desc="Analyzing Pathways"):
        # Filter KO_info for rows where the Pathway column contains the current pathway
        ko_values = KO_info[KO_info['Pathway'].str.contains(pathway, na=False)]['KEGG_ID'].tolist()

        # Transcripts whose KEGG_ko entry mentions any of these KOs.
        # An empty KO list would give the empty regex '', which matches EVERY
        # row, so that case selects nothing. na=False treats rows without a KO
        # annotation as non-matching.
        if ko_values:
            ko_mask = annotation['KEGG_ko'].str.contains('|'.join(ko_values), na=False)
            transcript_names = annotation.loc[ko_mask, '#query'].tolist()
        else:
            transcript_names = []

        # Everything below depends on the pathway only, so compute it once
        # instead of once per group
        annotated_transcripts = eukprot_annotation[eukprot_annotation['query_id'].isin(transcript_names)]
        if pathway in info_by_id.index:
            pathway_name = info_by_id.at[pathway, 'name']
            pathway_class = info_by_id.at[pathway, 'class']
        else:
            pathway_name = pathway_class = 'N/A'
        
        # Loop over the groups
        for group, samples in grouped_meta.items():
            # Extract the relevant TPM count columns
            grouped_counts = counts[samples]
            # Calculate the total TPM sum of all transcripts involved in the pathway in the current group
            TPM_pathway_sum = grouped_counts.loc[transcript_names].sum().sum()

            # Get the TPM sum for every taxon of the given taxonomic level
            if annotated_transcripts.empty:
                tpm_by_tax = pd.Series(dtype=float)
            else:
                tpm_by_tax = annotated_transcripts.groupby(tax_level).apply(lambda x: grouped_counts.loc[x['query_id']].sum().sum())
            
            if not tpm_by_tax.empty:
                # Get the total TPM sum for all annotated transcripts
                total_tpm = tpm_by_tax.sum()
                # Identify the tax group that has the max total TPM value within each pathway for each group
                biggest_contributor = tpm_by_tax.idxmax()
                # Store the max value
                max_tpm = tpm_by_tax.max()
                # Calculate the fraction of the biggest contributor relative to all annotated transcripts
                tpm_fraction = (max_tpm / total_tpm) * 100 if total_tpm != 0 else 0
            else:
                biggest_contributor, max_tpm, tpm_fraction = 'N/A', 0, 0
            
            # Write to results
            results.append({
                'Group': group,
                'Pathway_ID': pathway,
                'Pathway_Name': pathway_name,
                'Pathway_Class': pathway_class,
                'TPM_pathway_sum': TPM_pathway_sum, 
                'Biggest_Contributor': biggest_contributor,
                'Max_TPM': max_tpm,
                'TPM_Fraction_of_annotated': tpm_fraction
            })
    
    result_df = pd.DataFrame(results)
    result_df.to_csv(result_path, index=False)

In [None]:
# Run the grouped (month x station) analysis once per taxonomic resolution:
# (taxonomic annotation table, taxonomy column, output file)
grouped_runs = [
    # deep taxonomic level
    (eukprot_annotation, 'Name_to_Use', '../../data/analysis/WGCNA/pathway_deep_grouped_analysis.csv'),
    # genus level
    (eukprot_annotation, 'Genus_UniEuk', '../../data/analysis/WGCNA/pathway_genus_grouped_analysis.csv'),
    # shallow taxonomic level (uses the 60plus annotation table)
    (eukprot_annotation_60, 'Taxogroup2_UniEuk', '../../data/analysis/WGCNA/pathway_shallow_grouped_analysis.csv'),
]

for tax_table, level, out_path in grouped_runs:
    grouped_pathway_analysis(KO_info, annotation, counts, tax_table, pathways, pathway_info, meta, group_by=['month', 'station'], tax_level=level, result_path=out_path)

### Plot results

In [None]:
# Load the grouped (month x station) results of the shallow taxonomic level
df = pd.read_csv('../../data/analysis/WGCNA/pathway_shallow_grouped_analysis.csv')

# 'Group' holds the text form of a (month, station) tuple; split it into 'Month' and 'Station'
df[['Month', 'Station']] = df['Group'].str.extract(r"\(\'(.+?)\', \'(.+?)\'\)")

# Display order of the heatmap columns (months) and rows (stations)
months_order = ['July_2020', 'August_2020', 'September_2020', 'November_2020', 'December_2020', 'January_2021', 'February_2021', 'April_2021', 'May_2021', 'June_2021', 'July_2021']
stations_order = ['ZG02', '120', '330', '130', '780', '700']

# Number of pathways to plot, ranked by their total TPM summed over all groups
N_TOP_PATHWAYS = 464
top_pathways = df.groupby('Pathway_ID')['TPM_pathway_sum'].sum().nlargest(N_TOP_PATHWAYS).sort_values(ascending=False).index

# Filter the DataFrame to keep only these pathways
df_top_pathways = df[df['Pathway_ID'].isin(top_pathways)]

# Define color mapping (taxon -> colour); taxa that are not listed stay blank in the heatmap
color_mapping = {
    "Diatomeae": "#E69F00",
    "core-Noctilucales": "#56B4E9",
    "Dinophyceae": "#56B4E9",
    "Prymnesiophyceae": "#009E73",
    "Spirotrichea": "#F0E442",
    "Arthropoda": "#0072B2",
    "Vertebrata": "#6600CC",
    "Oligohymenophorea": "#900101",
    "Ctenophora": "#8F4500",
    "Cnidaria": "#5668BD",
    "Florideophyceae": "#FF009D"
}

# Position of every taxon in the colour map, built once for all pathways
contributor_index = {taxon: position for position, taxon in enumerate(color_mapping)}

# Custom colormap: one discrete colour per taxon (identical for every pathway)
bounds = list(range(len(color_mapping) + 1))
cmap = mcolors.ListedColormap(list(color_mapping.values()))
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# Make sure the output folder exists before any figure is saved
output_dir = "../../figures/WGCNA/pathways_seasonal_spatial_taxonomic"
os.makedirs(output_dir, exist_ok=True)

# For each pathway, create a plot
for pathway in top_pathways:
    df_pathway = df_top_pathways[df_top_pathways['Pathway_ID'] == pathway].copy()  # Creating a copy of the DataFrame
    
    # Map the contributors to numeric values (NaN for taxa without a colour)
    df_pathway['Contributor_Numbers'] = df_pathway['Biggest_Contributor'].map(contributor_index)

    # The reindex calls below add every Station/Month combination that is
    # absent from the data as NaN, so no placeholder rows have to be appended.

    # Creating a pivot table for numeric contributor values
    contributor_pivot = df_pathway.pivot_table(index='Station', columns='Month', values='Contributor_Numbers', aggfunc='first')
    contributor_pivot = contributor_pivot.reindex(index=stations_order, columns=months_order)
    
    # Colour codes of the heatmap cells (the values of the pivot table)
    color_pivot = contributor_pivot.values
   
    # Creating a pivot table for the TPM fraction of the biggest contributor
    tpm_fraction_pivot = df_pathway.pivot_table(index='Station', columns='Month', values='TPM_Fraction_of_annotated', aggfunc='sum')
    tpm_fraction_pivot = tpm_fraction_pivot.reindex(index=stations_order, columns=months_order)

    # Create a pivot table for the TPM_sums
    tpm_sum_pivot = df_pathway.pivot_table(index='Station', columns='Month', values='TPM_pathway_sum', aggfunc='sum')
    tpm_sum_pivot = tpm_sum_pivot.reindex(index=stations_order, columns=months_order)
    
    # Creating the plot with marginal histograms
    fig, ax = plt.subplots(figsize=(12, 10))
    divider = make_axes_locatable(ax)
    ax_histx = divider.append_axes("top", 1.2, pad=0.1, sharex=ax)
    ax_histy = divider.append_axes("right", 1.2, pad=0.1, sharey=ax)
    
    # Displaying the heatmap with the custom colormap and normalization
    im = ax.imshow(color_pivot, aspect='auto', cmap=cmap, norm=norm)
    
    # Adding annotations depicting the proportion of tax. annotated expression that could be
    # assigned to the largest 'taxonomic contributor'; cells without data get no label
    for i in range(len(stations_order)):
        for j in range(len(months_order)):
            fraction = tpm_fraction_pivot.iloc[i, j]
            if pd.notna(fraction):
                ax.text(j, i, f'{fraction:.1f}', ha="center", va="center", color="w")
    
    # Setting titles and labels
    ax.set_xticks(np.arange(len(months_order)))
    ax.set_yticks(np.arange(len(stations_order)))
    ax.set_xticklabels(months_order, rotation=45, ha='right', rotation_mode='anchor')
    ax.set_yticklabels(stations_order)
    
    # Creating marginal histograms displaying the total TPM sum of transcripts annotated within this pathway
    ax_histx.bar(months_order, tpm_sum_pivot.sum(axis=0))
    ax_histy.barh(stations_order, tpm_sum_pivot.sum(axis=1))
    
    # Make some labels invisible
    ax_histx.xaxis.set_tick_params(labelbottom=False)
    ax_histy.yaxis.set_tick_params(labelleft=False)
    
    # Creating a colorbar with the custom ticks and labels
    cbar = plt.colorbar(im, ax=ax, ticks=np.arange(0.5, len(color_mapping), 1), boundaries=np.arange(0, len(color_mapping) + 1))
    cbar.set_ticklabels(list(color_mapping.keys()))
    
    # Setting the title at the top
    plt.title(f"Pathway: {df_pathway['Pathway_Name'].iloc[0]} ({pathway})", loc='right', y=1.25)
    
    # Saving the plot as an SVG file
    plt.savefig(f"{output_dir}/{pathway}.svg", format='svg')
    
    # Displaying the plot
    plt.show()

    # Release the figure; one figure per pathway would otherwise pile up in memory
    plt.close(fig)