# Description of Metabolic Pathways
In this notebook, we analyze all KEGG pathways found in the metatranscriptome. We combine them with taxonomic information to quantify the contribution of different taxonomic groups to each specific pathway.
First, we'll need to gather the data needed to do this analysis.

In [2]:
import pandas as pd
from tqdm import tqdm

## Data

In [1]:
# KOs and their associated pathways that have been included in the WGCNA analysis
KO_info = pd.read_csv("../../data/analysis/WGCNA/ko_pathway_info.csv")

# The transcript functional annotation data (tab-separated)
annotation = pd.read_csv('../../data/annotation/functional_eggnog/functional_annotation.emapper.annotations', sep = '\t', engine = 'pyarrow')

## Fix transcript names in the first column so that they equal the transcript identifiers in the count files
## This is necessary because TransDecoder adds .p2 or .p1 to the sequence identifiers
## Keep only the text before the first "."; unlike split(expand=True).drop(columns=1), this also
## works when an identifier has no "." at all or more than one "."
annotation.iloc[:, 0] = annotation.iloc[:, 0].str.split(".", n=1).str[0]

# The transcript counts (TPM table, first column used as the index)
counts = pd.read_csv('../../data/kallisto/tpm.csv', sep=',', index_col=0, engine='pyarrow')

## Taxonomic information for the EukProt data sets (read once and shared by both hit tables)
eukprot_taxonomy = pd.read_table('../../data/annotation/taxonomy_eukprot/EukProt_included_data_sets.v03.2021_11_22.txt')

## Drop the columns that are not needed
eukprot_taxonomy = eukprot_taxonomy.drop(columns=['Previous_Names', 'Replaces_EukProt_ID', 'Data_Source_URL', 'Data_Source_Name', 'Paper_DOI', 'Actions_Prior_to_Use',
       'Data_Source_Type', 'Notes', 'Columns_Modified_Since_Previous_Version', 'Merged_Strains',
       'Alternative_Strain_Names', '18S_Sequence_GenBank_ID', '18S_Sequence',
       '18S_Sequence_Source', '18S_Sequence_Other_Strain_GenBank_ID',
       '18S_Sequence_Other_Strain_Name', '18S_and_Taxonomy_Notes'])

## Swap the _ to a space in the Name_to_Use column
eukprot_taxonomy['Name_to_Use'] = eukprot_taxonomy['Name_to_Use'].str.replace('_', ' ')


def load_eukprot_hits(path, taxonomy):
    """Read a headerless EukProt .m8 hit table and attach the taxonomy of each hit.

    Parameters
    ----------
    path : str
        Path to a tab-separated hit table without a header row and with twelve columns.
    taxonomy : pandas.DataFrame
        Taxonomy table with an 'EukProt_ID' column; it is left-joined onto the hits.

    Returns
    -------
    pandas.DataFrame
        One row per hit with 'query_id', 'p_ident', 'alnlen', 'mismatch' and all
        columns of `taxonomy`. Hits whose EukProt ID is not in `taxonomy` are kept
        with missing taxonomy values (left join).
    """
    hits = pd.read_table(path, header=None)
    hits.columns = ['query_id', 'target_id', 'p_ident', 'alnlen', 'mismatch', 'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']

    ## Fix transcript names so that they equal the transcript identifiers in the count files
    ## (keep only the text before the first ".")
    hits['query_id'] = hits['query_id'].str.split(".", n=1).str[0]

    ## Split the EukProt ID off the target name (the text before the first "_")
    hits['target_id'] = hits['target_id'].str.split("_", n=1).str[0]

    ## Merge the annotation and taxonomy files
    hits = hits.merge(taxonomy, left_on='target_id', right_on='EukProt_ID', how='left')

    ## Drop the columns that are not needed
    return hits.drop(columns=['target_id', 'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits'])


# The EukProt taxonomic annotation data
eukprot_annotation = load_eukprot_hits('../../data/annotation/taxonomy_eukprot/eukprot_DB.firsthit.90plus_alnscore.m8', eukprot_taxonomy)

# The shallow EukProt taxonomic annotation data
eukprot_annotation_60 = load_eukprot_hits('../../data/annotation/taxonomy_eukprot/eukprot_DB.firsthit.60plus_alnscore.m8', eukprot_taxonomy)

## Load the sample metadata (semicolon-separated, first column used as the index)
meta = pd.read_csv(
    '../../samples.csv',
    sep=';',
    index_col=0,
)

## Month labels in chronological order (only defined here; not applied to `meta` in this cell)
month_order = [
    "July_2020",
    "August_2020",
    "September_2020",
    "November_2020",
    "December_2020",
    "January_2021",
    "February_2021",
    "April_2021",
    "May_2021",
    "June_2021",
    "July_2021",
]

# Collect every distinct quoted 'map<digits>' identifier found in the Pathway column of KO_info
pathways = (
    KO_info['Pathway']
    .str.extractall(r"'(map\d+)'")[0]
    .unique()
)

# Read the pathway info from the csv file (first column used as the index)
pathway_info = pd.read_csv(
    '../../data/analysis/WGCNA/pathway_info.csv',
    index_col=0,
)

From these datasets, I want to create two dataframes:
1. A dataframe containing the pathways, the samples, and the total expression (summed TPM) of each pathway in each sample.
2. A dataframe containing the pathways, the taxonomic annotation (Taxogroup2_UniEuk, Genus_UniEuk, Name_to_Use), the sampling date, the station, and the total expression (summed TPM) of each pathway per taxonomic group in each sample.

In [20]:
def pathway_expression(pathways, pathway_info, KO_info, annotation, counts, meta, result_path='../../data/analysis/WGCNA/pathway_TPM_sum.csv'):
    """Sum the TPM of all transcripts assigned to each pathway, per sample, and save it as CSV.

    Parameters
    ----------
    pathways : iterable of str
        Pathway identifiers. Each one is matched as a pattern against KO_info['Pathway'].
    pathway_info : pandas.DataFrame
        Needs the columns 'pathway_id', 'name' and 'class'. The first row per
        'pathway_id' provides the name and class; unknown pathways get 'N/A'.
    KO_info : pandas.DataFrame
        Needs the columns 'Pathway' and 'KEGG_ID'.
    annotation : pandas.DataFrame
        Needs the columns 'KEGG_ko' and '#query' (the transcript identifier).
    counts : pandas.DataFrame
        Expression table indexed by transcript identifier with one column per sample.
    meta : pandas.DataFrame
        Its index lists the samples (column names of `counts`) to report.
    result_path : str
        Where the result is written as CSV (without the index).

    Returns
    -------
    pandas.DataFrame
        One row per (pathway, sample) with the columns 'Pathway_ID', 'Pathway_Name',
        'Pathway_Class', 'Sample' and 'TPM'. The same table is written to `result_path`.

    Raises
    ------
    KeyError
        If a matched transcript is missing from `counts.index` or a sample is
        missing from `counts.columns`.

    Notes
    -----
    Every transcript counts once per pathway, even if it appears in several
    `annotation` rows. A pathway without any KO in `KO_info` gets a TPM of 0
    (an empty pattern would otherwise match every annotated transcript).
    """
    samples = list(meta.index)

    # First pathway_info row per pathway_id, so the name/class lookup is a plain index access
    labels = pathway_info.drop_duplicates(subset='pathway_id').set_index('pathway_id')

    results = []

    for pathway in tqdm(pathways, desc="Analyzing Pathways"):
        # KOs whose Pathway entry mentions the current pathway; na=False skips KOs without pathway info
        in_pathway = KO_info['Pathway'].str.contains(pathway, na=False)
        ko_values = KO_info.loc[in_pathway, 'KEGG_ID'].tolist()

        if ko_values:
            # Transcripts whose KEGG_ko entry mentions any of these KOs; na=False skips missing annotations
            has_ko = annotation['KEGG_ko'].str.contains('|'.join(ko_values), na=False)
            transcript_names = annotation.loc[has_ko, '#query'].drop_duplicates().tolist()
        else:
            transcript_names = []

        # Extract Pathway_Name and Pathway_Class from pathway_info
        if pathway in labels.index:
            pathway_name = labels.at[pathway, 'name']
            pathway_class = labels.at[pathway, 'class']
        else:
            pathway_name = 'N/A'
            pathway_class = 'N/A'

        # Select the pathway's transcripts once and sum every sample column in one pass
        totals = counts.loc[transcript_names, samples].sum()

        for sample, tpm in totals.items():
            results.append({
                'Pathway_ID': pathway,
                'Pathway_Name': pathway_name,
                'Pathway_Class': pathway_class,
                'Sample': sample,
                'TPM': tpm
                })

    # Create DataFrame and save as CSV
    result_df = pd.DataFrame(results)
    result_df.to_csv(result_path, index=False)
    return result_df

In [21]:
# Compute the per-pathway, per-sample TPM sums and write them to the default CSV path
pathway_expression(
    pathways=pathways,
    pathway_info=pathway_info,
    KO_info=KO_info,
    annotation=annotation,
    counts=counts,
    meta=meta,
)

Analyzing Pathways: 100%|██████████| 464/464 [12:04<00:00,  1.56s/it] 


In [30]:
def pathway_taxonomic_expression(pathways, pathway_info, KO_info, annotation, eukprot_annotation, counts, meta, result_path='../../data/analysis/WGCNA/pathway_tax_TPM_sum.csv'):
    """Sum the TPM of each pathway per taxonomic group and sample, and save it as CSV.

    Parameters
    ----------
    pathways : iterable of str
        Pathway identifiers. Each one is matched as a pattern against KO_info['Pathway'].
    pathway_info : pandas.DataFrame
        Needs the columns 'pathway_id', 'name' and 'class'. The first row per
        'pathway_id' provides the name and class; unknown pathways get 'N/A'.
    KO_info : pandas.DataFrame
        Needs the columns 'Pathway' and 'KEGG_ID'.
    annotation : pandas.DataFrame
        Needs the columns 'KEGG_ko' and '#query' (the transcript identifier).
        The caller's frame is not modified.
    eukprot_annotation : pandas.DataFrame
        Needs the columns 'query_id' and 'Taxogroup2_UniEuk'; left-joined onto
        `annotation` via '#query' == 'query_id'.
    counts : pandas.DataFrame
        Expression table indexed by transcript identifier with one column per sample.
    meta : pandas.DataFrame
        Its index lists the samples (column names of `counts`) to report.
    result_path : str
        Where the result is written as CSV (without the index).

    Returns
    -------
    pandas.DataFrame
        One row per (pathway, sample, taxonomic group) with the columns 'Pathway_ID',
        'Pathway_Name', 'Pathway_Class', 'Sample', 'Taxogroup2_UniEuk' and 'TPM',
        ordered by pathway, then sample, then sorted group name. The same table is
        written to `result_path`.

    Raises
    ------
    KeyError
        If a matched transcript is missing from `counts.index` or a sample is
        missing from `counts.columns`.

    Notes
    -----
    Transcripts without a 'Taxogroup2_UniEuk' value are left out, so the per-group
    sums of a pathway need not add up to its total expression. Each
    (transcript, group) pair counts once per pathway, even when the merge or the
    annotation contains the transcript several times. A pathway without any KO in
    `KO_info` produces no rows.
    """
    # Merging annotation with eukprot_annotation to get the Taxogroup2_UniEuk information
    annotation = annotation.merge(eukprot_annotation[['query_id', 'Taxogroup2_UniEuk']], left_on='#query', right_on='query_id', how='left')

    samples = list(meta.index)

    # First pathway_info row per pathway_id, so the name/class lookup is a plain index access
    labels = pathway_info.drop_duplicates(subset='pathway_id').set_index('pathway_id')

    results = []

    for pathway in tqdm(pathways, desc="Analyzing Pathways"):
        # KOs whose Pathway entry mentions the current pathway; na=False skips KOs without pathway info
        in_pathway = KO_info['Pathway'].str.contains(pathway, na=False)
        ko_values = KO_info.loc[in_pathway, 'KEGG_ID'].tolist()
        if not ko_values:
            # An empty pattern would match every annotated transcript
            continue

        has_ko = annotation['KEGG_ko'].str.contains('|'.join(ko_values), na=False)
        transcript_names = (
            annotation.loc[has_ko, ['#query', 'Taxogroup2_UniEuk']]
            .dropna(subset=['Taxogroup2_UniEuk'])   # groups need a taxon label
            .drop_duplicates()                      # count each (transcript, taxon) pair once
        )
        if transcript_names.empty:
            continue

        if pathway in labels.index:
            pathway_name = labels.at[pathway, 'name']
            pathway_class = labels.at[pathway, 'class']
        else:
            pathway_name = 'N/A'
            pathway_class = 'N/A'

        # One row per matched transcript, one column per sample; rows are in the same
        # order as transcript_names, so the taxon labels can be used positionally
        tpm_data = counts.loc[transcript_names['#query'].tolist(), samples]
        tpm_sum = tpm_data.groupby(transcript_names['Taxogroup2_UniEuk'].to_numpy()).sum()

        for position, sample in enumerate(samples):
            for taxon, tpm in tpm_sum.iloc[:, position].items():
                results.append({
                    'Pathway_ID': pathway,
                    'Pathway_Name': pathway_name,
                    'Pathway_Class': pathway_class,
                    'Sample': sample,
                    'Taxogroup2_UniEuk': taxon,
                    'TPM': tpm
                })

    # Create DataFrame and save as CSV
    result_df = pd.DataFrame(results)
    result_df.to_csv(result_path, index=False)
    return result_df


In [31]:
# Compute the per-pathway, per-sample, per-taxon TPM sums and write them to the default CSV path
pathway_taxonomic_expression(
    pathways=pathways,
    pathway_info=pathway_info,
    KO_info=KO_info,
    annotation=annotation,
    eukprot_annotation=eukprot_annotation,
    counts=counts,
    meta=meta,
)

Analyzing Pathways: 100%|██████████| 464/464 [12:24<00:00,  1.60s/it] 
