# Sequence Analysis and Functional Prediction Pipeline
# 1. Introduction
This notebook analyzes the functional and sequence relationships between newly identified bacteria and known corrosion-influencing microorganisms. The analysis builds upon previous findings where:
Statistical significance was established between the selected bacteria and corrosion risk (Notebook 3).Most of the bacteria have also been previously reported as influencing corrosion as seeing in the literature search notebook 4. Also, the evolutionary relationhship of the candidates to be assigned as MIC has been mapped through phylogenetic analyis on notebok 5.   
The study focuses on bacteria from operational heating and cooling water systems, primarily in Germany. Using 16S rRNA data (bootstrap-validated from Notebook 5), this analysis employs PICRUSt2 to:
 Predict metabolic functions from 16S sequences. Focuses on pathways relevant to corrosion such as sulfur and iron metabolism. Ultimately it compares functional profiles between the known corrosion-causing bacteria on the selected list (validated through literature) and the newly identified candidates showing statistical correlation with corrosion. This functional comparison aims to validate whether statistical correlations reflect genuine metabolic capabilities associated with corrosion processes.

# 2. Loading and Preparing the Data

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO, Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import subprocess
import os
from pathlib import Path
import ast
from io import StringIO
import openpyxl

ModuleNotFoundError: No module named 'openpyxl'

Following is the structure of the notebook data named data_picrus  
data_tree  
 ├── sequences/  
 │   ├── known.fasta : sequences of known corrosion-causing bacteria  
 │   ├── candidate.fasta : sequences of potential new corrosion-causing bacteria  
 |   └── other files  
 data_picrus  
 └── picrust_results/  
      ├── known_bacteria/  
      |               ├── EC_predictions/       : enzyme predictions  
      |               ├── pathway_predictions/  : metabolic pathway abundance  
      |               ├── KO_predictions/       : KEGG ortholog predictions  
      |               └── other_picrust_files/  
      ├── candidate_bacteria/  
      |               ├── EC_predictions/       : enzyme predictions  
      |               ├── pathway_predictions/  : metabolic pathway abundance  
      |               ├── KO_predictions/       : KEGG ortholog predictions  
      |               └── other_picrust_files/  : final comparison summary  
      │      
      └── functional_comparison.xlsx  

In [2]:
# For VSCode
base_dir = Path("/home/beatriz/MIC/2_Micro/data_picrus")
known_dir = base_dir / "known_bacteria"
candidate_dir = base_dir / "candidate_bacteria"
results_file = base_dir / "functional_comparison.xlsx" 

# Create output directory
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
    

In [None]:

# Path Configuration
base_dir = Path("/home/beatriz/MIC/2_Micro/data_picrus")
known_dir = base_dir / "known_bacteria"
candidate_dir = base_dir / "candidate_bacteria"
results_file = base_dir / "functional_comparison.xlsx"
aligned_file = Path("/home/beatriz/MIC/2_Micro/data_tree/aligned_sequences.fasta")

def parse_fasta_and_merge(aligned_file, abundance_excel, known_bacteria, base_dir, known_dir, candidate_dir):
    """
    Parse FASTA file and merge with abundance data from Excel using specified directory structure
    """
    # Create output directories if they don't exist
    known_dir.mkdir(parents=True, exist_ok=True)
    candidate_dir.mkdir(parents=True, exist_ok=True)
    
    # Read abundance data
    abundance_df = pd.read_excel(abundance_excel)
    
    # Parse FASTA and create lists for known and candidate bacteria
    known_seqs = []
    candidate_seqs = []
    sequence_info = {}
    
    for record in SeqIO.parse(aligned_file, "fasta"):
        genus = record.description.split()[0]
        sequence_info[genus] = {
            'sequence': str(record.seq),
            'accession': record.description.split()[1].split(':')[1]
        }
        
        if genus in known_bacteria:
            known_seqs.append(record)
        else:
            candidate_seqs.append(record)
    
    # Create separate abundance DataFrames
    known_abundance = abundance_df[abundance_df['Genus'].isin(known_bacteria)]
    candidate_abundance = abundance_df[~abundance_df['Genus'].isin(known_bacteria)]
    
    # Write FASTA files to respective directories
    SeqIO.write(known_seqs, known_dir / "known.fasta", "fasta")
    SeqIO.write(candidate_seqs, candidate_dir / "candidate.fasta", "fasta")
    
    # Get site columns
    site_cols = [col for col in abundance_df.columns if col.startswith('site_')]
    
    def save_abundance_file(abundance_data, output_path):
        matched_abundance = abundance_data[abundance_data['Genus'].isin(sequence_info.keys())]
        matched_abundance['Accession'] = matched_abundance['Genus'].map(
            lambda x: sequence_info[x]['accession'] if x in sequence_info else None
        )
        matched_abundance[['Genus', 'Accession'] + site_cols].to_csv(
            output_path / "abundance.csv", index=False
        )
        return matched_abundance
    
    # Save abundance files
    known_matched = save_abundance_file(known_abundance, known_dir)
    candidate_matched = save_abundance_file(candidate_abundance, candidate_dir)
    
    # Save summary to functional_comparison.xlsx
    with pd.ExcelWriter(results_file) as writer:
        pd.DataFrame({
            'Category': ['Known', 'Candidate'],
            'Sequences': [len(known_seqs), len(candidate_seqs)],
            'With Abundance': [len(known_matched), len(candidate_matched)]
        }).to_excel(writer, sheet_name='Summary', index=False)
        
        # Add matching details
        matching_data = []
        for genus in abundance_df['Genus'].unique():
            matching_data.append({
                'Genus': genus,
                'Category': 'Known' if genus in known_bacteria else 'Candidate',
                'Has Sequence': genus in sequence_info,
                'Accession': sequence_info.get(genus, {}).get('accession', 'Not Found')
            })
        pd.DataFrame(matching_data).to_excel(writer, sheet_name='Matching_Details', index=False)
    
    print("\nSummary:")
    print(f"Known bacteria with sequences: {len(known_seqs)}")
    print(f"Candidate bacteria with sequences: {len(candidate_seqs)}")
    print(f"Results saved to: {results_file}")
    
    return {
        'known_sequences': known_seqs,
        'candidate_sequences': candidate_seqs,
        'known_abundance': known_matched,
        'candidate_abundance': candidate_matched
    }

# Define known corrosive bacteria
known_bacteria = [
    'Aquamicrobium', 'Azospira', 'Brachybacterium', 'Brevibacterium',
    'Cellulosimicrobium', 'Clavibacter', 'Clostridium', 'Cohnella',
    'Corynebacterium', 'Enterococcus', 'Halomonas', 'Legionella',
    'Methyloversatilis', 'Mycobacterium', 'Neisseria', 'Novosphingobium',
    'Opitutus', 'Paracoccus', 'Prevotella', 'Psb-m-3', 'Pseudarthrobacter',
    'Pseudoalteromonas', 'Roseateles', 'Streptococcus', 'Thiobacillus'
]

if __name__ == "__main__":
    # Run the preparation with your paths
    results = parse_fasta_and_merge(
        aligned_file=aligned_file,
        abundance_excel=base_dir / "selected_to_note.xlsx",
        known_bacteria=known_bacteria,
        base_dir=base_dir,
        known_dir=known_dir,
        candidate_dir=candidate_dir
    )

In [4]:
# Calling the function
if __name__ == "__main__":
    # Define known corrosive bacteria
    known_bacteria = [
        'Aquamicrobium', 'Azospira', 'Brachybacterium', 'Brevibacterium',
        'Cellulosimicrobium', 'Clavibacter', 'Clostridium', 'Cohnella',
        'Corynebacterium', 'Enterococcus', 'Halomonas', 'Legionella',
        'Methyloversatilis', 'Mycobacterium', 'Neisseria', 'Novosphingobium',
        'Opitutus', 'Paracoccus', 'Prevotella', 'Psb-m-3', 'Pseudarthrobacter',
        'Pseudoalteromonas', 'Roseateles', 'Streptococcus', 'Thiobacillus'
    ]
    
    # File paths
    fasta_path = "/home/beatriz/MIC/2_Micro/data_tree/aligned_sequences.fasta"
    abundance_path = "/home/beatriz/MIC/2_Micro/data_tree/selected_to_note.xlsx"
    output_dir = "data_picrust"
    
    # Run the preparation
    results = parse_fasta_and_merge(fasta_path, abundance_path, known_bacteria, output_dir)

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

# 3. Integrating data from Sequences and Abundances 

In [None]:
def parse_fasta_file(fasta_path):
    """
    Parse FASTA file to extract bacteria name and accession
    Returns DataFrame with accession numbers and sequences
    """
    sequences_data = []
    
    with open(fasta_path, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            # Split the description to get accession
            desc_parts = record.description.split()
            accession = desc_parts[1].split(':')[1]
            
            sequences_data.append({
                'Accession': accession,
                'Sequence': str(record.seq)
            })
    
    return pd.DataFrame(sequences_data)

def process_abundance_data(abundance_df):
    """
    Process the abundance DataFrame to prepare for merging
    """
    # Make a copy to avoid modifying the original
    df = abundance_df.copy()
    
    # Convert the IDs column from string representation of list to actual list
    df['AccessionList'] = df['IDs'].apply(ast.literal_eval)
    
    # Explode the DataFrame so each accession gets its own row
    df = df.explode('AccessionList')
    
    # Rename the exploded column to match with sequence data
    df = df.rename(columns={'AccessionList': 'Accession'})
    
    return df

def merge_data_for_picrust(fasta_path, abundance_path, output_fasta_path, output_abundance_path):
    """
    Merge sequence data with abundance data and prepare files for PICRUSt2
    """
    # Read and parse files
    seq_df = parse_fasta_file(fasta_path)
    abundance_df = pd.read_csv(abundance_path)
    
    # Process abundance data
    processed_abundance = process_abundance_data(abundance_df)
    
    # Merge sequence data with abundance data
    merged_data = pd.merge(seq_df, processed_abundance, on='Accession', how='inner')
    
    # Write new FASTA file
    with open(output_fasta_path, 'w') as f:
        for _, row in merged_data.iterrows():
            # Using genus name and accession in FASTA header
            f.write(f">{row['Genus']}|{row['Accession']}\n")
            f.write(f"{row['Sequence']}\n")
    
    # Prepare abundance table for PICRUSt2
    # Get site columns
    site_cols = [col for col in merged_data.columns if col.startswith('site_')]
    
    # Create abundance table with necessary columns
    abundance_output = merged_data[['Genus', 'Accession'] + site_cols]
    
    # If multiple sequences exist for same genus, sum their abundances
    abundance_output = abundance_output.groupby(['Genus', 'Accession'])[site_cols].sum().reset_index()
    
    # Save abundance table
    abundance_output.to_csv(output_abundance_path, index=False)
    
    return merged_data


In [None]:
# Callin the function to merge the sequences with the abundance genera
if __name__ == "__main__":
    # Set your file paths
    fasta_input = "aligned_sequences.fasta"
    abundance_input = "taxa_accession.csv"
    output_fasta = "picrust_input_sequences.fasta"
    output_abundance = "picrust_abundance_table.csv"
    
    # Run the integration
    result = merge_data_for_picrust(
        fasta_input,
        abundance_input,
        output_fasta,
        output_abundance
    )
    
    print(f"Successfully processed {len(result)} sequences with corresponding abundance data")
    
    # Print some summary statistics
    print("\nSummary:")
    print(f"Number of unique genera: {result['Genus'].nunique()}")
    print(f"Number of unique accessions: {result['Accession'].nunique()}")
    print(f"Total number of sites: {len([col for col in result.columns if col.startswith('site_')])}")

In [3]:
# Read aligned sequences
aligned_file = Path("/home/beatriz/MIC/2_Micro/data_tree/aligned_sequences.fasta")

# Define known corrosive bacteria
known_bacteria = ['Aquamicrobium',' Azospira', 'Brachybacterium', 'Brevibacterium', 'Cellulosimicrobium', 'Clavibacter',
                   'Clostridium', 'Cohnella', 'Corynebacterium', 'Enterococcus', 'Halomonas', 'Legionella', 'Methyloversatilis',
                     'Mycobacterium', 'Neisseria', 'Novosphingobium', 'Opitutus', 'Paracoccus', 'Prevotella','Psb-m-3', 'Pseudarthrobacter',
                        'Pseudoalteromonas', 'Roseateles', 'Streptococcus', 'Thiobacillus']

# Split sequences
known_seqs = []
candidate_seqs = []

for record in SeqIO.parse(aligned_file, "fasta"):
    if record.id in known_bacteria:
        known_seqs.append(record)
    else:
        candidate_seqs.append(record)

# Save split files
SeqIO.write(known_seqs, "data_picrus/known.fasta", "fasta")
SeqIO.write(candidate_seqs, "data_picrus/candidate.fasta", "fasta")

6

In [4]:
aligned_file

PosixPath('/home/beatriz/MIC/2_Micro/data_tree/aligned_sequences.fasta')

In [5]:
as

SyntaxError: invalid syntax (1239779345.py, line 1)

In [4]:
def prepare_sequences_for_picrust(sequences, output_dir):
    """
    Prepare sequences for PICRUSt2 analysis
    
    Parameters:
    sequences: list of SeqRecord objects or path to FASTA file
    output_dir: directory to save prepared files
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Save sequences in FASTA format if they're not already in a file
    if isinstance(sequences, list):
        output_fasta = os.path.join(output_dir, 'sequences.fasta')
        SeqIO.write(sequences, output_fasta, 'fasta')
    else:
        output_fasta = sequences
    
    return output_fasta

def run_picrust2_pipeline(fasta_file, output_dir):
    """
    Run PICRUSt2 analysis pipeline
    
    Parameters:
    fasta_file: path to input FASTA file
    output_dir: directory for PICRUSt2 output
    """
    try:
        # Run PICRUSt2 pipeline
        cmd = [
            'picrust2_pipeline.py',
            '-s', fasta_file,
            '-o', output_dir,
            '--processes', '1',  # Adjust based on available CPU
            '--verbose'
        ]
        subprocess.run(cmd, check=True)
        
        # Add pathway descriptions
        pathway_file = os.path.join(output_dir, 'pathways_out/path_abun_unstrat.tsv.gz')
        if os.path.exists(pathway_file):
            cmd_desc = [
                'add_descriptions.py',
                '-i', pathway_file,
                '-m', 'PATHWAY',
                '-o', os.path.join(output_dir, 'pathways_with_descriptions.tsv')
            ]
            subprocess.run(cmd_desc, check=True)
            
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error running PICRUSt2: {e}")
        return False

def analyze_functional_profiles(picrust_output_dir, known_corrosive_bacteria):
    """
    Analyze functional profiles to compare with known corrosive bacteria
    
    Parameters:
    picrust_output_dir: directory containing PICRUSt2 output
    known_corrosive_bacteria: list of known corrosive bacteria names
    """
    # Read PICRUSt2 output
    pathway_file = os.path.join(picrust_output_dir, 'pathways_with_descriptions.tsv')
    pathways_df = pd.read_csv(pathway_file, sep='\t')
    
    # Focus on relevant pathways
    relevant_pathways = [
        'Sulfur metabolism',
        'Iron metabolism',
        'Energy metabolism',
        'Biofilm formation',
        'Metal transport'
    ]
    
    # Filter and analyze pathways
    filtered_pathways = pathways_df[pathways_df['description'].str.contains('|'.join(relevant_pathways), 
                                                                                case=False, na=False)]
    # Compare profiles between known and candidate bacteria
    comparison_results = {
        'pathway_similarities': {},
        'functional_predictions': {},
        'correlation_scores': {}
    }
    
    return filtered_pathways, comparison_results

def main_analysis_pipeline(input_sequences, output_dir, known_corrosive_bacteria):
    """
    Main pipeline for functional analysis
    """
    # Prepare sequences
    fasta_file = prepare_sequences_for_picrust(input_sequences, output_dir)
    
    # Run PICRUSt2
    success = run_picrust2_pipeline(fasta_file, output_dir)
    if not success:
        return None
    
    # Analyze results
    pathways, results = analyze_functional_profiles(output_dir, known_corrosive_bacteria)
    
    # Save results
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M')
    results_file = os.path.join(output_dir, f'functional_analysis_{timestamp}.xlsx')
    
    with pd.ExcelWriter(results_file) as writer:
        pathways.to_excel(writer, sheet_name='Pathway_Analysis', index=False)
        pd.DataFrame(results['pathway_similarities']).to_excel(writer, sheet_name='Similarities')
        pd.DataFrame(results['functional_predictions']).to_excel(writer, sheet_name='Predictions')
    
    return results_file

In [None]:
'hydrogenase',
'[NiFe]-hydrogenase', 
'[FeFe]-hydrogenase',
'hydA', 'hybA', 'hyaA'  # Common hydrogenase genes

'ochre formation',
'iron oxide deposits',
'iron precipitation',
'rust formation'

'organic acid production',
'acetate production',
'lactate metabolism',
'formate production',
'pyruvate metabolism',
'citrate production'

search_terms.extend([
    f"{bacteria_name}[Organism] AND (hydrogenase OR hydA OR hybA OR hyaA)",
    f"{bacteria_name}[Organism] AND (ochre formation OR iron oxide deposits OR rust formation)",
    f"{bacteria_name}[Organism] AND (organic acid production OR acetate OR lactate OR formate) AND (corrosion OR metal)"
])

3. Calling the Function

In [5]:
input_seqs = "/home/beatriz/MIC/2_Micro/data_tree/aligned_sequences.fasta"
output_directory = "/home/beatriz/MIC/2_Micro/data_picrus"

results = main_analysis_pipeline(input_seqs, output_directory, known_bacteria)

usage: picrust2_pipeline.py [-h] -s PATH -i PATH -o PATH [-p PROCESSES]
                            [-t epa-ng|sepp] [-r PATH] [--in_traits IN_TRAITS]
                            [--custom_trait_tables PATH]
                            [--marker_gene_table PATH] [--pathway_map MAP]
                            [--reaction_func MAP] [--no_pathways]
                            [--regroup_map ID_MAP] [--no_regroup]
                            [--stratified] [--max_nsti FLOAT]
                            [--min_reads INT] [--min_samples INT]
                            [-m {mp,emp_prob,pic,scp,subtree_average}]
                            [-e EDGE_EXPONENT] [--min_align MIN_ALIGN]
                            [--skip_nsti] [--skip_minpath] [--no_gap_fill]
                            [--coverage] [--per_sequence_contrib]
                            [--wide_table] [--skip_norm]
                            [--remove_intermediate] [--verbose] [-v]
picrust2_pipeline.py: error: the following argum

Error running PICRUSt2: Command '['picrust2_pipeline.py', '-s', '/home/beatriz/MIC/2_Micro/data_tree/aligned_sequences.fasta', '-o', '/home/beatriz/MIC/2_Micro/data_picrus', '--processes', '1', '--verbose']' returned non-zero exit status 2.
