In [None]:
"""
Sliding Window Peptide Library Generator

Purpose:
Generates overlapping peptide-mers of specified length from protein
or conserved region sequences for immunoinformatics analysis.

Use cases:
- HLA Class I epitope prediction (e.g., 9-mers)
- HLA Class II epitope prediction (e.g., 15-mers)
- Linear B-cell epitope screening

Developed during MSc Biotechnology thesis (2023–2025).
"""

In [4]:
from typing import List, Dict, Union, Optional
import os


def generate_peptides(sequence: str, length: int) -> List[str]:
    """
    Generate overlapping peptide fragments of a fixed length from a protein sequence.

    The sequence is cleaned first: every whitespace character (spaces, tabs,
    line breaks) is removed and the remaining residues are upper-cased. A
    window of ``length`` residues is then moved along the cleaned sequence
    one residue at a time.

    Args:
        sequence: The protein sequence as a string
        length: Length of peptides to generate (e.g., 9 or 15); must be >= 1

    Returns:
        List of peptide fragments ordered by start position. The list is
        empty when the cleaned sequence is shorter than ``length``.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        # A zero or negative window would yield empty or truncated fragments.
        raise ValueError(f"Peptide length must be a positive integer, got {length}")

    # str.split() without arguments splits on any run of whitespace, so
    # re-joining the pieces removes spaces, tabs and newlines alike.
    cleaned = "".join(sequence.split()).upper()

    return [
        cleaned[start:start + length]
        for start in range(len(cleaned) - length + 1)
    ]


def read_fasta(file_path: str) -> Dict[str, str]:
    """
    Read sequences from a FASTA file.

    Each line starting with '>' opens a new record; the text after the '>'
    (the whole header line, stripped of surrounding whitespace) is used as
    the record ID. All following non-header lines are concatenated into that
    record's sequence. Blank lines are skipped, and any lines that appear
    before the first header are ignored because they belong to no record.
    If the same ID occurs more than once, the last record wins.

    Args:
        file_path: Path to the FASTA file

    Returns:
        Dictionary with sequence IDs as keys and sequences as values
    """
    sequences: Dict[str, str] = {}
    current_id: Optional[str] = None
    current_seq: List[str] = []

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # Compare against None rather than relying on truthiness so a
                # record with an empty header (a bare '>') is not lost.
                if current_id is not None:
                    sequences[current_id] = ''.join(current_seq)
                current_id = line[1:]
                current_seq = []
            elif current_id is not None:
                current_seq.append(line)

    # Flush the final record, which has no following header to trigger a save.
    if current_id is not None:
        sequences[current_id] = ''.join(current_seq)

    return sequences


def process_sequence(sequence: str, peptide_length: int = 9) -> List[Dict[str, Union[str, int]]]:
    """
    Process a single protein sequence and generate peptides with position information.

    A window of ``peptide_length`` residues is moved along ``sequence`` one
    residue at a time. The sequence is used exactly as given; no whitespace
    removal or case conversion is applied here.

    Args:
        sequence: Protein sequence string
        peptide_length: Length of peptides to generate (default: 9); must be >= 1

    Returns:
        List of dictionaries, one per window, with the keys 'peptide' (the
        fragment) and 'position' (its 1-based start position). The list is
        empty when the sequence is shorter than ``peptide_length``.

    Raises:
        ValueError: If ``peptide_length`` is smaller than 1.
    """
    if peptide_length < 1:
        # A zero or negative window would yield empty or truncated fragments.
        raise ValueError(
            f"Peptide length must be a positive integer, got {peptide_length}"
        )

    return [
        {
            'peptide': sequence[start:start + peptide_length],
            'position': start + 1,  # 1-based position
        }
        for start in range(len(sequence) - peptide_length + 1)
    ]


def process_fasta(fasta_path: str, peptide_length: int = 9) -> Dict[str, List[Dict[str, Union[str, int]]]]:
    """
    Build a peptide library for every record in a FASTA file.

    The file is parsed with read_fasta and each resulting sequence is passed
    to process_sequence with the requested window size.

    Args:
        fasta_path: Path to the FASTA file
        peptide_length: Length of peptides to generate (default: 9)

    Returns:
        Dictionary mapping each sequence ID to the list of peptide
        dictionaries produced for that sequence
    """
    return {
        record_id: process_sequence(residues, peptide_length)
        for record_id, residues in read_fasta(fasta_path).items()
    }


def write_peptides_to_tsv(peptides: Union[List[Dict[str, Union[str, int]]], Dict[str, List[Dict[str, Union[str, int]]]]], 
                          output_path: str) -> None:
    """
    Save a peptide library as a tab-delimited text file.

    A list is written with the columns Peptide and Position. Anything else
    is treated as a mapping of sequence ID to peptide list and written with
    an additional leading Sequence_ID column. The header row is written even
    when there are no peptides.

    Args:
        peptides: List of peptide dictionaries or dictionary with sequence IDs and peptide lists
        output_path: Path to the output file (overwritten if it exists)
    """
    with open(output_path, 'w') as handle:
        if isinstance(peptides, list):
            # Single sequence: two columns
            handle.write("Peptide\tPosition\n")
            handle.writelines(
                f"{entry['peptide']}\t{entry['position']}\n" for entry in peptides
            )
            return

        # Several sequences (e.g. from a FASTA file): three columns
        handle.write("Sequence_ID\tPeptide\tPosition\n")
        for record_id, entries in peptides.items():
            handle.writelines(
                f"{record_id}\t{entry['peptide']}\t{entry['position']}\n"
                for entry in entries
            )


def create_peptides(input_data: Union[str, Dict[str, str]], 
                    peptide_length: int = 9, 
                    output_file: Optional[str] = None) -> Union[List[Dict[str, Union[str, int]]], Dict[str, List[Dict[str, Union[str, int]]]]]:
    """
    Entry point: build a peptide library from any supported input and optionally save it.

    Dispatch rules:
    - a dictionary is treated as {sequence ID: sequence} and every entry is processed;
    - a string that names an existing file is read as a FASTA file;
    - any other string is treated as a protein sequence. Note that this
      includes a file path that does not exist.

    Args:
        input_data: Either a protein sequence string, a path to a FASTA file, 
                   or a dictionary of sequences (with IDs as keys)
        peptide_length: Length of peptides to generate (default: 9)
        output_file: Path to save results as tab-delimited file (optional)

    Returns:
        List of peptide dictionaries for a single sequence, 
        or dictionary with IDs and peptide lists for multiple sequences

    Raises:
        TypeError: If input_data is neither a string nor a dictionary
    """
    if isinstance(input_data, dict):
        # Mapping of IDs to sequences
        results = {
            record_id: process_sequence(residues, peptide_length)
            for record_id, residues in input_data.items()
        }
    elif isinstance(input_data, str):
        # An existing file is parsed as FASTA; everything else is a raw sequence
        handler = process_fasta if os.path.isfile(input_data) else process_sequence
        results = handler(input_data, peptide_length)
    else:
        raise TypeError("Input must be a string (sequence or file path) or a dictionary of sequences")

    # Persist as tab-delimited text only when a destination was given
    if output_file:
        write_peptides_to_tsv(results, output_file)

    return results


# Example usage
if __name__ == "__main__":
    # Build a 9-mer and a 15-mer library from the same demo sequence and
    # save each one to its own tab-delimited file.
    # NOTE: the output names end in ".fasta" although the content written
    # is tab-delimited text, not FASTA.
    sequence = "DLAENTQASSTSFQTKSSEVNLRGLCLIP"
    for mer_length in (9, 15):
        output = f"{mer_length}mer_Peptides_example.fasta"
        peptides = create_peptides(sequence, peptide_length=mer_length, output_file=output)
        print(f"Generated {len(peptides)} {mer_length}-mer peptides and saved to {output}")

Generated 21 9-mer peptides and saved to 9mer_Peptides_example.fasta
Generated 15 15-mer peptides and saved to 15mer_Peptides_example.fasta
