In [None]:
"""
Sliding Window Peptide Library Generator

Purpose:
Generates overlapping peptide-mers of specified length from protein
or conserved region sequences for immunoinformatics analysis.

Use cases:
- HLA Class I epitope prediction (e.g., 9-mers)
- HLA Class II epitope prediction (e.g., 15-mers)
- Linear B-cell epitope screening

Developed during MSc Biotechnology thesis (2023–2025).
"""

In [1]:
def find_peptide_position(reference_sequence, query_sequences):
    """
    Find the start and end positions of multiple query peptide sequences within a reference sequence.

    The reference and every query are normalised before matching: all
    whitespace (spaces, tabs, newlines) is removed and letters are
    upper-cased. Only the first occurrence of each query is reported.

    Args:
        reference_sequence (str): The full reference peptide sequence
        query_sequences (list): List of peptide sequences to find

    Returns:
        dict: Dictionary with query sequences (exactly as supplied) as keys and
              (start_position, end_position) as values. Positions are 0-indexed
              offsets into the whitespace-stripped reference and the end is
              exclusive, so cleaned_reference[start:end] is the match. Add 1 to
              the start to obtain 1-indexed, end-inclusive coordinates.
              If a sequence is not found, or is empty after cleaning, its value
              will be (None, None).
    """
    # str.split() with no separator splits on every run of whitespace, so
    # joining the pieces strips tabs and line breaks as well as spaces.
    reference_sequence = "".join(reference_sequence.split()).upper()

    results = {}

    for query in query_sequences:
        clean_query = "".join(query.split()).upper()

        # str.find("") returns 0, which would report an empty query as a
        # zero-length match at the start of the reference; treat it as a miss.
        start_position = reference_sequence.find(clean_query) if clean_query else -1

        if start_position == -1:
            results[query] = (None, None)
        else:
            end_position = start_position + len(clean_query)
            results[query] = (start_position, end_position)

    return results

# Example usage: look up a set of 15-residue query peptides in a reference protein
reference_seq = "MSEWSRIAVEFGEQQLNLTELEDFARELAYEGLDPALIIKKLKETGGDDWVKDTKFIIVFALTRGNKIVKASGKMSNSGSKRLMALQEKYGLVERAETRLSITPVRVAQSLPTWTCAAAAALKEYLPVGPAVMNLKVENYPPEMMCMAFGSLIPTAGVSEATTKTLMEAYSLWQDAFTKTINVKMRGASKTEVYNSFRDPLHAAVNSVFFPNDVRVKWLKAKGILGPDGVPSRAAEVAAAAYRNL"

# Peptides to locate in the reference
query_sequences = [
    "FARELAYEGLDPALI",
    "ARELAYEGLDPALII",
    "RELAYEGLDPALIIK",
    "ELAYEGLDPALIIKK",
    "LAYEGLDPALIIKKL",
    "AYEGLDPALIIKKLK",
    "YEGLDPALIIKKLKE",
    "EGLDPALIIKKLKET",
    "GLDPALIIKKLKETG",
    "LDPALIIKKLKETGG",
    "DPALIIKKLKETGGD",
    "YNSFRDPLHAAVNSV",
]

# Resolve every query in a single call
results = find_peptide_position(reference_seq, query_sequences)

# Report each query; hits are shown in 1-indexed, end-inclusive coordinates
separator = "-" * 60
print("Results:")
print(separator)
for query, (start, end) in results.items():
    print(f"Query: {query}")
    if start is None:
        print("  Not found in reference sequence")
    else:
        # The 0-indexed exclusive end equals the 1-indexed inclusive end,
        # so only the start needs shifting for display.
        print(f"  Found at positions {start + 1} to {end}")
        print(f"  Matched subsequence: {reference_seq[start:end]}")
    print(separator)

Results:
------------------------------------------------------------
Query: FARELAYEGLDPALI
  Found at positions 24 to 38
  Matched subsequence: FARELAYEGLDPALI
------------------------------------------------------------
Query: ARELAYEGLDPALII
  Found at positions 25 to 39
  Matched subsequence: ARELAYEGLDPALII
------------------------------------------------------------
Query: RELAYEGLDPALIIK
  Found at positions 26 to 40
  Matched subsequence: RELAYEGLDPALIIK
------------------------------------------------------------
Query: ELAYEGLDPALIIKK
  Found at positions 27 to 41
  Matched subsequence: ELAYEGLDPALIIKK
------------------------------------------------------------
Query: LAYEGLDPALIIKKL
  Found at positions 28 to 42
  Matched subsequence: LAYEGLDPALIIKKL
------------------------------------------------------------
Query: AYEGLDPALIIKKLK
  Found at positions 29 to 43
  Matched subsequence: AYEGLDPALIIKKLK
------------------------------------------------------------
Query: YEGLD

In [5]:
def extract_peptide_by_position(reference_sequence, start_position, end_position, is_one_indexed=True):
    """
    Extract a peptide subsequence from a reference sequence based on start and end positions.

    The reference is normalised first: all whitespace (spaces, tabs, newlines)
    is removed and letters are upper-cased, so positions count residues only.

    Args:
        reference_sequence (str): The full reference peptide sequence
        start_position (int): The starting position of the subsequence
        end_position (int): The ending position of the subsequence
        is_one_indexed (bool): Whether the provided positions are 1-indexed (True) or 0-indexed (False)
                               Defaults to True (biological convention)

    Returns:
        str or None: The extracted peptide subsequence, or None when the
        requested range is empty, reversed, or falls outside the reference.

    Note:
        If is_one_indexed is True, the start is converted to 0-indexed and the
        end position is inclusive (the residue at end_position is included).
        If is_one_indexed is False, the positions are used as-is and the end
        position is exclusive (consistent with Python slicing).
    """
    # str.split() with no separator splits on every run of whitespace, so
    # line breaks or tabs in a pasted sequence cannot shift the coordinates.
    reference_sequence = "".join(reference_sequence.split()).upper()

    # A 1-indexed inclusive end is numerically the same as a 0-indexed
    # exclusive end, so only the start needs adjusting.
    start_idx = start_position - 1 if is_one_indexed else start_position
    end_idx = end_position

    # Reject out-of-range or empty/reversed spans instead of letting slicing
    # silently clamp them or wrap around on a negative index.
    if start_idx < 0 or end_idx > len(reference_sequence) or start_idx >= end_idx:
        return None

    return reference_sequence[start_idx:end_idx]


def main():
    """
    Interactively extract a peptide from the built-in example sequence.

    Prompts on stdin for a 1-indexed start and an inclusive end position, then
    prints the extracted peptide, or an error message when the positions are
    invalid or the input is not numeric.
    """
    # Example reference sequence
    reference_seq = "MSEWSRIAVEFGEQQLNLTELEDFARELAYEGLDPALIIKKLKETGGDDWVKDTKFIIVFALTRGNKIVKASGKMSNSGSKRLMALQEKYGLVERAETRLSITPVRVAQSLPTWTCAAAAALKEYLPVGPAVMNLKVENYPPEMMCMAFGSLIPTAGVSEATTKTLMEAYSLWQDAFTKTINVKMRGASKTEVYNSFRDPLHAAVNSVFFPNDVRVKWLKAKGILGPDGVPSRAAEVAAAAYRNL"

    rule = "-" * 40

    # Banner
    print("\nPeptide Sequence Extractor")
    print(rule)
    print("Reference sequence length:", len(reference_seq))

    try:
        # int() raises ValueError on non-numeric input, handled below
        start_pos = int(input("Enter start position (1-indexed): "))
        end_pos = int(input("Enter end position (1-indexed, inclusive): "))

        peptide = extract_peptide_by_position(reference_seq, start_pos, end_pos)

        # A falsy result means the requested range was rejected
        if not peptide:
            print("\nInvalid positions. Please check your input and try again.")
            return

        residue_count = end_pos - start_pos + 1
        print("\nResults:")
        print(rule)
        print(f"Extracted peptide ({residue_count} residues): {peptide}")

    except ValueError:
        print("Invalid input. Please enter numeric values for positions.")

# Entry-point guard: call main() only when this code is executed directly;
# importing it as a module defines the functions without prompting for input.
if __name__ == "__main__":
    main()





Peptide Sequence Extractor
----------------------------------------
Reference sequence length: 245


Enter start position (1-indexed):  11
Enter end position (1-indexed, inclusive):  25



Results:
----------------------------------------
Extracted peptide (15 residues): FGEQQLNLTELEDFA
