In [None]:
import pandas as pd
import numpy as np
import re
import os

def add_plus_after_lowercase(sequence):
    """
    Mark every lowercase character with a trailing '+' and upper-case the result.

    Missing values (as judged by ``pd.isna``) yield ``None``. Any other input
    is converted with ``str()`` before it is scanned character by character.
    """
    if pd.isna(sequence):
        return None

    marked = ''.join(
        symbol + '+' if symbol.islower() else symbol
        for symbol in str(sequence)
    )
    return marked.upper()

def add_questionmarks(seq):
    """Insert a '?' after every uppercase S, T or Y; missing input gives None."""
    return None if pd.isna(seq) else re.sub(r'([STY])', r'\1?', seq)

def add_minus_if_no_plus(seq):
    """Append '-' to each uppercase S, T or Y that is not immediately followed by '+'.

    Missing input (per ``pd.isna``) gives None.
    """
    if pd.isna(seq):
        return None

    # Negative lookahead: match the residue only when no '+' comes right after it.
    unmarked_residue = re.compile(r'([STY])(?!\+)')
    return unmarked_residue.sub(r'\1-', seq)

def create_sliding_windows(sequence, seqlenmax=128, slide=32, overlap=96):
    """
    Split a sequence into windows of at most ``seqlenmax`` characters whose
    start position advances by ``slide`` characters per window.

    Windowing stops as soon as a window reaches the end of the sequence, so
    the last window may be shorter than ``seqlenmax``. A sequence that already
    fits in one window is returned unchanged as a single window.

    Parameters:
    sequence: Input sequence; converted with ``str()``. Missing values
        (per ``pd.isna``) are not windowed.
    seqlenmax (int): Maximum window length
    slide (int): Number of characters the window start moves per step
    overlap (int): Not used by the implementation; kept only so existing
        callers that pass it keep working. Consecutive full-length windows
        overlap by ``seqlenmax - slide`` characters when ``slide < seqlenmax``.

    Returns:
    list: List of pairs (window_index, subsequence); ``[(0, None)]`` for a
        missing sequence.

    Raises:
    ValueError: If the sequence is longer than ``seqlenmax`` and either
        ``seqlenmax`` or ``slide`` is smaller than 1.
    """
    if pd.isna(sequence):
        return [(0, None)]

    sequence = str(sequence)  # convert to string for safety
    total = len(sequence)

    if total <= seqlenmax:
        return [(0, sequence)]

    # Validate only on the windowing path so short sequences behave exactly
    # as before. A non-positive slide would otherwise never advance (endless
    # loop) and a non-positive window size would only produce empty windows.
    if seqlenmax < 1:
        raise ValueError(f"seqlenmax must be >= 1, got {seqlenmax}")
    if slide < 1:
        raise ValueError(f"slide must be >= 1, got {slide}")

    windows = []
    for window_idx, start in enumerate(range(0, total, slide)):
        end = min(start + seqlenmax, total)
        windows.append((window_idx, sequence[start:end]))

        # The window touching the end of the sequence is the last one.
        if end == total:
            break

    return windows

In [None]:
def get_seq_size(length):
    """Map a sequence length to the symbol of the first size bucket that holds it.

    Buckets are upper bounds that double from 32 up to 65536. A length above
    the largest bound (or one that compares false against every bound) gets
    '.', the same symbol as the largest bucket.
    """
    upper_bounds = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
    bucket_symbols = ['!', '@', '#', '$', '%', '^', '&', '*', '<', '>', ',', '.']

    matching = (
        mark
        for limit, mark in zip(upper_bounds, bucket_symbols)
        if length <= limit
    )
    return next(matching, '.')

def number_to_base26_3slots(n):
    """Convert a number to a 3-character lowercase letter code.

    The value is converted with ``float()`` and truncated to an integer, then
    written in spreadsheet-column style numbering (1 -> 'a', 26 -> 'z',
    27 -> 'aa', ...). The result is left-padded with 'a' to three letters and
    only the last three letters are kept.

    Returns 'aaa' when ``n`` cannot be converted to a number, is NaN or
    infinite, or is smaller than 1.

    NOTE: the padding letter 'a' is also the letter for 1, so codes are not
    unique -- e.g. 0, 1 and 27 all produce 'aaa'. This is left as-is because
    changing it would change the codes written to the output.
    """
    try:
        num = float(n)
        if np.isnan(num) or num < 1:
            return 'aaa'
        num = int(num)  # raises OverflowError for infinity
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to the default code; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        return 'aaa'

    letters = 'abcdefghijklmnopqrstuvwxyz'
    result = ''
    while num > 0:
        num -= 1  # shift to 0-based so that 26 maps to 'z' rather than 'ba'
        result = letters[num % 26] + result
        num //= 26

    # Pad to 3 characters using 'a', then keep the 3 least significant letters
    result = 'a' * (3 - len(result)) + result
    return result[-3:]

def process_metadata(df):
    """
    Attach size and window-location metadata columns to ``df`` and return it.

    The frame is modified in place. Three columns are written:
    'seq_size' (symbol derived from 'seq_len'), 'subseq_location' (letter code
    derived from the digits after the last '_' at the end of 'ID') and
    'seq_metadata', which joins the two as "[X][YYY]".
    """
    # Size symbol for each row's sequence length
    size_symbols = df['seq_len'].apply(get_seq_size)
    df['seq_size'] = size_symbols

    # Trailing "_<digits>" of the ID, turned into a letter code
    trailing_numbers = df['ID'].str.extract(r'_(\d+)$', expand=False)
    location_codes = trailing_numbers.apply(number_to_base26_3slots)
    df['subseq_location'] = location_codes

    # Assemble the "[X][YYY]" prefix
    size_part = '[' + df['seq_size'] + ']['
    df['seq_metadata'] = size_part + df['subseq_location'] + ']'

    return df

In [None]:
def process_sequences(input_file, output_file):
    """
    Load the input CSV, derive the marker columns, filter, and save.

    Reads ``input_file`` and derives 'Seq_plus' from 'Modified_Sequence',
    'seqwithquestionmark' from 'Sequence' and 'seqwithplusminus' from
    'Seq_plus'. Only rows whose question-mark and plus/minus strings have the
    same length are kept. A 'seq_len' column (length of 'Sequence') is added,
    the result is written to ``output_file`` without the index, and the
    resulting DataFrame is returned.
    """
    print(f"Loading file: {input_file}")
    table = pd.read_csv(input_file)

    # '+' after each lowercase letter of the modified sequence
    print("Adding '+' markers after lowercase letters...")
    table['Seq_plus'] = table['Modified_Sequence'].apply(add_plus_after_lowercase)

    # '?' after each S/T/Y of the original sequence
    print("Adding question marks after S,T,Y in original sequences...")
    table['seqwithquestionmark'] = table['Sequence'].apply(add_questionmarks)

    # '-' after each S/T/Y that did not receive a '+'
    print("Adding minus signs after S,T,Y without phosphorylation...")
    table['seqwithplusminus'] = table['Seq_plus'].apply(add_minus_if_no_plus)

    # Rows where the two annotated strings differ in length are discarded
    rows_before = len(table)
    question_lengths = table['seqwithquestionmark'].str.len()
    plusminus_lengths = table['seqwithplusminus'].str.len()
    consistent = table[question_lengths == plusminus_lengths]
    rows_after = len(consistent)

    print(f"Original rows: {rows_before}")
    print(f"Rows after cleaning: {rows_after}")
    print(f"Removed rows: {rows_before - rows_after}")

    # Fresh 0..n-1 index, then record the original sequence length
    result = consistent.reset_index(drop=True)
    result['seq_len'] = result['Sequence'].str.len()

    result.to_csv(output_file, index=False)
    print(f"Saved processed data to: {output_file}")

    return result

def create_sequence_windows(df, output_file, window_size=128, slide=32):
    """
    Create sliding windows from processed sequences and save them as CSV.

    Each row's 'seqwithplusminus' value is split with
    ``create_sliding_windows``; every non-missing window becomes one output
    row. The result is written to ``output_file`` without the index.

    Parameters:
    df (DataFrame): Must contain 'seqwithplusminus' and 'seq_len'; 'ID' is
        optional (window IDs fall back to "unknown_<index>" without it).
    output_file (str): Path of the CSV file to write
    window_size (int): Maximum window length
    slide (int): Number of characters the window advances per step

    Returns:
    DataFrame: Columns 'ID' ("<original ID>_<window index>"), 'Original_ID',
        'Window_Index', 'Modified_Sequence' (the window text),
        'Sequence_Length' (window length) and 'seq_len' (copied from the
        source row).
    """
    # Fixed column layout, so the frame has these columns even when no window
    # is produced; otherwise the statistics below fail with a KeyError.
    columns = [
        'ID',
        'Original_ID',
        'Window_Index',
        'Modified_Sequence',
        'Sequence_Length',
        'seq_len',
    ]

    # List to store new records
    new_records = []

    # Process each sequence
    print("Creating sliding windows...")
    for _, row in df.iterrows():
        uniprot_id = row['ID'] if 'ID' in row else None
        id_prefix = uniprot_id if uniprot_id else "unknown"

        # Create windows for sequence
        windows = create_sliding_windows(
            row['seqwithplusminus'], seqlenmax=window_size, slide=slide
        )

        # Add each window as a new record
        for window_idx, window_sequence in windows:
            if window_sequence is None:  # missing sequence: nothing to add
                continue
            new_records.append({
                'ID': f"{id_prefix}_{window_idx}",
                'Original_ID': uniprot_id,
                'Window_Index': window_idx,
                'Modified_Sequence': window_sequence,
                'Sequence_Length': len(window_sequence),
                'seq_len': row['seq_len'],
            })

    # Create new DataFrame
    windowed_df = pd.DataFrame(new_records, columns=columns)

    # Print basic statistics
    print("\nStatistics:")
    print(f"Number of original sequences: {len(df)}")
    print(f"Number of sequences after splitting: {len(windowed_df)}")
    print("\nSequence length distribution after splitting:")
    print(windowed_df['Sequence_Length'].describe())

    # Save results
    windowed_df.to_csv(output_file, index=False)
    print(f"Saved windowed sequences to: {output_file}")

    return windowed_df

In [None]:
def finalize_sequences(df, output_file):
    """
    Finalize windowed sequences: filter, add metadata, deduplicate and save.

    Steps, in order:
    1. Rename 'Modified_Sequence' to 'Seq'.
    2. Build 'Seq_no_plus', a copy of 'Seq' with every '+' and '-' replaced
       by '?'.
    3. Strip leading '+', '-' and '?' characters from both columns.
    4. Drop rows where either column is missing.
    5. Keep only rows whose 'Seq' contains '+' and is at least 112
       characters long.
    6. Prepend the 'seq_metadata' prefix from ``process_metadata`` to both
       columns.
    7. Drop rows with a duplicate 'Seq' (first occurrence kept) and write the
       result to ``output_file`` without the index.

    Parameters:
    df (DataFrame): Needs 'Modified_Sequence', plus the 'seq_len' and 'ID'
        columns read by ``process_metadata``. The caller's frame is not
        modified.
    output_file (str): Path of the CSV file to write

    Returns:
    DataFrame: The deduplicated frame that was written.
    """
    # Rename columns for clarity (returns a new frame)
    df = df.rename(columns={
        'Modified_Sequence': 'Seq',
    })

    # Create a version with the plus/minus markers masked as '?'.
    # regex=False: '+' is a regex metacharacter and the default of `regex`
    # differs between pandas versions, so request literal replacement.
    df['Seq_no_plus'] = (
        df['Seq']
        .str.replace('+', '?', regex=False)
        .str.replace('-', '?', regex=False)
    )

    # Remove leading special characters
    df['Seq'] = df['Seq'].str.lstrip('+-?')
    df['Seq_no_plus'] = df['Seq_no_plus'].str.lstrip('+-?')

    # Drop rows with NaN values
    df = df.dropna(subset=['Seq', 'Seq_no_plus'])

    # Count sequences containing '+'; the mask is reused for the filter below
    has_plus = df['Seq'].str.contains('+', regex=False)
    count_plus = has_plus.sum()
    print(f"Number of sequences containing '+': {count_plus}")

    # Keep only sequences with '+' and min length 112. Take an explicit copy
    # so the column assignments below do not write into a filtered slice
    # (avoids SettingWithCopyWarning).
    df = df[has_plus & (df['Seq'].str.len() >= 112)].copy()

    # Add metadata to sequences
    df = process_metadata(df)

    # Prepend metadata to sequences
    df['Seq'] = df['seq_metadata'] + df['Seq']
    df['Seq_no_plus'] = df['seq_metadata'] + df['Seq_no_plus']

    # Remove duplicates
    df_no_duplicates = df.drop_duplicates(subset=['Seq'], keep='first')
    print(f"Final number of sequences after removing duplicates: {len(df_no_duplicates)}")

    # Save result
    df_no_duplicates.to_csv(output_file, index=False)
    print(f"Saved finalized data to: {output_file}")

    return df_no_duplicates

def main():
    """Run the pipeline end to end: mark sequences, window them, finalize.

    Uses fixed file names in the current working directory. If the input CSV
    does not exist, an error is printed and nothing else happens.
    """
    # Input and the output file of each stage
    input_file = "phosphorylated_proteins_with_modified_sequences.csv"
    marked_csv = "dataset_plus_to_win.csv"
    windows_csv = "sequences_windowed64_dataset_slide32-128.csv"
    final_csv = "dataset_sequences_renamed64plus_filtered_final.csv"

    if os.path.exists(input_file):
        # Stage 1: add the '+', '?' and '-' markers
        marked = process_sequences(input_file, marked_csv)

        # Stage 2: split into sliding windows
        windowed = create_sequence_windows(marked, windows_csv)

        # Stage 3: filter, attach metadata, deduplicate
        finalize_sequences(windowed, final_csv)

        print("Processing complete!")
    else:
        print(f"ERROR: Input file {input_file} not found.")

# Run the full pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()