In [1]:
import pysam

# Extract rolling windows from input sequence (using pysam)
def extract_rolling_windows(input_fasta, window_size=20):
    """Collect every overlapping window of ``window_size`` bases from a FASTA file.

    The file is opened with ``pysam.FastaFile``. Each reference sequence is
    fetched in full and scanned one position at a time. Windows from all
    sequences are returned in a single flat list, in the order of
    ``samfile.references``; the list does not record which sequence a window
    came from.

    Args:
        input_fasta: Path passed to ``pysam.FastaFile``.
        window_size: Length of each window; must be at least 1.

    Returns:
        list: The extracted windows. A sequence shorter than ``window_size``
        contributes no windows.

    Raises:
        ValueError: If ``window_size`` is less than 1.
    """
    # Reject non-positive sizes before touching the file: they would otherwise
    # produce empty or oddly truncated slices rather than real windows.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    windows = []
    samfile = pysam.FastaFile(input_fasta)
    try:
        for seq_name in samfile.references:
            sequence = samfile.fetch(seq_name)
            # range() is empty when the sequence is shorter than the window.
            windows.extend(
                sequence[start:start + window_size]
                for start in range(len(sequence) - window_size + 1)
            )
    finally:
        # Release the handle even if fetching a sequence fails.
        samfile.close()
    return windows

# Save rolling windows to a FASTA file
def save_to_fasta(windows, output_fasta):
    """Write ``windows`` to ``output_fasta`` as FASTA records.

    Each window becomes a two-line record: a header ``>seq_N`` (N counting
    from 1 in iteration order) followed by the window itself. The target file
    is opened in write mode, so any existing content is replaced.

    Args:
        windows: Iterable of sequences to write.
        output_fasta: Path of the file to create or overwrite.
    """
    with open(output_fasta, 'w') as handle:
        # One formatted record per window, streamed straight to the file.
        handle.writelines(
            f">seq_{i+1}\n{seq}\n" for i, seq in enumerate(windows)
        )

In [3]:
# Rolling-window extraction for the dux4 intron 2 and rpl23 inputs, at 50 bp
# and 100 bp. Each job is (input file name, output file name, window size).
if __name__ == "__main__":
    data_dir = "/lustre/scratch126/casm/team274sb/lr26/dux_search"

    # Output names are kept exactly as they were first written (note that the
    # 100 bp dux4 file has no "intron2" in its name) so existing files and any
    # downstream references to them stay valid.
    jobs = [
        ("dux4_intron2.fasta", "dux4_intron2_window50.fasta", 50),
        ("rpl23.fasta", "rpl23_window50.fasta", 50),
        ("dux4_intron2.fasta", "dux4_window100.fasta", 100),
        ("rpl23.fasta", "rpl23_window100.fasta", 100),
    ]

    for input_name, output_name, window_size in jobs:
        input_fasta = f"{data_dir}/{input_name}"
        output_fasta = f"{data_dir}/{output_name}"

        # Extract rolling windows and save them
        windows = extract_rolling_windows(input_fasta, window_size=window_size)
        save_to_fasta(windows, output_fasta)

        print(f"✅ Saved {len(windows)} windows to {output_fasta}")
    

✅ Saved 6222 windows to /lustre/scratch126/casm/team274sb/lr26/dux_search/dux4_intron2_window50.fasta
✅ Saved 405 windows to /lustre/scratch126/casm/team274sb/lr26/dux_search/rpl23_window50.fasta
✅ Saved 6172 windows to /lustre/scratch126/casm/team274sb/lr26/dux_search/dux4_window100.fasta
✅ Saved 355 windows to /lustre/scratch126/casm/team274sb/lr26/dux_search/rpl23_window100.fasta


In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def extract_sliding_windows(input_fasta, output_fasta, window_size=50):
    """
    Extract sliding windows from unindexed FASTA file using Biopython.

    Every record in ``input_fasta`` is scanned one position at a time. Each
    window is written to ``output_fasta`` as its own FASTA record whose id is
    ``<record id>_<start>_<end>``, with 1-based inclusive coordinates and an
    empty description. A summary line is printed once writing finishes.

    Args:
        input_fasta: Path (or handle) passed to ``SeqIO.parse``.
        output_fasta: Path (or handle) passed to ``SeqIO.write``.
        window_size: Length of each window; must be at least 1.

    Raises:
        ValueError: If ``window_size`` is less than 1.
    """
    # Validate before any file is opened so a bad size cannot leave a
    # half-written or meaningless output file behind.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    def _window_records():
        """Yield one SeqRecord per window, lazily, across all input records."""
        for record in SeqIO.parse(input_fasta, "fasta"):
            # range() is empty when the record is shorter than the window.
            for start in range(len(record.seq) - window_size + 1):
                end = start + window_size
                yield SeqRecord(
                    # Slice the Seq itself rather than a str copy: the FASTA
                    # writer expects a Seq object, not a plain string.
                    seq=record.seq[start:end],
                    id=f"{record.id}_{start + 1}_{end}",
                    description="",
                )

    # SeqIO.write consumes the generator and reports how many records it
    # wrote, so the full set of windows never has to sit in memory.
    written = SeqIO.write(_window_records(), output_fasta, "fasta")
    print(f"✅ Wrote {written} windows to {output_fasta}")

# === Example usage ===
if __name__ == "__main__":
    # Build the output name from the window size so the file name can never
    # disagree with the size actually used.
    example_window_size = 50
    extract_sliding_windows(
        "input.fasta",
        f"windows_{example_window_size}bp.fasta",
        window_size=example_window_size,
    )

ModuleNotFoundError: No module named 'Bio'