In [24]:
import os
import subprocess
from pathlib import Path
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import sys

input_trf_script = '/home/jpereira/OEs/OE1.v2/Scripts/trf'
input_filtered_fasta = '/home/jpereira/OEs/Results/OE1/NamSeqs/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta'
params_trf = "2 7 7 80 10 50 2000"
output_joined_sequences_fasta = '/home/jpereira/OEs/Results/OE1/NamSeqs/Data/blocks_str/joined_sequnces.fasta' 
output_str_blocks_tsv = '/home/jpereira/OEs/Results/OE1/NamSeqs/Data/blocks_str/blocks_str.tsv'

input_trf_script = '/home/jpereira/OEs/OE1.v2/Scripts/trf'
input_filtered_fasta = '/home/jpereira/OEs/Results/OE1/ToxoPasteur1/Data/mcl_clustering/sequneces_clusters/cluster_1.fasta'
params_trf = "2 7 7 80 10 50 2000"
output_joined_sequences_fasta = '/home/jpereira/OEs/Results/OE1/ToxoPasteur1/Data/blocks_str/joined_sequnces.fasta' 
output_str_blocks_tsv = '/home/jpereira/OEs/Results/OE1/ToxoPasteur1/Data/blocks_str/blocks_str.tsv'


os.makedirs(os.path.dirname(output_str_blocks_tsv), exist_ok=True)


In [25]:
def join_fasta_sequences(input_fasta: str, output_fasta: str, joined_id: str = "joined_seq") -> None:
    """
    Concatenate all sequences from a FASTA file into a single sequence.

    Parameters
    ----------
    input_fasta : str
        Path to the input FASTA file.
    output_fasta : str
        Path to the output FASTA file with the single joined sequence.
    joined_id : str
        Identifier for the output sequence.
    """
    sequences = list(SeqIO.parse(input_fasta, "fasta"))
    joined_seq = "".join(str(record.seq) for record in sequences)

    joined_record = SeqRecord(
        Seq(joined_seq),
        id=joined_id,
        description=f"Concatenated {len(sequences)} sequences from {input_fasta}"
    )

    SeqIO.write(joined_record, output_fasta, "fasta")
    print(f"✅ Joined sequence written to: {output_fasta}")
    

join_fasta_sequences(input_fasta=input_filtered_fasta, output_fasta=output_joined_sequences_fasta)

✅ Joined sequence written to: /home/jpereira/OEs/Results/OE1/ToxoPasteur1/Data/blocks_str/joined_sequnces.fasta


In [26]:


def find_str(decont_low_gc_fasta: str,
             trf_script: str,
             trf_params: str,
             str_table: str) -> None:
    """
    Run Tandem Repeats Finder (TRF) on a FASTA and parse its .dat output
    into a TSV of short tandem repeats.

    Parameters
    ----------
    decont_low_gc_fasta
        Path to input FASTA.
    trf_script
        Path to the 'trf' executable.
    trf_params
        Space-separated TRF parameters, e.g. "2 7 7 80 10 50 2000".
    str_table
        Path to output TSV file.
    """
    # 1) Ensure output directory exists
    outdir = os.path.dirname(str_table)
    os.makedirs(outdir, exist_ok=True)

    # 2) Run TRF and capture stdout/stderr in log.trf (ignore errors)
    log_path = os.path.join(outdir, "log.trf")
    cmd = [trf_script, decont_low_gc_fasta] + trf_params.split() + ["-h", "-d"]
    with open(log_path, "w") as log_fh:
        subprocess.run(cmd, stdout=log_fh, stderr=log_fh, check=False)

    # 3) Locate the .dat file
    params_name = trf_params.replace(" ", ".")
    fasta_basename = os.path.basename(decont_low_gc_fasta)
    dat_file = os.path.join(outdir, f"{fasta_basename}.{params_name}.dat")

    # 4) Prepare TSV header
    header = [
        "seqid", "rstart", "rend", "period_size", "copy_number",
        "consensus_size", "percent_matches", "percent_indels",
        "score", "A", "C", "G", "T", "entropy(0-2)", "repeat_sequence"
    ]

    # 5) If .dat exists and is non-empty, parse it; otherwise, write only header
    if os.path.exists(dat_file) and os.path.getsize(dat_file) > 0:
        # Read and clean lines
        with open(dat_file) as fh:
            lines = fh.readlines()[5:]  # skip first 5 header lines
        lines = [l for l in lines if l.strip() and "Parameters" not in l]

        # Split into records by "Sequence: "
        text = "".join(lines)
        records = text.split("Sequence: ")

        rows = []
        for rec in records:
            rec = rec.strip()
            if not rec:
                continue
            rec_lines = rec.splitlines()
            seqid = rec_lines[0].strip()
            for data_line in rec_lines[1:]:
                data_line = data_line.strip()
                if not data_line:
                    continue
                parts = data_line.split()

                parts = parts[:14] 
                rows.append([seqid] + parts)

        # Write TSV
        with open(str_table, "w") as out_fh:
            out_fh.write("\t".join(header) + "\n")
            for row in rows:
                out_fh.write("\t".join(row) + "\n")
    else:
        # No data: write header only
        with open(str_table, "w") as out_fh:
            out_fh.write("\t".join(header) + "\n")

find_str(
    decont_low_gc_fasta=output_joined_sequences_fasta,
    trf_script=input_trf_script,
    trf_params=params_trf,
    str_table=output_str_blocks_tsv
    )

import pandas as pd

blocks_str = pd.read_csv(output_str_blocks_tsv, sep='\t')
blocks_str


Unnamed: 0,seqid,rstart,rend,period_size,copy_number,consensus_size,percent_matches,percent_indels,score,A,C,G,T,entropy(0-2),repeat_sequence


Unnamed: 0,seqid,rstart,rend,period_size,copy_number,consensus_size,percent_matches,percent_indels,score,A,C,G,T,entropy(0-2),repeat_sequence
09d6f4a1-4c78-4aa5-aa9d-bcbfc0bfeddf,12242,12355,59,1.9,59,98,0,219,34,21,16,27,1.95,GACTAAAGTCAGCATAATCAATAAAAAGGTTTGTTCAGCCACTGGT...,GACTAAAGTCAGCATAATCAATAAAAAGGTTTGTTCAGCCACTGGT...
188f54cf-e93c-47f4-beb9-79cf2224e25c,3996,7262,1183,2.8,1183,99,0,6479,34,18,17,29,1.94,GATGACATAAATACTAACAAACCACCGGTTTTGGATGGAATTACTT...,GATGACATAAATACTAACAAACCACCGGTTTTGGATGGAATTACTT...
6b30cf05-c49e-409d-8415-724b469c9fef,1245,4128,1093,2.6,1093,99,0,5707,28,18,19,33,1.96,CTTTTTCTGGGGAGTATATACTACGAGTTGGACTACTGGTTTAGAT...,CTTTTTCTGGGGAGTATATACTACGAGTTGGACTACTGGTTTAGAT...
80e16e15-e446-4e7a-8f3d-6bf9804672ae,11088,14294,1183,2.7,1183,99,0,6378,29,17,18,34,1.94,CTATCCTATTTAATTGGATTAATCTTCTTACAAGCGGCTTTTGGTT...,CTATCCTATTTAATTGGATTAATCTTCTTACAAGCGGCTTTTGGTT...
bc9c3b7a-803c-4d75-896c-79afa5c78867,1014,1064,25,2.0,25,100,0,102,23,13,27,35,1.93,CGGAGGATATGGTAACTTCTTTGTA,CGGAGGATATGGTAACTTCTTTGTACGGAGGATATGGTAACTTCTT...
cd0c4600-e5f6-4666-803b-dda5975e0db2,1666,2569,447,2.0,447,99,0,1790,32,18,18,29,1.95,GTAAACAAAGACCTTCAAGATCTAAACCAGTAGTCCAACTCGTAGT...,GTAAACAAAGACCTTCAAGATCTAAACCAGTAGTCCAACTCGTAGT...
e98b32e1-8494-454d-afc0-d178476e50c1,851,3129,1097,2.1,1097,99,0,4531,28,18,19,33,1.96,CTTTTTCTGGGGAGTATATACTACGAGTTGGACTACTGGTTTAGAT...,CTTTTTCTGGGGAGTATATACTACGAGTTGGACTACTGGTTTAGAT...
fc2f9aaf-fef5-4d97-aef4-dd91dc3e343f,3493,5614,1143,1.9,1138,99,0,3926,32,18,18,30,1.95,CACAGTTCAACCCTGTATTATTTAACTCAGTTAATTAGTATTAGGT...,CACAGTTCAACCCTGTATTATTTAACTCAGTTAATTAGTATTAGGT...
