In [None]:
"""
HMMSCAN PIPELINE

This script runs hmmscan on all .faa files in your dataset.

HOW TO STRUCTURE YOUR DATA:
- Create a main folder (e.g., 'Your_datasets').
- Put all .faa files directly inside this folder **OR**
- Organize them into subfolders (optional) if you have a large number of files. 
  Each subfolder will be treated as a "group" and processed separately.
- Place 'Pfam-A.hmm' in the main folder and index it once with
  `hmmpress Pfam-A.hmm` (hmmscan requires the pressed database files).

OUTPUT:
- hmmscan results will be saved in 'hmmscan_output' inside the main folder.
- Progress will be logged in 'hmmscan_log.txt' inside 'hmmscan_output'.
"""

import os
import subprocess
from datetime import datetime

# --- GENERAL SETTINGS ---
# Main datasets folder (replace with your own folder if needed)
main_dir = "./Your_datasets"

# Pfam database file
pfam_hmm = os.path.join(main_dir, "Pfam-A.hmm")

# Output directory for hmmscan results
output_dir = os.path.join(main_dir, "hmmscan_output")

# Log file to track progress
log_file = os.path.join(output_dir, "hmmscan_log.txt")

# Create output folder if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


# --- HELPERS ---
def list_fasta_files(folder):
    """Return the sorted names of the regular '.faa' files directly in *folder*.

    Directories whose name happens to end in '.faa' are excluded, and the
    result is sorted so that runs (and the log) are reproducible.
    """
    return sorted(
        name for name in os.listdir(folder)
        if name.endswith('.faa') and os.path.isfile(os.path.join(folder, name))
    )


def collect_groups(root_dir, skip_dir):
    """Return the ``(group_name, group_path)`` pairs to scan, in order.

    *root_dir* itself comes first when it directly contains '.faa' files
    (the "no subfolders" layout described in the module docstring); its
    group name is the folder's own name. Every immediate subfolder follows
    in name order, except *skip_dir* (the results folder), which lives
    inside *root_dir* but is not a dataset group.
    """
    groups = []
    if list_fasta_files(root_dir):
        groups.append((os.path.basename(os.path.abspath(root_dir)), root_dir))

    skip_path = os.path.abspath(skip_dir)
    for entry in sorted(os.listdir(root_dir)):
        entry_path = os.path.join(root_dir, entry)
        if os.path.isdir(entry_path) and os.path.abspath(entry_path) != skip_path:
            groups.append((entry, entry_path))
    return groups


def log_line(log, message):
    """Write a timestamped *message* to the open *log* and flush it.

    Flushing after every entry keeps the log usable as a progress indicator
    while a long hmmscan run is still in flight, and avoids losing buffered
    entries if the process is interrupted.
    """
    log.write("{} - {}\n".format(datetime.now(), message))
    log.flush()


def run_group(group_name, group_path, log, hmm_db, results_dir):
    """Run hmmscan on every '.faa' file in *group_path*.

    Each input ``<name>.faa`` produces ``<name>_domains.tbl`` in
    *results_dir*. A failure on one file is logged and printed, then the
    remaining files are still processed.

    NOTE: output names depend only on the input file name, so identically
    named '.faa' files in different groups write to the same result file.

    Returns the number of files hmmscan processed successfully.
    """
    processed_count = 0
    log_line(log, "Starting hmmscan for group: {}".format(group_name))

    for fasta_file in list_fasta_files(group_path):
        input_file = os.path.join(group_path, fasta_file)
        output_file = os.path.join(
            results_dir,
            "{}_domains.tbl".format(os.path.splitext(fasta_file)[0]),
        )

        print("Running hmmscan for {}...".format(fasta_file))
        log_line(log, "Running hmmscan for {}...".format(fasta_file))

        command = [
            "hmmscan",
            "--tblout", output_file,
            "-E", "0.001",
            "--domE", "0.001",
            hmm_db,
            input_file,
        ]
        try:
            subprocess.check_call(command)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where the hmmscan executable cannot be
            # started at all (e.g. HMMER is not installed or not on PATH);
            # CalledProcessError covers a non-zero exit status.
            log_line(log, "Error running hmmscan for {}: {}".format(fasta_file, e))
            print("Error running hmmscan for {}: {}".format(fasta_file, e))
            continue

        log_line(log, "Completed hmmscan for {}. Output saved to {}".format(fasta_file, output_file))
        print("hmmscan completed for {}.".format(fasta_file))
        processed_count += 1

    log_line(log, "hmmscan completed for {} files in group {}.".format(processed_count, group_name))
    print("hmmscan completed for {} files in group {}.".format(processed_count, group_name))
    return processed_count


# --- PROCESSING EACH GROUP ---
# Files placed directly in main_dir form one group; each subfolder (other
# than the output folder) is processed as its own group.
# This runs at top level on purpose: the file is used as a script/notebook cell.
for group_folder, group_path in collect_groups(main_dir, output_dir):
    with open(log_file, 'a') as log:
        run_group(group_folder, group_path, log, pfam_hmm, output_dir)
