Build HMM profiles for plastic-degrading enzymes.

Usage:
```bash
python build_hmms.py --fasta enzymes.faa --outdir hmms \
                        --cdhit_id 0.95 --min_seqs 4
```

In [None]:
# Libraries
import argparse, subprocess, tempfile, shutil, os, json
from pathlib import Path
from typing import List, Dict
from Bio import SeqIO, SearchIO
import requests, time
from dotenv import load_dotenv


In [28]:
# Configuration
## Working Directory
# The project root is taken from the `working_dir` variable, which load_dotenv()
# makes available from a local .env file (or the process environment).
load_dotenv()
working_directory = os.getenv("working_dir")
if not working_directory:
    # os.chdir(None) would raise an opaque TypeError; fail early with an actionable message.
    raise RuntimeError(
        "Environment variable 'working_dir' is not set. "
        "Define it in the environment or in a .env file before running this notebook."
    )
os.chdir(working_directory)
print(os.getcwd())
## Paths
### Databases
# Always bind the name so that later cells can test `Local_Uniprot_Database is None`
# instead of hitting a NameError when the local database is absent.
Local_Uniprot_Database = None
if os.path.exists("input/uniprot_sprot.fasta"):
    print("Using local uniprot_sprot.fasta")
    Local_Uniprot_Database = "input/uniprot_sprot.fasta"
else:
    # NOTE(review): the message names uniprot_sprot.fasta but the URL downloads
    # UniRef90 (uniref90.fasta.gz) — confirm which database is intended and align
    # the expected file name with the download instructions.
    print("Missing local uniprot_sprot.fasta. \n To download it (47GB), run:\n wget https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz")

# All paths below are relative to the working directory set above.
Plastic_Seq_Database = "input/PlasticDB.fasta"

TEST_Seq_Database = "input/TEST_FASTA.fasta"

### Outputs
First_Cluster_output_path = "output/First_Cluster/"
Second_Cluster_output_path = "output/Second_Cluster/"

## Parameters
CDHIT_Identity = 0.95   # passed to cd-hit as -c (sequence identity threshold)
CDHIT_prefix = "CDHIT"  # base name of the clustered FASTA written by cd-hit
WORD_SIZE = 5           # passed to cd-hit as -n; the CD-HIT guide recommends 5 for thresholds 0.7-1.0

/home/marcleo/work/git_reps/PlasticDB-HMM
Missing local uniprot_sprot.fasta. 
 To download it (47GB), run:
 wget https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz


In [29]:
# Step 1: Cluster sequences using CD-HIT

def run_cdhit(input_fasta, output_path, output_prefix, identity, word_size):
    """Cluster the sequences of a FASTA file with the external `cd-hit` program.

    Args:
        input_fasta: Path of the FASTA file to cluster (cd-hit `-i`).
        output_path: Directory that receives the result. It is created if it
            does not exist; a trailing path separator is optional.
        output_prefix: Base name of the output file; ".fasta" is appended.
        identity: Sequence identity threshold (cd-hit `-c`).
        word_size: Word length used by cd-hit (`-n`).

    Returns:
        The path of the representative-sequence FASTA passed to cd-hit as `-o`.

    Raises:
        subprocess.CalledProcessError: If cd-hit exits with a non-zero status.
        FileNotFoundError: If the `cd-hit` executable cannot be found.
    """
    # os.path.join keeps "dir/" + name unchanged and inserts the separator when
    # the caller passes "dir" without a trailing slash.
    output_file = os.path.join(output_path, f"{output_prefix}.fasta")

    # The output file cannot be written into a directory that does not exist,
    # so create it up front (an empty dirname means the current directory).
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    cdhit_cmd = [
        "cd-hit",
        "-i", input_fasta,
        "-o", output_file,
        "-c", str(identity),
        "-n", str(word_size),
        "-g", "1",  # assign each sequence to its most similar cluster (slower, more accurate)
    ]
    # An argument list (no shell) avoids quoting problems in the paths.
    subprocess.run(cdhit_cmd, check=True)
    print(f"CD-HIT clustering completed. Output: {output_prefix}.fasta")
    return output_file

# Cluster the test FASTA; the returned output path is the cell's displayed value.
run_cdhit(
    input_fasta=TEST_Seq_Database,
    output_path=First_Cluster_output_path,
    output_prefix=CDHIT_prefix,
    identity=CDHIT_Identity,
    word_size=WORD_SIZE,
)

Program: CD-HIT, V4.8.1 (+OpenMP), Apr 24 2025, 22:00:32
Command: cd-hit -i input/TEST_FASTA.fasta -o
         output/First_Cluster/CDHIT.fasta -c 0.95 -n 5 -g 1

Started: Wed May 21 19:51:37 2025
                            Output                              
----------------------------------------------------------------
total seq: 45
longest and shortest : 639 and 201
Total letters: 20446
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 1 X 16M = 16M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 81M

Table limit with the given memory limit:
Max number of representatives: 1208273
Max number of word counting entries: 89810798

comparing sequences from          0  to         45

       45  finished         37  clusters

Approximated maximum memory consumption: 81M
writing new database
writing clustering information
program completed !

Total CPU time 0.05
CD-HIT clustering completed. Output: CDHIT.fas

'output/First_Cluster/CDHIT.fasta'